## Pipeline settings

In [1]:
is_cartisian_coords = False
is_covariance_reduction = False

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import rasterio
from rasterio.plot import show
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from scipy.stats import zscore

# loading training data    
data = np.load('species_train.npz')
train_locs = data['train_locs']
train_ids = data['train_ids']
species = data['taxon_ids']
species_names = dict(zip(data['taxon_ids'], data['taxon_names']))

data_test = np.load('species_test.npz', allow_pickle=True)
test_locs = data_test['test_locs']
test_label_list = data_test['test_pos_inds']
# data_test['test_pos_inds'] is a list of lists, where each list corresponds to 
# the indices in test_locs where a given species is present, it can be assumed 
# that they are not present in the other locations 

# form labels array for test data
labels = np.zeros(len(test_locs))
for species_id, locations in enumerate(test_label_list):
    taxon_id = species[species_id]
    for loc in locations:
        labels[loc] = taxon_id

df = pd.DataFrame(train_locs, columns=['x', 'y'])
scalar = StandardScaler()
df_scaled = scalar.fit_transform(df)
kmeans = KMeans(n_clusters=48, random_state=42)
df['cluster'] = kmeans.fit_predict(df_scaled)

  super()._check_params_vs_input(X, default_n_init=10)


In [3]:
from sklearn.preprocessing import PolynomialFeatures

bio1_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_1_train.npy')
bio1_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_1_test.npy')
bio2_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_2_train.npy')
bio2_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_2_test.npy')
bio3_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_3_train.npy')
bio3_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_3_test.npy')
bio4_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_4_train.npy')
bio4_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_4_test.npy')
bio5_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_5_train.npy')
bio5_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_5_test.npy')
bio6_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_6_train.npy')
bio6_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_6_test.npy')
bio7_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_7_train.npy')
bio7_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_7_test.npy')
bio8_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_8_train.npy')
bio8_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_8_test.npy')
bio9_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_9_train.npy')
bio9_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_9_test.npy')
bio10_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_10_train.npy')
bio10_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_10_test.npy')
bio11_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_11_train.npy')
bio11_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_11_test.npy')
bio12_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_12_train.npy')
bio12_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_12_test.npy')
bio13_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_13_train.npy')
bio13_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_13_test.npy')
bio14_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_14_train.npy')
bio14_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_14_test.npy')
bio15_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_15_train.npy')
bio15_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_15_test.npy')
bio16_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_16_train.npy')
bio16_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_16_test.npy')
bio17_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_17_train.npy')
bio17_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_17_test.npy')
bio18_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_18_train.npy')
bio18_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_18_test.npy')
bio19_train = np.load('wc2.1_10m_bio/wc2.1_10m_bio_19_train.npy')
bio19_test = np.load('wc2.1_10m_bio/wc2.1_10m_bio_19_test.npy')

def gaussian_normalization(data):
    # Replace NaNs with 0
    data = np.nan_to_num(data, nan=-1)
    
    # Calculate mean and std
    mean = np.mean(data)
    std = np.std(data)
    
    # Replace extreme values (e.g., values beyond 3 standard deviations from the mean) with -1
    extreme_value_mask = np.abs(data - mean) > 3 * std
    data[extreme_value_mask] = -1
    
    # Recalculate mean and std after replacing extreme values
    mean = np.mean(data)
    std = np.std(data)
    
    return (data - mean) / std

# Print values before and after normalization for debugging
def debug_normalization(data):
    print(f"Original: {data}")
    normalized_data = gaussian_normalization(data)
    print(f"Normalized: {normalized_data}")
    return normalized_data

bio1_train = gaussian_normalization(bio1_train)
bio1_test = gaussian_normalization(bio1_test)
bio2_train = gaussian_normalization(bio2_train)
bio2_test = gaussian_normalization(bio2_test)
bio3_train = gaussian_normalization(bio3_train)
bio3_test = gaussian_normalization(bio3_test)
bio4_train = gaussian_normalization(bio4_train)
bio4_test = gaussian_normalization(bio4_test)
bio5_train = gaussian_normalization(bio5_train)
bio5_test = gaussian_normalization(bio5_test)
bio6_train = gaussian_normalization(bio6_train)
bio6_test = gaussian_normalization(bio6_test)
bio7_train = gaussian_normalization(bio7_train)
bio7_test = gaussian_normalization(bio7_test)
bio8_train = gaussian_normalization(bio8_train)
bio8_test = gaussian_normalization(bio8_test)
bio9_train = gaussian_normalization(bio9_train)
bio9_test = gaussian_normalization(bio9_test)
bio10_train = gaussian_normalization(bio10_train)
bio10_test = gaussian_normalization(bio10_test)
bio11_train = gaussian_normalization(bio11_train)
bio11_test = gaussian_normalization(bio11_test)
bio12_train = gaussian_normalization(bio12_train)
bio12_test = gaussian_normalization(bio12_test)
bio13_train = gaussian_normalization(bio13_train)
bio13_test = gaussian_normalization(bio13_test)
bio14_train = gaussian_normalization(bio14_train)
bio14_test = gaussian_normalization(bio14_test)
bio15_train = gaussian_normalization(bio15_train)
bio15_test = gaussian_normalization(bio15_test)
bio16_train = gaussian_normalization(bio16_train)
bio16_test = gaussian_normalization(bio16_test)
bio17_train = gaussian_normalization(bio17_train)
bio17_test = gaussian_normalization(bio17_test)
bio18_train = gaussian_normalization(bio18_train)
bio18_test = gaussian_normalization(bio18_test)
bio19_train = gaussian_normalization(bio19_train)
bio19_test = gaussian_normalization(bio19_test)

# calculate cartisian coordinates

def cartisian_coords(locs):
    coords = np.zeros((locs.shape[0], 2))
    for i in range(locs.shape[0]):
        coords[i, 0] = locs[i, 1] * np.cos(locs[i, 0])
        coords[i, 1] = locs[i, 1] * np.sin(locs[i, 0])
    return coords

if is_cartisian_coords:
    train_locs = cartisian_coords(train_locs)
    test_locs = cartisian_coords(test_locs)

# combine train_locs with bio data to form the feature metrix
bio1_train = bio1_train.reshape(-1, 1)
bio2_train = bio2_train.reshape(-1, 1)
bio3_train = bio3_train.reshape(-1, 1)
bio4_train = bio4_train.reshape(-1, 1)
bio5_train = bio5_train.reshape(-1, 1)
bio6_train = bio6_train.reshape(-1, 1)
bio7_train = bio7_train.reshape(-1, 1)
bio8_train = bio8_train.reshape(-1, 1)
bio9_train = bio9_train.reshape(-1, 1)
bio10_train = bio10_train.reshape(-1, 1)
bio11_train = bio11_train.reshape(-1, 1)
bio12_train = bio12_train.reshape(-1, 1)
bio13_train = bio13_train.reshape(-1, 1)
bio14_train = bio14_train.reshape(-1, 1)
bio15_train = bio15_train.reshape(-1, 1)
bio16_train = bio16_train.reshape(-1, 1)
bio17_train = bio17_train.reshape(-1, 1)
bio18_train = bio18_train.reshape(-1, 1)
bio19_train = bio19_train.reshape(-1, 1)
bio1_test = bio1_test.reshape(-1, 1)
bio2_test = bio2_test.reshape(-1, 1)
bio3_test = bio3_test.reshape(-1, 1)
bio4_test = bio4_test.reshape(-1, 1)
bio5_test = bio5_test.reshape(-1, 1)
bio6_test = bio6_test.reshape(-1, 1)
bio7_test = bio7_test.reshape(-1, 1)
bio8_test = bio8_test.reshape(-1, 1)
bio9_test = bio9_test.reshape(-1, 1)
bio10_test = bio10_test.reshape(-1, 1)
bio11_test = bio11_test.reshape(-1, 1)
bio12_test = bio12_test.reshape(-1, 1)
bio13_test = bio13_test.reshape(-1, 1)
bio14_test = bio14_test.reshape(-1, 1)
bio15_test = bio15_test.reshape(-1, 1)
bio16_test = bio16_test.reshape(-1, 1)
bio17_test = bio17_test.reshape(-1, 1)
bio18_test = bio18_test.reshape(-1, 1)
bio19_test = bio19_test.reshape(-1, 1)

bio_vars = np.concatenate((bio1_train, bio2_train, bio3_train, bio4_train, bio5_train, bio6_train, bio7_train, bio8_train, bio9_train, bio10_train, bio11_train, bio12_train, bio13_train, bio14_train, bio15_train, bio16_train, bio17_train, bio18_train, bio19_train), axis=1)
bio_vars_test = np.concatenate((bio1_test, bio2_test, bio3_test, bio4_test, bio5_test, bio6_test, bio7_test, bio8_test, bio9_test, bio10_test, bio11_test, bio12_test, bio13_test, bio14_test, bio15_test, bio16_test, bio17_test, bio18_test, bio19_test), axis=1)

if is_covariance_reduction:
    poly = PolynomialFeatures(2)
    bio_vars = poly.fit_transform(bio_vars)
    train_locs = np.concatenate((train_locs, bio_vars), axis=1)
    cor_matrix = pd.DataFrame(train_locs).corr().abs()
    upper = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    train_locs_reduced = np.delete(train_locs, to_drop, axis=1)
    
    bio_vars_test = poly.transform(bio_vars_test)
    cor_matrix = pd.DataFrame(test_locs).corr().abs()
    upper = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    test_locs_reduced = np.delete(test_locs, to_drop, axis=1)
else:
    train_locs_reduced = np.concatenate((train_locs, bio_vars), axis=1)
    test_locs_reduced = np.concatenate((test_locs, bio_vars_test), axis=1)

# should put it before normalization
z_scores_train = np.abs(zscore(train_locs_reduced))
train_outliers = np.where(z_scores_train > 3)
train_locs_reduced = np.delete(train_locs_reduced, train_outliers[0], axis=0)
train_ids = np.delete(train_ids, train_outliers[0],axis=0)

# label_encoder = LabelEncoder()

# # arbitraryly set one label to 0 for negative data.
# train_ids[2431] = 0
# encoded_train_ids = label_encoder.fit_transform(train_ids)

# train_ids_matrix = np.zeros((len(encoded_train_ids), 501))
# train_ids_matrix[np.arange(len(encoded_train_ids)), encoded_train_ids] = 1

# # construct labels matrix
# encoded_test_ids = label_encoder.transform(labels)
# labels_matrix = np.zeros((len(encoded_test_ids), 501))
# labels_matrix[np.arange(len(encoded_test_ids)), encoded_test_ids] = 1

# new_train_locs = np.concatenate((train_locs, bio1_train, bio2_train, bio3_train, bio4_train, bio5_train, bio6_train, bio7_train, bio8_train, bio9_train, bio10_train, bio11_train, bio12_train, bio13_train, bio14_train, bio15_train, bio16_train, bio17_train, bio18_train, bio19_train), axis=1)
# new_train_locs = np.concatenate((new_train_locs, df['cluster'].values.reshape(-1, 1)), axis=1)

## Get the worlclim data of a given location

In [12]:
worldclim = rasterio.open('wc2.1_10m_bio/wc2.1_10m_bio_2.tif')
lon = 73.848976
lat = 16.877283
coords = [(lon, lat)]
transformed_coords = [worldclim.index(lon, lat) for lon, lat in coords]
data = worldclim.read(1)
value = data[transformed_coords[0][0], transformed_coords[0][1]]

print(value) 

10.597021


## ML Models

In [7]:
y_binary = (train_ids == 3117).astype(int)
num_positive = np.sum(y_binary)
num_negative = len(y_binary) - num_positive
print(f"Number of positive samples: {num_positive}, number of negative samples: {num_negative}")


Number of positive samples: 1123, number of negative samples: 252176


In [13]:
# random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [10, 20, 30],
#     'max_features': ['sqrt', 'log2', None]
# }

# rf = RandomForestClassifier()
# model = GridSearchCV(rf, param_grid, cv=5, scoring='f1_micro', n_jobs=-1)
# model.fit(train_locs, train_ids)

model = RandomForestClassifier(n_estimators=100)
model.fit(train_locs_reduced, train_ids)
pred = model.predict(test_locs_reduced)
print(f1_score(labels, pred, average='micro'), ", micro")
print(f1_score(labels, pred, average='macro'), ", macro")
# n=40, f1 = 0.37, single pred, coords only
# n=100, f1 = , single pred, coords + bio vars gaussian normal, cor > 0.95 removed, kmeans cluster 24


ValueError: Expected 2D array, got 1D array instead:
array=[ 31529.  31529.  31529. ... 145031. 145031. 145031.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [10]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
print(f1_score(labels, pred, average='micro'), ", micro")
print(f1_score(labels, pred, average='macro'), ", macro")
print("precision", precision_score(labels, pred, average='micro'), ", micro")
print("precision", precision_score(labels, pred, average='macro'), ", macro")
print("recall", recall_score(labels, pred, average='micro'), ", micro")
print("recall", recall_score(labels, pred, average='macro'), ", macro")
print(accuracy_score(labels, pred), ", accuracy")

0.009225258744559596 , micro
0.004416482045455576 , macro
precision 0.009225258744559596 , micro
precision 0.005410190096162402 , macro
recall 0.009225258744559596 , micro
recall 0.012831135933982968 , macro
0.009225258744559596 , accuracy


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# use naive bayes
from sklearn.naive_bayes import GaussianNB
model = GaussianNB(var_smoothing=0.2)
model.fit(train_locs_reduced, train_ids)
pred = model.predict(test_locs_reduced)
f1 = f1_score(labels, pred, average='micro')
print(f1)
# 0.099, single pred, coords only
# 0.007, single pred, coords + bio datapp
# 0.157, single pred, coords + bio data, z-score normalization

0.01037060689569002


In [None]:
from sklearn.svm import LinearSVC
model = LinearSVC(multi_class='ovr')
model.fit(train_locs_reduced, train_ids)
pred = model.predict(test_locs_reduced)
f1 = f1_score(labels, pred, average='micro')
print(f1)



In [39]:
# use KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# scaled_train_locs = scaler.fit_transform(train_locs)
# scaled_val_locs = scaler.transform(val_locs)
model = KNeighborsClassifier(n_neighbors=20, leaf_size=20, algorithm='auto')
model.fit(train_locs_reduced, train_ids)
pred = model.predict(test_locs_reduced)
f1 = f1_score(labels, pred, average='micro')
print(f1)
# n=3, f1 = 0.33, single pred, coords only
# n=3, f1 = 0.33, single pred, coords + bio data
# n=20, leaf_size=20, f1 = 0.383, single pred, coords + bio data, z-score normalization

0.06873477207571793


[ 4636  4636  4636 ... 20504 20504 20504]


In [11]:
# decision tree
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(train_locs, train_ids)
pred = model.predict(val_locs)
f1 = f1_score(val_ids, pred, average='micro')
print(f1)

0.3463645052198206


In [12]:
# gradient boosting
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

xgb = XGBClassifier()
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7]
}

model = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='f1_micro')
model.fit(train_locs, train_ids)
print(model.best_params_)

ValueError: 
All the 135 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\zhang\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\zhang\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "c:\Users\zhang\AppData\Local\Programs\Python\Python312\Lib\site-packages\xgboost\sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341
 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359
 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395
 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413
 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431
 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449
 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467
 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485
 486 487 488 489 490 491 492 493 494 495 496 497 498 499], got [    458     460     487     585     871     880     890    1026    1078
    1224    1264    1630    1692    1850    2071    2203    2229    2264
    3045    3069    3117    3340    3544    3623    3734    3813    3914
    4146    4191    4208    4309    4345    4535    4569    4636    4661
    4665    4729    4765    4793    4805    5150    5165    5174    5231
    5235    5261    5365    5367    5376    5401    5430    5532    5612
    5920    5947    6023    6228    6350    6364    6969    6976    7058
    7114    7178    7244    7384    7497    7562    7576    7703    7729
    7730    7802    7806    7868    7920    8076    8077    8079    8217
    8269    8277    8399    8446    8454    8520    8705    8847    8909
    8912    9081    9083    9152    9296    9351    9477    9523    9549
    9602    9612    9626    9639    9771    9832   10079   10090   10168
   10243   10373   10506   10585   10719   10834   11114   11586   11896
   11970   11972   12024   12038   12048   12063   12237   12305   12329
   12443   12526   12641   12707   12716   12735   12746   12764   12781
   12792   12821   12832   12839   12910   12953   13075   13091   13092
   13171   13272   13392   13456   13470   13471   13494   13510   13522
   13532   13540   13632   13709   13820   13851   13920   14012   14030
   14069   14104   14115   14167   14169   14172   14306   14492   14610
   14635   14658   14881   14891   14926   14978   15035   15256   15302
   16006   16079   16623   16765   16782   16972   17009   17050   17076
   17174   17328   17634   18204   18206   18513   19050   19055   19182
   19262   19568   19765   19975   20012   20504   20535   20553   22038
   22084   22973   23027   23455   23570   23611   23968   24266   24379
   24391   24462   24501   24832   24994   25003   25094   25383   25411
   25603   25697   25806   25861   26159   26527   26555   26691   26699
   26745   26799   26882   26916   27097   27123   27163   27225   27236
   27409   27413   27431   27500   27696   27726   27818   27844   27919
   28948   29073   29084   29273   29310   29351   29508   29536   29550
   29764   29808   29976   30268   30285   30358   30395   30487   30822
   30929   30953   31150   31226   31236   31244   31299   31419   31529
   31866   32055   32093   32861   32864   33002   33142   33155   33297
   34141   34154   35046   35161   35738   35854   35990   36107   36149
   36281   36315   36316   37213   37431   37465   37721   37920   38008
   38992   40281   40323   40523   40908   41090   41124   41147   41301
   41387   41774   41824   41886   41939   42056   42087   42122   42138
   42220   42223   42228   42328   42332   42336   42380   42657   42715
   42722   42888   42961   43047   43115   43138   43145   43236   43353
   43375   43448   43455   43523   43530   43567   43606   43712   43759
   43840   43848   44104   44570   45932   46014   46116   46180   46217
   46296   46471   46994   47049   47062   47107   54549   59815   60348
   62797   64387   64970   64973   64985   65212   65373   65864   65987
   66020   67105   67185   67188   67819   68339   69696   70725   72667
   72731   73106   73172   73224   73475   73801   73849   73903   74204
   74321   74443   93486   96386   98781  111451  113754  116798  116868
  116872  116950  117054  117079  117097  117180  144243  144298  144460
  144468  144469  144531  144646  144659  144743  144761  144764  144768
  144795  144797  144815  145031  145085  145187  145189  145300  145308
  145310  145338  180001  180008  199841  201113  201178  204449  204473
  204497  204523  204544  225741  225751  318695  318747  326305  339674
  339693  343021  367535  367550  367693  370402  423656  438022  472766
  472770  472799  476523  508972  508981  512092  512762  517047  517053
  518169  528911  539892  550880  556645  558436  558437  558618  558619
  559131  560425  604225  607212  717903  827392  855311  979665  979677
  979682  980019  980037 1067750 1289422 1289423 1289480 1289491 1289509
 1289606 1289647 1289681 1289689 1368519]


In [9]:
# multiclass logistic regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000, multi_class='multinomial')
model.fit(train_locs, train_ids)
pred = model.predict(val_locs)
f1 = f1_score(val_ids, pred, average='micro')
print(f1)

KeyboardInterrupt: 

In [None]:
# use SVM
from sklearn.svm import SVC

# randomly discard 90% of data
train_locs = train_locs[::10]
train_ids = train_ids[::10]

model = SVC(kernel='linear')
model.fit(train_locs, train_ids)
pred = model.predict(val_locs)
f1 = f1_score(val_ids, pred, average='micro')
print(f1)

In [7]:
from sklearn.metrics import f1_score
f1 = f1_score(val_ids, pred, average='weighted')
print('F1 score:', f1)

F1 score: 0.3684784908003791
