In [1]:
import os
import time
from datetime import datetime

import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from Utils.Dataloader import ML_dataloader
%load_ext autoreload
%autoreload 2
RANDOM_SEED = 1234
from Utils.Helper import train_mission

In [2]:
# Queue one RF mission per (input type, site) pair. Each mission trains and
# tests on the same single site; idx numbers the missions 0..14 in creation
# order (all 'Both' sites first, then 'S1', then 'L8').
train_mission_list = []
idx = 0
# The loop variable is named data_type rather than `type`: a notebook-level
# variable called `type` would shadow the builtin for every later cell.
for data_type in ['Both', 'S1', 'L8']:
  for site in [1, 3, 5, 7, 9]:
    mission = train_mission(
      train_site=[site],
      test_site=[site],
      idx=idx,
      type=data_type,
      model_name='RF',
    )
    train_mission_list.append(mission)
    idx += 1

In [3]:
# Fit and score one random forest for every queued mission.
for mission in train_mission_list:
  # Missions with index 0 and 1 are skipped; the recorded output below starts
  # at Index:2. NOTE(review): presumably those two were finished in an earlier
  # session -- confirm before relying on a fresh Restart & Run All.
  if mission.index <= 1:
    continue

  mission.mission_start()

  # Training split for this mission.
  X_train, Y_train = ML_dataloader(
    sites=mission.train_site,
    years=mission.train_years,
    type=mission.type,
    verbose=True,
  )
  # Drop length-1 axes from the labels (presumably (n, 1) -> (n,); confirm).
  Y_train = Y_train.squeeze()

  # Held-out split for this mission.
  X_test, Y_test = ML_dataloader(
    sites=mission.test_site,
    years=mission.test_years,
    type=mission.type,
    verbose=True,
  )
  Y_test = Y_test.squeeze()

  rf = RandomForestClassifier(
    n_estimators=1200,
    max_depth=24,
    random_state=RANDOM_SEED,
    n_jobs=14,
  )
  rf.fit(X_train, Y_train)

  # Despite the X_ prefix these hold predicted labels, not features.
  X_predict_test = rf.predict(X_test)
  X_predict_train = rf.predict(X_train)

  mission.mission_get_score(Y_train, X_predict_train, Y_test, X_predict_test)

2022-04-29 13:48:10 Index:2, Model: RF, train_site: 5, test_site: 5, type: Both
Loaded 67 images in site 5, year 2017
Loaded 67 images in site 5, year 2018
Loaded 67 images in site 5, year 2019
Completed! Loaded 201 images in total.
Loaded 67 images in site 5, year 2020
Completed! Loaded 67 images in total.
2022-04-29 20:57:28 Index:3, Model: RF, train_site: 7, test_site: 7, type: Both
Loaded 42 images in site 7, year 2017
Loaded 19 images in site 7, year 2018
Loaded 42 images in site 7, year 2019
Completed! Loaded 103 images in total.
Loaded 42 images in site 7, year 2020
Completed! Loaded 42 images in total.
2022-04-30 00:14:59 Index:4, Model: RF, train_site: 9, test_site: 9, type: Both
Loaded 42 images in site 9, year 2017
Loaded 42 images in site 9, year 2018
Loaded 42 images in site 9, year 2019
Completed! Loaded 126 images in total.
Loaded 42 images in site 9, year 2020
Completed! Loaded 42 images in total.


In [None]:
# Data selection for the single-site experiment below: train on site 3 for
# 2017-2019 and test on the same site for 2020, using the "Both" input type.
TRAINING_DICT = {
    "years": [2017, 2018, 2019],
    "sites": [3],
    "type": "Both",
}
TESTING_DICT = {
    "years": [2020],
    "sites": [3],
    "type": "Both",
}

In [None]:
# Hyper-parameter grid for the manual cross-validation loop further down,
# which reads Parameters["n_estimators"] and Parameters["max_depth"].
# It has to be a live definition: left commented out, that loop raises
# NameError on a fresh kernel.
Parameters = {"n_estimators": [1200, 2000], "max_depth": [36, None]}


In [None]:
# Load the training split described by TRAINING_DICT.
X_train, Y_train = ML_dataloader(
    sites=TRAINING_DICT["sites"],
    years=TRAINING_DICT["years"],
    type=TRAINING_DICT["type"],
    verbose=True,
)
# Drop length-1 axes from the labels (presumably (n, 1) -> (n,); confirm).
Y_train = Y_train.squeeze()

# Load the held-out split described by TESTING_DICT.
X_test, Y_test = ML_dataloader(
    sites=TESTING_DICT["sites"],
    years=TESTING_DICT["years"],
    type=TESTING_DICT["type"],
    verbose=True,
)
Y_test = Y_test.squeeze()

# Optional NaN sanity check, currently disabled:
# assert(np.any(np.isnan(X_train)) == False and np.any(np.isnan(Y_train)) == False)

Loaded 49 images in site 3, year 2017
Loaded 49 images in site 3, year 2018
Loaded 49 images in site 3, year 2019
Completed! Loaded 147 images in total.
Loaded 49 images in site 3, year 2020
Completed! Loaded 49 images in total.


In [None]:
# Fit one random forest on the loaded training split and predict labels for
# the held-out split. random_state is fixed through RANDOM_SEED.
rf = RandomForestClassifier(
    n_estimators=1200,
    max_depth=36,
    random_state=RANDOM_SEED,
    n_jobs=14,
)
rf.fit(X_train, Y_train)

# Despite the X_ prefix this holds predicted labels for X_test.
X_predict = rf.predict(X_test)

In [None]:
# Score the held-out predictions with accuracy, F1 and Cohen's kappa
# (printed under the label "coppa"), in that order.
# NOTE(review): f1_score is called with its defaults, so the labels are
# presumably binary -- confirm.
acc, f1, coppa = (
    metric(Y_test, X_predict)
    for metric in (accuracy_score, f1_score, cohen_kappa_score)
)
print("acc: {:.4f}, f1: {:.4f}, coppa: {:.4f}".format(acc, f1, coppa))

acc: 0.8666, f1: 0.8364, coppa: 0.7243


In [None]:
# Manual grid search: 5-fold cross-validate a random forest for every
# (n_estimators, max_depth) pair in Parameters, print and log each result,
# and remember the best-scoring pair.
log_path = "./Logs/rf_tuning_log.txt"
# Create the log directory before any training starts, so a missing folder
# cannot raise after a cross-validation run has already finished.
os.makedirs(os.path.dirname(log_path), exist_ok=True)

best_param = None
# Score a configuration has to beat; best_param stays None if none does.
# NOTE(review): presumably the best mean CV score of an earlier tuning run -- confirm.
best_score = 0.8452251565867457
for n_estimator in Parameters["n_estimators"]:
  for max_depth in Parameters["max_depth"]:
    rf = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth, random_state=RANDOM_SEED, n_jobs= 14)
    score = cross_val_score(rf, X_train, Y_train, cv=5)
    mean_score = np.mean(score)
    curr_time = datetime.now()

    curr_time_str = curr_time.strftime("%d/%m/%Y %H:%M")
    # stdout shows only the mean; the log line keeps the per-fold scores too.
    print("{t}:   n estimator: {n}, max depth: {d}, score: {s} ".format(n= n_estimator, d=max_depth, s=mean_score, t=curr_time_str))
    # Append and close once per configuration so each finished result is
    # written out even if a later configuration fails; `with` closes the
    # handle even when write() raises.
    with open(log_path, "a") as f:
      f.write("{t}:   n estimator: {n}, max depth: {d}, score: {s}, mean_score: {m}\n".format(n= n_estimator, d=max_depth, s=score, t=curr_time_str, m = mean_score))
    if mean_score > best_score:
      print("======Update best score!======")
      best_param = (n_estimator, max_depth)
      best_score = mean_score
print(best_param)
print(best_score)





18/04/2022 23:14:   n estimator: 1200, max depth: 36, score: 0.8451846418709591 
19/04/2022 18:51:   n estimator: 1200, max depth: None, score: 0.8452452560950968 
21/04/2022 01:24:   n estimator: 2000, max depth: 36, score: 0.8452888225686961 




22/04/2022 08:08:   n estimator: 2000, max depth: None, score: 0.8453551193763469 
(2000, None)
0.8453551193763469
