In [1]:
%reload_ext autoreload
%autoreload 2
import ast
import pickle
import itertools
from collections import Counter
from tqdm import tqdm
import pandas as pd
import os

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import cv2
import swifter

from analysis.generate_cluster_information_file import load, extract_all_information_query, to_df_query
from baseline.image_processing import pixel_intensity_histogram


from sklearn.model_selection import cross_val_score
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
train_X = pd.read_csv('train_X.csv')
train_y = pd.read_csv('train_y.csv')

In [7]:
train_X.head()

Unnamed: 0,cluster_num,cluster_num_intensities_avg,cluster_peak_intensities_avg,cluster_x_avg,cluster_y_avg
0,50.0,6.68,14.92,513.46,475.32
1,104.0,5.692308,11.375,536.201923,473.798077
2,71.0,5.605634,11.56338,464.971831,513.746479
3,49.0,8.428571,22.612245,417.285714,564.571429
4,169.0,3.284024,6.621302,462.224852,509.230769


In [3]:
scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
train_y_scaled = scaler.fit_transform(train_y)
train_X_na = train_X.fillna(0)
train_X_na_scaled = scaler.fit_transform(train_X_na)

### Tuning XGBoost

In [6]:
"""
    Tunes hyperparameters for xgbosot
"""
hyperparameters = { 
    'n_estimators': [100, 500, 1000, 5000],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 0.7],
    'max_depth': [3, 6, 8],
    'booster': ['gbtree', 'gblinear'],
    'subsample': [0.5, 0.7, 1],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'gamma': [0, 0.1, 0.5]
}

clf = RandomizedSearchCV(xgboost.XGBRegressor(objective='reg:squarederror'), 
                         hyperparameters, scoring=make_scorer(mean_absolute_error),
                         random_state=1, n_iter=100, cv=10, verbose=1, n_jobs=-1)
clf.fit(train_X_scaled, train_y)
clf.__dict__

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed: 29.5min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 35.4min finished


{'param_distributions': {'n_estimators': [100, 500, 1000, 5000],
  'learning_rate': [0.01, 0.05, 0.1, 0.5, 0.7],
  'max_depth': [3, 6, 8],
  'booster': ['gbtree', 'gblinear'],
  'subsample': [0.5, 0.7, 1],
  'colsample_bytree': [0.5, 0.7, 1.0],
  'gamma': [0, 0.1, 0.5]},
 'n_iter': 100,
 'random_state': 1,
 'scoring': None,
 'estimator': XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              objective='reg:squarederror', random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, validate_parameters=False, verbosity=None

In [18]:
print(clf.best_estimator_)
print('best score: ', clf.best_score_)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1.0, gamma=0.5, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.01, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.7, tree_method=None,
             validate_parameters=False, verbosity=None)
best score:  0.4477113293611311


In [None]:
"""
    Tunes hyperparameters for RandomForest
"""
hyperparameters = { 
    'n_estimators': [100, 500, 1000, 5000],
    'criterion' : ['mse', 'mae'],
    'max_depth': [3, 6, 8, 20, None],
    'min_samples_split' : [2, 4, 8, 16],
    'min_samples_leaf' : [1, 2, 4],
    'max_features' : ['auto', 'sqrt', 'log2']
}

randomforest = RandomForestRegressor()

clfRF = RandomizedSearchCV(randomforest, 
                         hyperparameters, 
                         random_state=1, n_iter=100, cv=10, verbose=2, n_jobs=-1)
clfRF.fit(train_X_na_scaled, np.ravel(train_y_scaled))
clfRF.__dict__

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  6.4min


In [6]:
randomforest = RandomForestRegressor()
randomforest.fit(train_X_na_scaled, np.ravel(train_y_scaled))

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)