In [2]:
import pandas as pd
import time

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import GammaRegressor, TweedieRegressor, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import xgboost as xgb

In [3]:
# Training inputs: this frame is what gets passed as `x_dataset` to
# model_optimization in the tuning cells.
x = pd.read_csv(filepath_or_buffer='./data/x_value_train.csv')
x

Unnamed: 0,0pIC50,1pIC50,2pIC50,3pIC50,4pIC50,5pIC50,6pIC50,7pIC50,8pIC50,9pIC50,...,1014pIC50,1015pIC50,1016pIC50,1017pIC50,1018pIC50,1019pIC50,1020pIC50,1021pIC50,1022pIC50,1023pIC50
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
464,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
465,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
466,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [4]:
# Training targets: this frame is what gets passed as `y_dataset` to
# model_optimization in the tuning cells.
y = pd.read_csv(filepath_or_buffer='./data/y_value_train.csv')
y

Unnamed: 0,Standard Value
0,9.154902
1,5.000000
2,6.625252
3,5.000000
4,8.455932
...,...
463,4.100000
464,7.619789
465,5.313185
466,8.585027


In [5]:
def model_optimization(x_dataset, y_dataset, estimators):
    """Grid-search the hyperparameters of one regressor and return the best model.

    The data is split 80/20 with a fixed seed (``random_state=64``). The grid
    search is scored with R^2 and fitted on the 80% training part only; the
    20% test part is produced by the split but is not used in this function.

    Parameters
    ----------
    x_dataset : object exposing ``.values`` (a pandas DataFrame in this notebook)
        Feature table.
    y_dataset : object exposing ``.values`` (a pandas DataFrame in this notebook)
        Target table. It is flattened to 1-D for every model except
        ``'XGBRegressor'``, which receives the array as returned by ``.values``.
    estimators : str
        Name of the model to tune. One of ``'GammaRegressor'``,
        ``'TweedieRegressor'``, ``'BayesianRidge'``, ``'RandomForestRegressor'``,
        ``'LGBMRegressor'`` or ``'XGBRegressor'``.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted ``GridSearchCV``.

    Raises
    ------
    ValueError
        If ``estimators`` is not one of the supported names.
    """
    start = time.time()

    supported = (
        'GammaRegressor',
        'TweedieRegressor',
        'BayesianRidge',
        'RandomForestRegressor',
        'LGBMRegressor',
        'XGBRegressor',
    )
    # Fail fast with a clear error. Previously an unknown name only printed a
    # hint and then crashed further down because x_data was never assigned.
    if estimators not in supported:
        raise ValueError(
            'Please specify the model in the list: %s (got %r)'
            % (', '.join(supported), estimators)
        )

    # Resolve the model classes locally instead of through notebook globals.
    # The tuning cells rebind names such as `GammaRegressor` and `XGBRegressor`
    # to fitted estimator instances, so a global lookup would find an instance
    # (not the class) when a tuning cell is re-run without restarting the kernel.
    from sklearn.linear_model import GammaRegressor, TweedieRegressor, BayesianRidge
    from sklearn.ensemble import RandomForestRegressor
    from xgboost import XGBRegressor
    from lightgbm import LGBMRegressor

    x_data = x_dataset.values
    y_data = y_dataset.values
    if estimators != 'XGBRegressor':
        # Every model except XGBRegressor is given a flat 1-D target vector.
        y_data = y_data.flatten()

    # x_test / y_test are the held-out 20%; they are not evaluated here.
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state = 64)

    if estimators == 'GammaRegressor':
        model = GammaRegressor()
        param_grid = {
            'alpha': [0.01, 0.1, 1.0, 10.0, 100.0],
            'max_iter': [200, 1000, 2000, 3000],
            'tol': [1e-4, 1e-3, 1e-2]
        }

    elif estimators == 'TweedieRegressor':
        model = TweedieRegressor(link='log')
        param_grid = {
            'power': [1, 1.5, 2, 2.5, 3],
            'alpha': [0.01, 0.1, 1.0, 10.0],
            'max_iter': [1000, 2000, 3000],
            'tol': [1e-4, 1e-3, 1e-2]
        }

    elif estimators == 'BayesianRidge':
        model = BayesianRidge()
        param_grid = {
            'alpha_1': [1e-6, 1e-5, 1e-4],
            'alpha_2': [1e-6, 1e-5, 1e-4],
            'lambda_1': [1e-6, 1e-5, 1e-4],
            'lambda_2': [1e-6, 1e-5, 1e-4],
            'max_iter': [300, 1000, 2000, 3000],
            'tol': [1e-4, 1e-3, 1e-2]
        }

    elif estimators == 'XGBRegressor':
        model = XGBRegressor()
        param_grid = {
            'n_estimators': [50, 100, 200, 500],
            'learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
            'max_depth': [2, 3, 5, 10]
        }

    elif estimators == 'RandomForestRegressor':
        model = RandomForestRegressor(random_state = 64)
        param_grid = {
            'bootstrap': [True, False],
            'max_features': ['log2', 'sqrt', None],
            'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9]
        }

    else:  # 'LGBMRegressor' -- the only remaining supported name
        model = LGBMRegressor(random_state = 64)
        param_grid = {
            'learning_rate': [0.01, 0.015, 0.025, 0.05, 0.1],
        }

    # Exhaustive search over param_grid; cv is left at the GridSearchCV default.
    regression_grid = GridSearchCV(model, param_grid = param_grid, scoring = 'r2')
    regression_grid.fit(X = x_train, y = y_train)

    best_estimator = regression_grid.best_estimator_
    best_score = regression_grid.best_score_
    best_param = regression_grid.best_params_

    print('The best hyperparameters:', best_estimator)
    print('The best score:', best_score)
    print('The best param:', best_param)
    print('Time for optimization: %f seconds' %(time.time()-start), flush=True)
    print('*********************************')

    return best_estimator



In [6]:
# Tune each model in turn; every call prints its own summary block.
# NOTE: each assignment rebinds a name that the import cell bound to a model
# class. After this cell runs, these names refer to the tuned estimator
# instances returned by model_optimization, not to the classes.
GammaRegressor = model_optimization(x, y, estimators='GammaRegressor')
TweedieRegressor = model_optimization(x, y, estimators='TweedieRegressor')
BayesianRidge = model_optimization(x, y, estimators='BayesianRidge')
RandomForestRegressor = model_optimization(x, y, estimators='RandomForestRegressor')
LGBMRegressor = model_optimization(x, y, estimators='LGBMRegressor')

The best hyperparameters: GammaRegressor(alpha=0.1, max_iter=200)
The best score: 0.6502610741960144
The best param: {'alpha': 0.1, 'max_iter': 200, 'tol': 0.0001}
Time for optimization: 4.514819 seconds
*********************************
The best hyperparameters: TweedieRegressor(alpha=0.1, link='log', max_iter=1000, power=1.5)
The best score: 0.6609383483338227
The best param: {'alpha': 0.1, 'max_iter': 1000, 'power': 1.5, 'tol': 0.0001}
Time for optimization: 15.326348 seconds
*********************************
The best hyperparameters: BayesianRidge(alpha_2=0.0001, lambda_1=0.0001, max_iter=300, tol=0.0001)
The best score: 0.6649245926581185
The best param: {'alpha_1': 1e-06, 'alpha_2': 0.0001, 'lambda_1': 0.0001, 'lambda_2': 1e-06, 'max_iter': 300, 'tol': 0.0001}
Time for optimization: 301.308041 seconds
*********************************
The best hyperparameters: RandomForestRegressor(bootstrap=False, max_features='sqrt', min_samples_split=3,
                      random_state=64)
T

In [7]:
# Tune XGBoost in its own cell.
# NOTE: this rebinds `XGBRegressor` from the imported class to the tuned
# estimator instance returned by model_optimization.
XGBRegressor = model_optimization(x, y, estimators='XGBRegressor')

The best hyperparameters: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=5, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=100, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
The best score: 0.6820488897375878
The best param: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Time for optimization: 102.796284 seconds
*********************************
