In [82]:
import pandas as pd
import pickle
import glob
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV,ShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

#### model train/test set preparation

In [7]:
experiment_base_dir = '/media/big-ssd/experiments'
experiment_name = 'P3856'
feature_detection_method = 'pasef'
run_name = 'P3856_YHE211_1_Slot1-1_1_5104'

In [8]:
EXPERIMENT_DIR = "{}/{}".format(experiment_base_dir, experiment_name)
IDENTIFICATIONS_DIR = '{}/identifications-{}'.format(EXPERIMENT_DIR, feature_detection_method)
IDENTIFICATIONS_FILE = '{}/exp-{}-identifications-{}.pkl'.format(IDENTIFICATIONS_DIR, experiment_name, feature_detection_method)


In [9]:
MAXIMUM_Q_VALUE_FOR_RECAL_TRAINING_SET = 0.1

In [10]:
# load the identifications to use for the training set
with open(IDENTIFICATIONS_FILE, 'rb') as handle:
    idents_df = pickle.load(handle)['identifications_df']
idents_df = idents_df[(idents_df['percolator q-value'] <= MAXIMUM_Q_VALUE_FOR_RECAL_TRAINING_SET)]
idents_df = idents_df[(idents_df.run_name == run_name)]
print('loaded {} identifications with q-value lower than {} from {}'.format(len(idents_df), MAXIMUM_Q_VALUE_FOR_RECAL_TRAINING_SET, IDENTIFICATIONS_FILE))

# load the features for recalibration
FEATURES_DIR = '{}/features-{}'.format(EXPERIMENT_DIR, feature_detection_method)
feature_files = glob.glob("{}/exp-{}-run-*-features-{}-dedup.pkl".format(FEATURES_DIR, experiment_name, feature_detection_method))
features_l = []
for f in feature_files:
    with open(f, 'rb') as handle:
        features_l.append(pickle.load(handle)['features_df'])
features_df = pd.concat(features_l, axis=0, sort=False)
features_df = features_df[(features_df.run_name == run_name)]
print('loaded {} features from {} files for recalibration'.format(len(features_df), len(feature_files)))


loaded 14235 identifications with q-value lower than 0.1 from /media/big-ssd/experiments/P3856/identifications-pasef/exp-P3856-identifications-pasef.pkl
loaded 65370 features from 1 files for recalibration


In [11]:
X = idents_df[['mono_mz_without_saturated_points','scan_apex','rt_apex','feature_intensity']].to_numpy()
y = idents_df[['mass_error']].to_numpy()[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)


In [12]:
idents_df.isnull().values.any()

False

In [13]:
len(X_train), len(X_test)

(12811, 1424)

#### grid search

In [144]:
param_test1 = {'n_estimators':range(20,81,10)}
gsearch1 = GridSearchCV(estimator=GradientBoostingRegressor(learning_rate=0.05, 
                                                              min_samples_split=500,
                                                              min_samples_leaf=50,
                                                              max_depth=8,
                                                              max_features='sqrt',
                                                              subsample=0.8,
                                                              random_state=10), 
param_grid = param_test1, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5)

In [145]:
gsearch1.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=GradientBoostingRegressor(learning_rate=0.05,
                                                 max_depth=8,
                                                 max_features='sqrt',
                                                 min_samples_leaf=50,
                                                 min_samples_split=500,
                                                 random_state=10,
                                                 subsample=0.8),
             n_jobs=-1, param_grid={'n_estimators': range(20, 81, 10)},
             scoring='neg_mean_absolute_error')

In [146]:
pd.DataFrame(gsearch1.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.295492,0.002927,0.002451,2.3e-05,20,{'n_estimators': 20},-0.004521,-0.004683,-0.004872,-0.004688,-0.00468,-0.004689,0.000111,7
1,0.438993,0.003189,0.003305,5.8e-05,30,{'n_estimators': 30},-0.004478,-0.004627,-0.00483,-0.004648,-0.004648,-0.004646,0.000112,6
2,0.582859,0.007512,0.004133,8e-05,40,{'n_estimators': 40},-0.004451,-0.004604,-0.004816,-0.004628,-0.004639,-0.004628,0.000116,5
3,0.730795,0.004411,0.004854,6.1e-05,50,{'n_estimators': 50},-0.004444,-0.004592,-0.004811,-0.004619,-0.004628,-0.004619,0.000117,4
4,0.882536,0.003551,0.005578,6.2e-05,60,{'n_estimators': 60},-0.004443,-0.004587,-0.004809,-0.004614,-0.004625,-0.004616,0.000117,3
5,1.025048,0.018444,0.005974,0.000451,70,{'n_estimators': 70},-0.00444,-0.004579,-0.004805,-0.004615,-0.004622,-0.004612,0.000117,2
6,0.96476,0.090736,0.005568,4.7e-05,80,{'n_estimators': 80},-0.004444,-0.004577,-0.004806,-0.004613,-0.00462,-0.004612,0.000116,1


In [149]:
gsearch1.best_params_, gsearch1.best_score_

({'n_estimators': 80}, -0.004611954045469387)

In [155]:
param_test2 = {'max_depth':range(5,16,2), 'min_samples_split':range(100,1001,100)}
gsearch2 = GridSearchCV(estimator=GradientBoostingRegressor(learning_rate=0.05, 
                                                              min_samples_leaf=50,
                                                              max_features='sqrt',
                                                              subsample=0.8,
                                                              random_state=10,
                                                              n_estimators=80), 
                        param_grid=param_test2, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5)
gsearch2.fit(X_train, y_train)
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 7, 'min_samples_split': 200}, -0.0045999623225885)

In [158]:
param_test3 = {'min_samples_split':range(100,2100,100), 'min_samples_leaf':range(10,71,10)}
gsearch3 = GridSearchCV(estimator=GradientBoostingRegressor(learning_rate=0.05, 
                                                              max_features='sqrt',
                                                              subsample=0.8,
                                                              random_state=10,
                                                              n_estimators=80,
                                                           max_depth=7), 
                        param_grid=param_test3, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5)
gsearch3.fit(X_train, y_train)
gsearch3.best_params_, gsearch3.best_score_

({'min_samples_leaf': 30, 'min_samples_split': 200}, -0.004599189500370496)

In [160]:
param_test4 = {'max_features':[1,2,3,4]}
gsearch4 = GridSearchCV(estimator=GradientBoostingRegressor(learning_rate=0.05, 
                                                              subsample=0.8,
                                                              random_state=10,
                                                              n_estimators=80,
                                                           max_depth=7,
                                                           min_samples_leaf=30,
                                                           min_samples_split=200), 
                        param_grid=param_test4, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5)
gsearch4.fit(X_train, y_train)
gsearch4.best_params_, gsearch4.best_score_

({'max_features': 2}, -0.004599189500370496)

In [161]:
param_test5 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(estimator=GradientBoostingRegressor(learning_rate=0.05, 
                                                              random_state=10,
                                                              n_estimators=80,
                                                           max_depth=7,
                                                           min_samples_leaf=30,
                                                           min_samples_split=200,
                                                           max_features=2), 
                        param_grid=param_test5, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5)
gsearch5.fit(X_train, y_train)
gsearch5.best_params_, gsearch5.best_score_

({'subsample': 0.8}, -0.004599189500370496)

In [162]:
param_test6 = {'learning_rate':[0.01,0.05,0.001,0.005]}
gsearch6 = GridSearchCV(estimator=GradientBoostingRegressor(random_state=10,
                                                              n_estimators=80,
                                                           max_depth=7,
                                                           min_samples_leaf=30,
                                                           min_samples_split=200,
                                                           max_features=2,
                                                           subsample=0.8), 
                        param_grid=param_test6, scoring='neg_mean_absolute_error', n_jobs=-1, cv=5)
gsearch6.fit(X_train, y_train)
gsearch6.best_params_, gsearch6.best_score_

({'learning_rate': 0.05}, -0.004599189500370496)

In [181]:
estimator=GradientBoostingRegressor(learning_rate=0.05,
                                random_state=10,
                                                              n_estimators=5000,
                                                           max_depth=7,
                                                           min_samples_leaf=30,
                                                           min_samples_split=200,
                                                           max_features=2,
                                                           subsample=0.8)
estimator.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.05, max_depth=7, max_features=2,
                          min_samples_leaf=30, min_samples_split=200,
                          n_estimators=5000, random_state=10, subsample=0.8)

In [180]:
print("R-squared for Train: %.2f" %estimator.score(X_train, y_train))
print("R-squared for Test: %.2f" %estimator.score(X_test, y_test))

R-squared for Train: 1.00
R-squared for Test: -0.33


#### randomised search

In [87]:
parameters = {
    "loss": ['ls','lad','huber'],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    'n_estimators': range(20,510,10),
    'max_depth':range(5,30,2), 
    'min_samples_split':range(100,1001,100),
    'subsample':list(np.arange(0.2,0.9,0.1)),
    'min_samples_leaf':range(10,71,10),
    'max_features':["log2", "sqrt"],
    }

In [88]:
gbr = GradientBoostingRegressor()

In [89]:
rsearch = RandomizedSearchCV(gbr, parameters, n_iter=100, n_jobs=-1, random_state=10, cv=5, scoring='neg_mean_absolute_error')
rsearch.fit(X_train, y_train)
print('best parameters {}, score {}'.format(rsearch.best_params_, rsearch.best_score_))

best parameters {'subsample': 0.8000000000000003, 'n_estimators': 340, 'min_samples_split': 600, 'min_samples_leaf': 50, 'max_features': 'log2', 'max_depth': 11, 'loss': 'lad', 'learning_rate': 0.1}, score -0.004578712097575632


In [90]:
print('R-squared for train: {}'.format(round(rsearch.best_estimator_.score(X_train, y_train),2)))
print('R-squared for test: {}'.format(round(rsearch.best_estimator_.score(X_test, y_test),2)))

R-squared for train: 0.1
R-squared for test: 0.06
