Use the extracted feature metrics to train a target-decoy classifier. This will be used to classify features extracted for all identifications in the experiment library.

In [126]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [127]:
df = pd.read_pickle('/Users/darylwilding-mcbride/Downloads/experiments/dwm-test/target-decoy-models/library_sequences_in_run_190719_Hela_Ecoli_1to1_01.pkl')

In [148]:
df.columns

Index(['file_idx', 'sequence', 'charge', 'run_mz', 'run_scan', 'run_rt',
       'run_intensity', 'cv_mz', 'cv_scan', 'cv_rt', 'cv_intensity',
       'theoretical_mz', 'experiment_scan_mean', 'experiment_scan_std_dev',
       'experiment_scan_peak_width', 'experiment_rt_mean',
       'experiment_rt_std_dev', 'experiment_rt_peak_width',
       'experiment_intensity_mean', 'experiment_intensity_std_dev',
       'number_of_runs_identified', 'delta_mz', 'delta_mz_ppm', 'delta_scan',
       'delta_rt', 'target_coords', 'decoy_coords', 'target_metrics',
       'decoy_metrics'],
      dtype='object')

In [128]:
metrics = []
for row in df.itertuples():
    # target metrics
    d = row.target_metrics
    l = [d[key] for key in sorted(d)]
    l.append('target')
    metrics.append(tuple(l))
    # decoy metrics
    d = row.decoy_metrics
    l = [d[key] for key in sorted(d)]
    l.append('decoy')
    metrics.append(tuple(l))

In [129]:
metrics_names = [key for key in sorted(df.iloc[0].target_metrics)]
columns = metrics_names.copy()
columns.append('class_name')

In [130]:
metrics_names

['delta_mz_ppm',
 'delta_rt',
 'delta_scan',
 'fwhm_rt_0',
 'fwhm_scan_0',
 'geometric_mean_0_1',
 'geometric_mean_0_1_2',
 'isotope_0_1_mz_delta_ppm',
 'isotope_0_1_rt_delta',
 'isotope_0_1_scan_delta',
 'isotope_0_2_mz_delta_ppm',
 'isotope_0_2_rt_delta',
 'isotope_0_2_scan_delta',
 'monoisotope_auc_over_isotope_peak_auc_sum',
 'monoisotope_int_over_isotope_peak_int_sum',
 'mz_delta_ppm_std_dev_0',
 'mz_delta_ppm_std_dev_1',
 'number_of_frames_0',
 'number_of_frames_1',
 'number_of_frames_2',
 'number_of_missing_frames_0',
 'number_of_missing_frames_1',
 'number_of_missing_frames_2',
 'peak_base_width_rt_0',
 'peak_base_width_scan_0',
 'r_squared_phr',
 'rt_isotope_correlation',
 'rt_peak_symmetry_0',
 'rt_peak_symmetry_1',
 'rt_peak_symmetry_2',
 'scan_isotope_correlation',
 'scan_peak_symmetry_0',
 'scan_peak_symmetry_1',
 'scan_peak_symmetry_2']

In [131]:
metrics_df = pd.DataFrame(metrics, columns=columns)

In [132]:
metrics_df.head()

Unnamed: 0,delta_mz_ppm,delta_rt,delta_scan,fwhm_rt_0,fwhm_scan_0,geometric_mean_0_1,geometric_mean_0_1_2,isotope_0_1_mz_delta_ppm,isotope_0_1_rt_delta,isotope_0_1_scan_delta,...,r_squared_phr,rt_isotope_correlation,rt_peak_symmetry_0,rt_peak_symmetry_1,rt_peak_symmetry_2,scan_isotope_correlation,scan_peak_symmetry_0,scan_peak_symmetry_1,scan_peak_symmetry_2,class_name
0,0.83847,-0.001916,-0.058829,4.514827,32.5425,10.792904,10.743355,1.517528,-0.000721,0.002153,...,-3.032418,0.59171,0.729545,0.87526,1.076128,0.027259,0.870939,1.20203,0.892236,target
1,,,,,,,,,,,...,,,,,,,,,,decoy
2,0.12278,9.9e-05,0.009747,3.784793,11.950123,12.001511,11.697028,-0.393996,-0.000121,0.00112,...,-1.996434,0.925682,0.744829,0.917704,0.740364,0.486377,0.822585,1.069086,0.914356,target
3,,,,,,,,,,,...,,,,,,,,,,decoy
4,-0.498795,2.8e-05,-0.000129,3.480777,19.469426,10.782247,10.736806,1.671152,3.4e-05,0.003303,...,-0.837658,0.842586,0.647288,1.449018,1.408857,0.523684,0.977183,1.199703,1.188001,target


In [133]:
metrics_df.fillna(value=0.0, inplace=True)
metrics_df.r_squared_phr.replace((-np.inf, 0), inplace=True)

In [134]:
metrics_df.head()

Unnamed: 0,delta_mz_ppm,delta_rt,delta_scan,fwhm_rt_0,fwhm_scan_0,geometric_mean_0_1,geometric_mean_0_1_2,isotope_0_1_mz_delta_ppm,isotope_0_1_rt_delta,isotope_0_1_scan_delta,...,r_squared_phr,rt_isotope_correlation,rt_peak_symmetry_0,rt_peak_symmetry_1,rt_peak_symmetry_2,scan_isotope_correlation,scan_peak_symmetry_0,scan_peak_symmetry_1,scan_peak_symmetry_2,class_name
0,0.83847,-0.001916,-0.058829,4.514827,32.5425,10.792904,10.743355,1.517528,-0.000721,0.002153,...,-3.032418,0.59171,0.729545,0.87526,1.076128,0.027259,0.870939,1.20203,0.892236,target
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-3.032418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,decoy
2,0.12278,9.9e-05,0.009747,3.784793,11.950123,12.001511,11.697028,-0.393996,-0.000121,0.00112,...,-1.996434,0.925682,0.744829,0.917704,0.740364,0.486377,0.822585,1.069086,0.914356,target
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.996434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,decoy
4,-0.498795,2.8e-05,-0.000129,3.480777,19.469426,10.782247,10.736806,1.671152,3.4e-05,0.003303,...,-0.837658,0.842586,0.647288,1.449018,1.408857,0.523684,0.977183,1.199703,1.188001,target


In [135]:
X = metrics_df[metrics_names].values
y = metrics_df[['class_name']].values[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [136]:
from sklearn.preprocessing import MinMaxScaler

In [137]:
scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

In [138]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [143]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    gb.fit(X_train, y_train)
    print("learning rate: {}, accuracy score (training): {}, accuracy score (test): {}".format(learning_rate, round(gb.score(X_train, y_train),3), round(gb.score(X_test, y_test),3)))

learning rate: 0.05, accuracy score (training): 0.956, accuracy score (test): 0.9
learning rate: 0.1, accuracy score (training): 0.961, accuracy score (test): 0.9
learning rate: 0.25, accuracy score (training): 0.994, accuracy score (test): 0.9
learning rate: 0.5, accuracy score (training): 1.0, accuracy score (test): 0.9
learning rate: 0.75, accuracy score (training): 1.0, accuracy score (test): 0.9
learning rate: 1, accuracy score (training): 1.0, accuracy score (test): 0.9


In [146]:
gb = GradientBoostingClassifier(n_estimators=20, learning_rate = 0.5, max_features=2, max_depth = 2, random_state = 0)
gb.fit(X_train, y_train)
predictions = gb.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions, normalize='all'))
print()
print(confusion_matrix(y_test, predictions).ravel())
print()
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix:
[[0.25 0.05]
 [0.05 0.65]]

[ 5  1  1 13]

Classification Report
              precision    recall  f1-score   support

       decoy       0.83      0.83      0.83         6
      target       0.93      0.93      0.93        14

    accuracy                           0.90        20
   macro avg       0.88      0.88      0.88        20
weighted avg       0.90      0.90      0.90        20

