# Model Finalization Steps:
1. Select only 25% of the original number of peptides
2. Reduce features with PCA
3. Recursive Feature Elimination
4. Grid Search classifier hyper-parameters

In [1]:
### Add the parent directory to sys.path so that Classification_Utils can be imported
import os
import sys

module_path = os.path.abspath('..')
# Guard against appending the same entry again when the cell is re-run.
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import Classification_Utils as cu
import numpy as np
from os import listdir
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.externals import joblib
from sklearn.feature_selection import RFE, SelectPercentile
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn import preprocessing
import time

## Load Dataframe

In [3]:
# Build the path with os.path.join instead of a hard-coded backslash: the
# literal '..\F...' only resolves on Windows and contains '\F', which is not
# a valid string escape sequence.
data_path = os.path.join('..', 'FullPeptideQuant.txt')

# Tab-separated table; the 'Peptide' column becomes the row index.
df = pd.read_csv(data_path, sep='\t', index_col='Peptide')
print(df.shape)

(55676, 253)


## Map each column to a corresponding label

In [4]:
# Tissue names used as class labels; they are passed to
# cu.map_tissues_to_columns together with the dataframe.
tissues = [
    'Blood_Plasma',
    'Blood_Serum',
    'CSF',
    'Liver',
    'Monocyte',
    'Ovary',
    'Pancreas',
    'Substantia_Nigra',
    'Temporal_Lobe',
]

tissues_to_columns = cu.map_tissues_to_columns(df, tissues)

In [5]:
# One label per dataframe column, in column order.
column_names = df.columns.tolist()
labels = cu.get_labels(column_names, tissues_to_columns)

In [6]:
# Preview the first rows: peptides are the row index, one column per sample.
df.head()

Unnamed: 0_level_0,Blood_Plasma_CPTAC_TrypDige_undepleted_normal_19Apr13_Methow_13-02-13,Blood_Plasma_Darpa_2_human_02_23Jan17_Arwem_16-10-25,Blood_Plasma_OMICS_EBV_HP_UW001_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW002_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW003_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW004_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW005_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW006_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW007_8Apr16_Arwen_16-01-03,Blood_Plasma_OMICS_EBV_HP_UW008_8Apr16_Arwen_16-01-03,...,Temporal_Lobe_Alz_FX1P159_Guan_1_26Jul10_Andromeda_10-06-28,Temporal_Lobe_Alz_FX1P159_Guan_2_26Jul10_Andromeda_10-06-29,Temporal_Lobe_Alz_FX1P159_Guan_3_26Jul10_Andromeda_10-06-28,Temporal_Lobe_Alz_FX1P159_Guan_Typx2_1_26Jul10_Andromeda_10-06-29,Temporal_Lobe_Alz_FX1P159_Guan_Typx2_2_26Jul10_Andromeda_10-06-28,Temporal_Lobe_Alz_FX1P159_Guan_Typx2_3_26Jul10_Andromeda_10-06-29,Temporal_Lobe_Alz_FX2P57_IMAC_153_9May11_Hawk_11-04-02p,Temporal_Lobe_Alz_FX2P57_IMAC_161_20Apr11_Hawk_10-12-03p,Temporal_Lobe_Alz_FX2P57_IMAC_187_26Apr11_Hawk_10-12-03p,Temporal_Lobe_Alz_FX2P57_IMAC_212_11May11_Hawk_11-04-02p
Peptide,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-.DIQM*TQSPSTLSASVGDR.V,26.731951,22.187643,3.022208,29.328345,29.916272,3.022208,27.586706,30.458361,29.00413,31.023004,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
-.DIQM*TQSPSTLSASVGDRVTITCR.A,3.022208,3.022208,3.022208,30.633308,30.815586,3.022208,29.483431,32.564995,30.319263,32.368436,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
-.DIQMTQSPSTLSASVGDR.V,26.387537,28.015792,3.022208,3.022208,31.403752,3.022208,27.695976,29.779972,29.747784,30.255299,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
-.DIQMTQSPSTLSASVGDRVTITCR.A,3.022208,3.022208,3.022208,3.022208,32.585511,3.022208,3.022208,30.845879,30.983525,27.526416,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208
-.EVQLVETGGGLIQPGGSLR.L,24.54622,3.022208,3.022208,3.022208,26.732727,3.022208,28.163126,23.836245,3.022208,28.436388,...,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208,3.022208


## Make train-test split

In [7]:
# df is indexed by peptide with one column per sample, so it is transposed to
# get one row per sample before splitting. 30% of the samples are held out,
# stratified by label, with a fixed seed for reproducibility.
train_df, test_df, train_labels, test_labels = train_test_split(
    df.T,
    labels,
    test_size=0.30,
    random_state=0,
    stratify=labels,
)

In [8]:
# Persist the held-out split next to the trained models.
# os.path.join(..., '') returns the folder name with a trailing separator for
# the current OS, so `finalized_model_folder + <file name>` keeps working.
finalized_model_folder = os.path.join('Trained_Models', '')
# Make sure the output folder exists before writing into it.
os.makedirs(finalized_model_folder, exist_ok=True)

# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes.
test_df_path = finalized_model_folder + 'test_df'
with open(test_df_path, 'wb') as test_df_file:
    joblib.dump(test_df, test_df_file)

test_labels_path = finalized_model_folder + 'test_labels'
with open(test_labels_path, 'wb') as test_labels_file:
    joblib.dump(test_labels, test_labels_file)

## Tune parameters of best models with Grid Search

In [9]:
# Number of cross-validation folds, passed as `cv` to the GridSearchCV calls in this notebook.
GRID_SEARCH_FOLDS = 8
# Passed as `n_jobs` to the GridSearchCV calls in this notebook.
N_JOBS = 1

from sklearn.ensemble import RandomForestClassifier
# Estimator handed to the RFE ('eliminate_features') step of each pipeline;
# the seed is fixed so feature elimination is repeatable.
estimator = RandomForestClassifier(random_state=0)

###  Logistic Regression grid search

In [20]:
from sklearn.linear_model import LogisticRegression

# Candidate hyper-parameter values. C_OPTIONS is also used by the SVC grid search.
C_OPTIONS = [.01, .1, 1, 10, 100, 1000]
SOLVERS = ['liblinear', 'sag', 'saga']

# Univariate selection (top 25%) -> PCA -> RFE -> classifier.
lr_steps = [
    ('select_features', SelectPercentile(percentile=25)),
    ('reduce_features', PCA()),
    ('eliminate_features', RFE(estimator=estimator)),
    ('classify', LogisticRegression(random_state=0)),
]
lr_pipe = Pipeline(lr_steps)

# Only the final classifier's parameters are searched.
lr_param_grid = dict(
    classify__solver=SOLVERS,
    classify__C=C_OPTIONS,
)

lr_grid = GridSearchCV(
    lr_pipe,
    param_grid=lr_param_grid,
    cv=GRID_SEARCH_FOLDS,
    n_jobs=N_JOBS,
)
lr_grid.fit(train_df, train_labels)

print('Best Logistic Regression parameters:\n', lr_grid.best_params_)
print('\nBest Cross-Validation score:\n', lr_grid.best_score_)







Best Logistic Regression parameters:
 {'classify__C': 0.01, 'classify__solver': 'liblinear'}

Best Cross-Validation score:
 0.9887005649717514


In [None]:
# Persist the fitted Logistic Regression grid search.
# os.path.join(..., '') returns the folder name with a trailing separator for
# the current OS, so plain string concatenation with a file name still works.
finalized_model_folder = os.path.join('Trained_Models', '')
os.makedirs(finalized_model_folder, exist_ok=True)

model_path = finalized_model_folder + 'lr_grid.sav'
# Context manager guarantees the file handle is flushed and closed.
with open(model_path, 'wb') as model_file:
    joblib.dump(lr_grid, model_file)

### Random Forest grid search

In [None]:
# Candidate hyper-parameter values for the Random Forest classifier.
# 'auto' is intentionally not listed: for RandomForestClassifier it is the
# same setting as 'sqrt' (so it only duplicated a third of the fits), and
# newer scikit-learn releases no longer accept it.
MAX_FEATURES = ['sqrt', 'log2']
MIN_SAMPLES_SPLIT = [2, 3, 4, 5, 10]
N_ESTIMATORS = [25, 50, 100, 200]

# Univariate selection (top 25%) -> PCA -> RFE -> classifier.
rf_pipe = Pipeline([
    ('reduce_dim', SelectPercentile(percentile=25)),
    ('reduce_features', PCA()),
    ('eliminate_features', RFE(estimator=estimator)),
    ('classify', RandomForestClassifier(random_state=0))])

# Only the final classifier's parameters are searched.
rf_param_grid = {
    'classify__n_estimators': N_ESTIMATORS,
    'classify__min_samples_split': MIN_SAMPLES_SPLIT,
    'classify__max_features': MAX_FEATURES
}

rf_grid = GridSearchCV(rf_pipe,
                       cv=GRID_SEARCH_FOLDS,
                       n_jobs=N_JOBS,
                       param_grid=rf_param_grid)

rf_grid.fit(train_df, train_labels)

print('Best Random Forest parameters:\n', rf_grid.best_params_)
print('\nBest Cross-Validation score:\n', rf_grid.best_score_)

In [None]:
# Persist the fitted Random Forest grid search.
model_path = finalized_model_folder + 'rf_grid.sav'
# Context manager guarantees the file handle is flushed and closed.
with open(model_path, 'wb') as model_file:
    joblib.dump(rf_grid, model_file)

### SVC grid search

In [None]:
from sklearn.svm import SVC

# Kernels to try; the C values come from C_OPTIONS defined for the
# Logistic Regression search.
KERNELS = ['linear', 'rbf', 'poly']

# Univariate selection (top 25%) -> PCA -> RFE -> classifier.
svc_steps = [
    ('reduce_dim', SelectPercentile(percentile=25)),
    ('reduce_features', PCA()),
    ('eliminate_features', RFE(estimator=estimator)),
    ('classify', SVC(probability=True, random_state=0)),
]
svc_pipe = Pipeline(svc_steps)

# Only the final classifier's parameters are searched.
svc_param_grid = dict(
    classify__kernel=KERNELS,
    classify__C=C_OPTIONS,
)

svc_grid = GridSearchCV(
    svc_pipe,
    param_grid=svc_param_grid,
    cv=GRID_SEARCH_FOLDS,
    n_jobs=N_JOBS,
)
svc_grid.fit(train_df, train_labels)

print('Best SVC parameters:\n', svc_grid.best_params_)
print('\nBest Cross-Validation score:\n', svc_grid.best_score_)

In [None]:
# Persist the fitted SVC grid search.
model_path = finalized_model_folder + 'svc_grid.sav'
# Context manager guarantees the file handle is flushed and closed.
with open(model_path, 'wb') as model_file:
    joblib.dump(svc_grid, model_file)

### KNN grid search

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# Neighbourhood sizes to try. (The KERNELS list that used to be re-declared
# here was a leftover from the SVC cell; KNN has no kernel parameter.)
N_NEIGHBORS = [1, 3, 5, 10, 20]

# Univariate selection (top 25%) -> PCA -> RFE -> classifier.
knn_pipe = Pipeline([
    ('reduce_dim', SelectPercentile(percentile=25)),
    ('reduce_features', PCA()),
    ('eliminate_features', RFE(estimator=estimator)),
    ('classify', KNeighborsClassifier())])

# Only the final classifier's parameter is searched.
knn_param_grid = {
    'classify__n_neighbors': N_NEIGHBORS
}

knn_grid = GridSearchCV(knn_pipe,
                        cv=GRID_SEARCH_FOLDS,
                        n_jobs=N_JOBS,
                        param_grid=knn_param_grid)

knn_grid.fit(train_df, train_labels)

print('Best KNN parameters:\n', knn_grid.best_params_)
print('\nBest Cross-Validation score:\n', knn_grid.best_score_)

Best KNN parameters:
 {'classify__n_neighbors': 1}

Best Cross-Validation score:
 0.966101694915


In [12]:
# Persist the fitted KNN grid search.
model_path = finalized_model_folder + 'knn_grid.sav'
# Context manager guarantees the file handle is flushed and closed.
with open(model_path, 'wb') as model_file:
    joblib.dump(knn_grid, model_file)

### Multinomial Naive Bayes grid search

In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

# Smoothing values to try.
ALPHAS = [1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

# Univariate selection (top 25%) -> PCA -> RFE -> scaling -> classifier.
mnb_steps = [
    ('reduce_dim', SelectPercentile(percentile=25)),
    ('reduce_features', PCA()),
    ('eliminate_features', RFE(estimator=estimator)),
    # MultinomialNB can't handle the negative values produced by PCA,
    # so the features are rescaled to [0, 1] first.
    ('scale', MinMaxScaler()),
    ('classify', MultinomialNB()),
]
mnb_pipe = Pipeline(mnb_steps)

# Only the final classifier's parameter is searched.
mnb_param_grid = dict(classify__alpha=ALPHAS)

mnb_grid = GridSearchCV(
    mnb_pipe,
    param_grid=mnb_param_grid,
    cv=GRID_SEARCH_FOLDS,
    n_jobs=N_JOBS,
)
mnb_grid.fit(train_df, train_labels)

print('Best Multinomial NB parameters:\n', mnb_grid.best_params_)
print('\nBest Cross-Validation score:\n', mnb_grid.best_score_)

Best Multinomial NB parameters:
 {'classify__alpha': 0.1}

Best Cross-Validation score:
 0.864406779661


In [14]:
# Persist the fitted Multinomial NB grid search.
model_path = finalized_model_folder + 'mnb_grid.sav'
# Context manager guarantees the file handle is flushed and closed.
with open(model_path, 'wb') as model_file:
    joblib.dump(mnb_grid, model_file)

### Gaussian Naive Bayes - no hyper-parameters to tune

In [None]:
from sklearn.naive_bayes import GaussianNB

# No grid search here: the pipeline is fitted directly on the training split.
# Univariate selection (top 25%) -> PCA -> RFE -> classifier.
gnb_steps = [
    ('reduce_dim', SelectPercentile(percentile=25)),
    ('reduce_features', PCA()),
    ('eliminate_features', RFE(estimator=estimator)),
    ('classify', GaussianNB()),
]
gnb_pipe = Pipeline(gnb_steps)

gnb_pipe.fit(train_df, train_labels)

### Gradient Boosting grid search

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Tree depths to try: 5, 8, 11, 14. N_ESTIMATORS and MIN_SAMPLES_SPLIT are
# the lists defined for the Random Forest search.
MAX_DEPTH = range(5, 16, 3)

# Univariate selection (top 25%) -> PCA -> RFE -> classifier.
gbc_steps = [
    ('reduce_dim', SelectPercentile(percentile=25)),
    ('reduce_features', PCA()),
    ('eliminate_features', RFE(estimator=estimator)),
    ('classify', GradientBoostingClassifier(random_state=0)),
]
gbc_pipe = Pipeline(gbc_steps)

# Only the final classifier's parameters are searched.
gbc_param_grid = dict(
    classify__n_estimators=N_ESTIMATORS,
    classify__min_samples_split=MIN_SAMPLES_SPLIT,
    classify__max_depth=MAX_DEPTH,
)

gbc_grid = GridSearchCV(
    gbc_pipe,
    param_grid=gbc_param_grid,
    cv=GRID_SEARCH_FOLDS,
    n_jobs=N_JOBS,
)
gbc_grid.fit(train_df, train_labels)

print('Best Gradient Boosting parameters:\n', gbc_grid.best_params_)
print('\nBest Cross-Validation score:\n', gbc_grid.best_score_)

In [None]:
# Persist the fitted Gradient Boosting grid search.
model_path = finalized_model_folder + 'gbc_grid.sav'
# Context manager guarantees the file handle is flushed and closed.
with open(model_path, 'wb') as model_file:
    joblib.dump(gbc_grid, model_file)

## Classify Test Set

In [17]:
# Predict and score the Multinomial NB grid search on the held-out test split.
mnb_grid_pred = mnb_grid.predict(test_df)
mnb_grid_result = mnb_grid.score(test_df, test_labels)
print(mnb_grid_result)

0.828947368421


In [15]:
def _predict_and_score(model):
    """Return `(predictions, score)` for `model` on the held-out test split.

    `predictions` is `model.predict(test_df)` and `score` is
    `model.score(test_df, test_labels)`, evaluated in that order.
    """
    predictions = model.predict(test_df)
    return predictions, model.score(test_df, test_labels)


# Every fitted model (grid searches plus the plain Gaussian NB pipeline) is
# evaluated on the same test split, in the same order as they were trained.
lr_grid_pred, lr_grid_result = _predict_and_score(lr_grid)
rf_grid_pred, rf_grid_result = _predict_and_score(rf_grid)
svc_grid_pred, svc_grid_result = _predict_and_score(svc_grid)
knn_grid_pred, knn_grid_result = _predict_and_score(knn_grid)
mnb_grid_pred, mnb_grid_result = _predict_and_score(mnb_grid)
gnb_grid_pred, gnb_grid_result = _predict_and_score(gnb_pipe)
gbc_grid_pred, gbc_grid_result = _predict_and_score(gbc_grid)


NameError: name 'lr_grid' is not defined

In [None]:
# Print each model's test-set score on its own line, in the order:
# LR, RF, SVC, KNN, Gaussian NB, Multinomial NB, Gradient Boosting.
test_scores = (
    lr_grid_result,
    rf_grid_result,
    svc_grid_result,
    knn_grid_result,
    gnb_grid_result,
    mnb_grid_result,
    gbc_grid_result,
)
for test_score in test_scores:
    print(test_score)

## Confusion matrices of the SVC model's predictions on the test set

In [None]:
# Compare the true test labels with the SVC grid search's predictions;
# `tissues` supplies the class names.
cu.show_confusion_matrices(test_labels, svc_grid_pred, tissues)