# Using unlabelled, unfractionated datasets obtained from QExact and VOrbi instruments
* Datasets were searched against H_sapiens_Uniprot_SPROT_2017-04-12, Tryp_Pig_Bov sequence files using MSGFPlus
* Combined results with MASIC results (q <= 0.01) to get quantitation data

In [None]:
import Classification_Utils as cu
import MaxQuant_Postprocessing_Functions as mq
import numpy as np
from os import listdir
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.externals import joblib
from sklearn import preprocessing
import time

## Load and combine data from all tissues
Directory contents: one tab-separated text file per tissue, containing abundance values for all datasets. The first column is named `Peptide`; every other column is a dataset name prefixed with its tissue (e.g. `Blood_Plasma_[dataset name]`).

In [None]:
# Folder holding the training-set tables (one file per tissue).
# Every backslash is written as '\\': a lone '\H' is not a valid escape
# sequence and triggers a DeprecationWarning/SyntaxWarning on current Pythons.
TRAIN_SET_DIR = 'F:\\High_Quality\\'

files_dir = TRAIN_SET_DIR
file_paths = listdir(files_dir)  # bare entry names, not full paths

# Merge every per-tissue file into a single table.
df = cu.combine_csvs(files_dir, file_paths)

In [None]:
# Drop rows for which every value is missing.
df.dropna(axis='index', how='all', inplace=True)

# Remove the row labelled '\n' (presumably an artefact of parsing the input
# files -- confirm). errors='ignore' stops the cell from raising KeyError when
# the combined table contains no such row.
df = df.drop(['\n'], errors='ignore')

# Snapshot taken before any normalisation; passed to cu.fit_new_data later on
# when the test set is prepared.
original_df = df.copy()

print(df.shape)

## Clean data
* Log2 transform
* Impute missing values
* Mean/Median normalize

In [None]:
# The return value is discarded, so this helper presumably transforms df in
# place -- confirm in MaxQuant_Postprocessing_Functions.
mq.log2_normalize(df)

# Fill missing entries with half of the smallest value currently in df
# (column minima first, then the minimum of those).
df_min = df.min().min()
impute_val = df_min/2
df = df.fillna(impute_val)

# NOTE(review): likewise assumed to normalise df in place -- confirm.
mq.median_normalize(df)

## Map each column to a corresponding label

In [None]:
# Tissue names; per the data description each dataset column name is prefixed
# with one of these.
tissues = [
    'Blood_Plasma',
    'Blood_Serum',
    'CSF',
    'Liver',
    'Monocyte',
    'Ovary',
    'Pancreas',
    'Substantia_Nigra',
    'Temporal_Lobe',
]

# Associate each tissue with the columns of df that belong to it.
tissues_to_columns = cu.map_tissues_to_columns(df, tissues)

In [None]:
# Column names of df as a plain Python list.
column_names = df.columns.values.tolist()
# Derive the label sequence used for training from the column names and the
# tissue-to-columns mapping (presumably one label per column, in column
# order -- confirm in cu.get_labels).
labels = cu.get_labels(column_names, tissues_to_columns)

In [None]:
# Preview the first rows of the cleaned table.
df.head()

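### Optional step to transform data (feature selection; `features_to_keep` from this cell is required later when the test set is fitted)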

In [None]:
# NOTE(review): 100 presumably keeps every feature, making the selection a
# no-op by default -- confirm in cu.keep_percentile_features.
percentile_to_keep = 100
df = cu.keep_percentile_features(df, labels, percentile_to_keep)
# Row labels of the retained features; handed to cu.fit_new_data when the test
# set is prepared.
features_to_keep = df.index.values.tolist()

print(df.shape)

## Train various classifiers, using cross-validation to produce an accuracy score

In [None]:
# The models are fitted on df with rows and columns swapped.
transformed_df = df.transpose()

# How many train/test splits each cross-validation run uses.
NUM_SPLITS = 100

### Decision Tree

In [None]:
# Decision-tree cross-validation over NUM_SPLITS splits; the result is kept in
# dt but not referenced again in the visible cells.
dt = cu.decisiontree_model_crossval(transformed_df, labels, NUM_SPLITS)

### KNN

In [None]:
# KNN cross-validation; the returned object is used later as a fitted model
# (knn.predict / knn.score on the test set).
knn = cu.knn_model_crossval(transformed_df, labels, NUM_SPLITS)

### Logistic Regression

In [None]:
# Logistic-regression cross-validation; the returned object is used later as a
# fitted model (lr.predict / lr.score on the test set).
lr = cu.logistic_regression_model_crossval(transformed_df, labels, NUM_SPLITS)

### Naive Bayes
* Gaussian
* Multinomial

In [None]:
# Gaussian naive Bayes cross-validation; the returned object is used later as
# a fitted model (gnb.predict / gnb.score on the test set).
gnb = cu.bayes_gaussian_model_crossval(transformed_df, labels, NUM_SPLITS)

In [None]:
# Multinomial naive Bayes cross-validation; the returned object is used later
# as a fitted model (mnb.predict / mnb.score on the test set).
mnb = cu.bayes_multinomial_model_crossval(transformed_df, labels, NUM_SPLITS)

### SVC variations

In [None]:
# Cross-validation of several SVC variants; the result is indexable and only
# its first entry (svc_models[0]) is evaluated on the test set later.
svc_models = cu.SVC_models_crossval(transformed_df, labels, NUM_SPLITS)

### Aggregations
* Random Forest
* Gradient Boosting

In [None]:
# Random-forest cross-validation; the returned object is used later as a
# fitted model (rf.predict / rf.score on the test set).
rf = cu.randomforest_model_crossval(transformed_df, labels, NUM_SPLITS)

In [None]:
# Gradient-boosting cross-validation; the returned object is used later as a
# fitted model (gbc.predict / gbc.score on the test set).
gbc = cu.gradient_boosting_crossval(transformed_df, labels, NUM_SPLITS)

## Tune parameters of best models with Grid Search

In [None]:
# Shared settings passed to every cu.*_grid_search call in this notebook.
# presumably the number of cross-validation folds -- confirm in cu
GRID_SEARCH_FOLDS = 8
# presumably the number of parallel workers -- confirm in cu
N_JOBS = 4

### Gradient Boosting grid search

In [None]:
# Build the grid-search object for gradient boosting.
gbc_grid = cu.gbc_grid_search(GRID_SEARCH_FOLDS, N_JOBS)

# Run the search on the training data.
gbc_grid.fit(transformed_df, labels)

# Report the best parameter combination found and its score.
print('Best Gradient Boosting parameters:\n', gbc_grid.best_params_)
print('\nBest Cross-Validation score:\n', gbc_grid.best_score_)

In [None]:
# NOTE(review): raw string, so the folder name ends in two literal
# backslashes; left unchanged.
finalized_model_folder = r'Trained_Models\\'
model_path = finalized_model_folder + 'gbc_grid.sav'

# Persist the fitted grid search. The context manager flushes and closes the
# file handle as soon as the dump finishes instead of leaving it open.
with open(model_path, 'wb') as model_file:
    joblib.dump(gbc_grid, model_file)

### Logistic Regression grid search

In [None]:
# Build the grid-search object for logistic regression.
lr_grid = cu.lr_grid_search(GRID_SEARCH_FOLDS, N_JOBS)

# Run the search on the training data.
lr_grid.fit(transformed_df, labels)

# Report the best parameter combination found and its score.
print('Best Logistic Regression parameters:\n', lr_grid.best_params_)
print('\nBest Cross-Validation score:\n', lr_grid.best_score_)

In [None]:
# NOTE(review): raw string, so the folder name ends in two literal
# backslashes; left unchanged.
finalized_model_folder = r'Trained_Models\\'
model_path = finalized_model_folder + 'lr_grid.sav'

# Persist the fitted grid search. The context manager flushes and closes the
# file handle as soon as the dump finishes instead of leaving it open.
with open(model_path, 'wb') as model_file:
    joblib.dump(lr_grid, model_file)

### Random Forest grid search

In [None]:
# Build the grid-search object for random forest.
rf_grid = cu.rf_grid_search(GRID_SEARCH_FOLDS, N_JOBS)

# Run the search on the training data.
rf_grid.fit(transformed_df, labels)

# Report the best parameter combination found and its score.
print('Best Random Forest parameters:\n', rf_grid.best_params_)
print('\nBest Cross-Validation score:\n', rf_grid.best_score_)

In [None]:
# NOTE(review): raw string, so the folder name ends in two literal
# backslashes; left unchanged.
finalized_model_folder = r'Trained_Models\\'
model_path = finalized_model_folder + 'rf_grid.sav'

# Persist the fitted grid search. The context manager flushes and closes the
# file handle as soon as the dump finishes instead of leaving it open.
with open(model_path, 'wb') as model_file:
    joblib.dump(rf_grid, model_file)

### SVC grid search

In [None]:
# Build the grid-search object for SVC.
svc_grid = cu.svc_grid_search(GRID_SEARCH_FOLDS, N_JOBS)

# Run the search on the training data.
svc_grid.fit(transformed_df, labels)

# Report the best parameter combination found and its score.
print('Best SVC parameters:\n', svc_grid.best_params_)
print('\nBest Cross-Validation score:\n', svc_grid.best_score_)

In [None]:
# NOTE(review): raw string, so the folder name ends in two literal
# backslashes; left unchanged.
finalized_model_folder = r'Trained_Models\\'
model_path = finalized_model_folder + 'svc_grid.sav'

# Persist the fitted grid search. The context manager flushes and closes the
# file handle as soon as the dump finishes instead of leaving it open.
with open(model_path, 'wb') as model_file:
    joblib.dump(svc_grid, model_file)

### KNN grid search

In [None]:
# Build the grid-search object for KNN.
knn_grid = cu.knn_grid_search(GRID_SEARCH_FOLDS, N_JOBS)

# Run the search on the training data.
knn_grid.fit(transformed_df, labels)

# Report the best parameter combination found and its score.
print('Best KNN parameters:\n', knn_grid.best_params_)
print('\nBest Cross-Validation score:\n', knn_grid.best_score_)

In [None]:
# NOTE(review): raw string, so the folder name ends in two literal
# backslashes; left unchanged.
finalized_model_folder = r'Trained_Models\\'
model_path = finalized_model_folder + 'knn_grid.sav'

# Persist the fitted grid search. The context manager flushes and closes the
# file handle as soon as the dump finishes instead of leaving it open.
with open(model_path, 'wb') as model_file:
    joblib.dump(knn_grid, model_file)

### Multinomial Naive Bayes grid search

In [None]:
# Build the grid-search object for multinomial naive Bayes.
mnb_grid = cu.mnb_grid_search(GRID_SEARCH_FOLDS, N_JOBS)

# Run the search on the training data.
mnb_grid.fit(transformed_df, labels)

# Report the best parameter combination found and its score.
print('Best Multinomial Naive Bayes parameters:\n', mnb_grid.best_params_)
print('\nBest Cross-Validation score:\n', mnb_grid.best_score_)

In [None]:
# NOTE(review): raw string, so the folder name ends in two literal
# backslashes; left unchanged.
finalized_model_folder = r'Trained_Models\\'
model_path = finalized_model_folder + 'mnb_grid.sav'

# Persist the fitted grid search. The context manager flushes and closes the
# file handle as soon as the dump finishes instead of leaving it open.
with open(model_path, 'wb') as model_file:
    joblib.dump(mnb_grid, model_file)

## Classify Test Set

### Load new data (test set)

In [None]:
# Folder holding the test-set tables.
# Every backslash is written as '\\': a lone '\T' is not a valid escape
# sequence and triggers a DeprecationWarning/SyntaxWarning on current Pythons.
TEST_SET_DIR = 'F:\\Test_Set\\'
test_paths = listdir(TEST_SET_DIR)  # bare entry names, not full paths

# Merge every file in the test folder into a single table.
test_data = cu.combine_csvs(TEST_SET_DIR, test_paths)

In [None]:
# Map test data columns to tissues, reusing the tissue list defined for the
# training set.
test_tissues_to_columns = cu.map_tissues_to_columns(test_data, tissues)
# Column names of the test table as a plain Python list.
test_column_names = test_data.columns.values.tolist()
# Expected labels for the test set, used below when scoring the models.
test_labels = cu.get_labels(test_column_names, test_tissues_to_columns)

In [None]:
# Prepare the test table for the trained models, using the pre-normalisation
# training snapshot (original_df) and the retained feature labels.
# NOTE(review): presumably applies the same cleaning as the training set and
# returns samples as rows -- confirm in cu.fit_new_data.
test_data = cu.fit_new_data(original_df, test_data, features_to_keep)
test_data.head()

### Use models from notebook to predict new data

In [None]:
# For each cross-validated model, keep the test-set predictions together with
# the value returned by .score() against the expected labels.
lr_pred, lr_result = lr.predict(test_data), lr.score(test_data, test_labels)
mnb_pred, mnb_result = mnb.predict(test_data), mnb.score(test_data, test_labels)
rf_pred, rf_result = rf.predict(test_data), rf.score(test_data, test_labels)

# Only the first entry of svc_models is evaluated.
svc_pred, svc_result = (
    svc_models[0].predict(test_data),
    svc_models[0].score(test_data, test_labels),
)

In [None]:
# One score per line: logistic regression, multinomial NB, random forest, SVC.
print(lr_result, mnb_result, rf_result, svc_result, sep='\n')

In [None]:
# Same evaluation for the remaining cross-validated models: test-set
# predictions plus the value returned by .score().
gbc_pred, gbc_result = gbc.predict(test_data), gbc.score(test_data, test_labels)
gnb_pred, gnb_result = gnb.predict(test_data), gnb.score(test_data, test_labels)
knn_pred, knn_result = knn.predict(test_data), knn.score(test_data, test_labels)

In [None]:
# One score per line: gradient boosting, Gaussian NB, KNN.
print(gbc_result, gnb_result, knn_result, sep='\n')

In [None]:
# Evaluate three of the grid-searched models on the test set: predictions plus
# the value returned by .score().
gbc_grid_pred, gbc_grid_result = (
    gbc_grid.predict(test_data),
    gbc_grid.score(test_data, test_labels),
)
rf_grid_pred, rf_grid_result = (
    rf_grid.predict(test_data),
    rf_grid.score(test_data, test_labels),
)
svc_grid_pred, svc_grid_result = (
    svc_grid.predict(test_data),
    svc_grid.score(test_data, test_labels),
)

In [None]:
# One score per line: grid-searched gradient boosting, random forest, SVC.
print(gbc_grid_result, rf_grid_result, svc_grid_result, sep='\n')

## Confusion matrices of the models' predictions on new data

In [None]:
# Every label that occurs in either the predictions or the expected labels.
# Sorted so the label order handed to the plot is the same on every run; a
# bare set of strings iterates in an order that can change between sessions.
cm_labels = sorted(set(lr_pred.tolist() + test_labels))

cu.show_confusion_matrices(test_labels, lr_pred, cm_labels, 'Logistic Regression ')