# Comparing blood plasma and blood serum with different numbers of samples

In [13]:
import os
import sys
# Put the parent of the working directory on the import path (once) so that
# modules located there can be imported by the cells that follow.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [14]:
import Classification_Utils as cu
import MaxQuant_Postprocessing_Functions as mq
import numpy as np
from os import listdir
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.externals import joblib
import time

## Load and clean data

In [15]:
# Directory whose files are combined into the training dataframe below.
# Every backslash is escaped explicitly so the literal contains no invalid
# escape sequence (an unescaped "\S" triggers a DeprecationWarning, and a
# SyntaxWarning on Python 3.12+). The resulting string is F:\Serum_And_Plasma\
BLOOD_DERIVATIVES_DIR = 'F:\\Serum_And_Plasma\\'

files_dir = BLOOD_DERIVATIVES_DIR
file_paths = listdir(files_dir)

# Merge every file found in the directory into a single dataframe
# (the merge logic lives in the project helper cu.combine_csvs).
df = cu.combine_csvs(files_dir, file_paths)
# Remove the row whose index label is a bare newline character.
df = df.drop(['\n'])

# Keep an untouched copy of the combined data; it is pickled later alongside
# the trained models.
original_df = df.copy()

In [16]:
# Log2 step (per the helper's name). The return value is discarded, so the
# helper presumably modifies df in place -- TODO confirm.
mq.log2_normalize(df)

# Fill missing values with half of the smallest value found anywhere in df.
# NOTE(review): this minimum is taken after the log2 step; if it is negative,
# min/2 is larger than the minimum rather than smaller -- confirm this is the
# intended imputation value.
df_min = df.min().min()
impute_val = df_min/2
df = df.fillna(impute_val)

# median normalize
# (return value discarded as well; presumably in place -- TODO confirm)
mq.median_normalize(df)

  df.iloc[:,:] = np.log2(df.iloc[:,:])


## Map each column to a corresponding label

In [17]:
# The two class names used throughout the notebook.
tissues = ['Blood_Plasma', 'Blood_Serum']
 
# Presumably maps each tissue name to the df columns that belong to it
# (result is later passed to cu.get_labels and mq.map_colors) -- confirm
# the exact structure against the helper.
tissues_to_columns = cu.map_tissues_to_columns(df, tissues)

In [18]:
# Column names of df as a plain Python list.
column_names = df.columns.values.tolist()
# Labels derived from the column names via tissues_to_columns; presumably one
# label per column, in column order -- confirm against cu.get_labels.
labels = cu.get_labels(column_names, tissues_to_columns)

In [19]:
# Feature selection through the project helper.
# NOTE(review): the 100 presumably means "keep the top 100 percent", i.e. no
# rows are actually removed -- confirm against cu.keep_percentile_features.
df = cu.keep_percentile_features(df, labels, 100)
# Index labels of the rows that remain; pickled later as the training
# feature list.
features_to_keep = df.index.values.tolist()

## PCA plot, Pearson

In [20]:
# Output location handed to the plotting helpers below.
# NOTE(review): because this is a raw string, the trailing '\\' is two literal
# backslashes, so the path ends in a doubled separator -- confirm intended.
image_dir = r'D:\Images\Blood_Derivatives\\'

# Colour lookup for the plots, built from the tissue -> columns mapping.
# The meaning of the third argument (4) is not visible here -- see mq.map_colors.
column_to_color = mq.map_colors(tissues, tissues_to_columns, 4)

In [21]:
# Box plot of the processed dataframe via the project helper; it receives the
# image directory, the string 'Median_normalized_boxplots' (presumably the
# title / file name -- confirm) and the colour lookup.
mq.make_seaborn_boxplot(df, image_dir, 'Median_normalized_boxplots', column_to_color)

In [23]:
# scikit-learn expects one row per observation, so the decomposition is run on
# the transpose of df.
scaled_data = df.T

# Unrestricted decomposition; fit() returns the fitted estimator itself.
pca = PCA().fit(scaled_data)
pca_data = pca.transform(scaled_data)  # coordinates in principal-component space

# Second decomposition limited to three components, used for the 3D plot.
pca_3 = PCA(n_components=3).fit(scaled_data)
pca_data_3 = pca_3.transform(scaled_data)

# Scree plots; each call hands back the values reused by the scatter plots.
per_var, pca_labels = mq.make_scree_plot(pca, image_dir)
per_var_3, pca_labels_3 = mq.make_scree_plot(
    pca_3, image_dir, '3 Dimensional Scree'
)

# 2D scatter of the samples.
mq.draw_pca_graph2(
    column_names, pca_data, image_dir, column_to_color,
    per_var, pca_labels, tissues, tissues_to_columns,
    'Blood Derivatives PCA',
)

# 3D scatter built from the three-component fit.
mq.draw_3d_pca(
    column_names, pca_data_3, image_dir, column_to_color,
    per_var_3, pca_labels_3, tissues, tissues_to_columns,
    'Blood Derivatives 3D PCA',
)

  "matplotlib is currently using a non-GUI backend, "


## Classify

In [11]:
# Fold count passed as the third argument to the cross-validation helper(s)
# below.
NUM_FOLDS = 100
# Transpose of df; this is the matrix handed to every classifier helper.
transformed_df = df.T

In [None]:
# Cross-validated k-nearest-neighbours model from the project helper.
# NUM_FOLDS is the fold count this notebook defines; NUM_SPLITS is never
# assigned anywhere and would raise NameError.
knn = cu.knn_model_crossval(transformed_df, labels, NUM_FOLDS)

In [12]:
# Cross-validated logistic-regression model from the project helper; the
# scores printed below this cell are the helper's output.
lr = cu.logistic_regression_model_crossval(transformed_df, labels, NUM_FOLDS)

Scores: [ 0.80952381  0.95238095  0.95238095  0.95238095  1.          0.95238095]
accuracy: 0.94 (+/- 0.12)


In [None]:
# Cross-validated Gaussian naive Bayes model from the project helper.
# NUM_FOLDS is the fold count this notebook defines; NUM_SPLITS is never
# assigned anywhere and would raise NameError.
gnb = cu.bayes_gaussian_model_crossval(transformed_df, labels, NUM_FOLDS)

In [None]:
# Cross-validated multinomial naive Bayes model from the project helper.
# NUM_FOLDS is the fold count this notebook defines; NUM_SPLITS is never
# assigned anywhere and would raise NameError.
mnb = cu.bayes_multinomial_model_crossval(transformed_df, labels, NUM_FOLDS)

In [None]:
# Cross-validated support-vector models from the project helper; the result is
# indexed later (svc_models[0]), so it is a sequence of models.
# NUM_FOLDS is the fold count this notebook defines; NUM_SPLITS is never
# assigned anywhere and would raise NameError.
svc_models = cu.SVC_models_crossval(transformed_df, labels, NUM_FOLDS)

In [None]:
# Cross-validated random-forest model from the project helper.
# NUM_FOLDS is the fold count this notebook defines; NUM_SPLITS is never
# assigned anywhere and would raise NameError.
rf = cu.randomforest_model_crossval(transformed_df, labels, NUM_FOLDS)

In [None]:
# Cross-validated gradient-boosting model from the project helper.
# NUM_FOLDS is the fold count this notebook defines; NUM_SPLITS is never
# assigned anywhere and would raise NameError.
gbc = cu.gradient_boosting_crossval(transformed_df, labels, NUM_FOLDS)

### Save Models, train data, train features

In [None]:
# Folder prefix (relative path) that the pickled models, feature list and
# training data are written under; file names are appended by plain string
# concatenation.
# NOTE(review): because this is a raw string, the trailing '\\' is two literal
# backslashes -- confirm the doubled separator is intended.
finalized_model_folder = r'Trained_Models\Plasma_Serum_Thresholding\\'

In [None]:
# Trained models paired position-by-position with the short name used in each
# output file name. Only the first entry of svc_models is persisted.
models = [knn, lr, gnb, mnb, svc_models[0], rf, gbc]
model_names = ['knn', 'lr', 'gnb', 'mnb', 'svc', 'rf', 'gbc']

for model, name in zip(models, model_names):
    # Build the path from the loop variable `name` (there is no `model_name`).
    model_path = finalized_model_folder + name + '50_samples.pkl'
    # The context manager guarantees the file handle is flushed and closed
    # even if pickling fails part-way through.
    with open(model_path, 'wb') as model_file:
        joblib.dump(model, model_file)
    

In [None]:
# Persist the list of retained feature names.
features_path = finalized_model_folder + 'train_features_50_samples.pkl'
with open(features_path, 'wb') as features_file:
    joblib.dump(features_to_keep, features_file)

# Persist the unprocessed training dataframe under its own file name; reusing
# the features file name here would overwrite the feature list written above.
data_path = finalized_model_folder + 'train_data_50_samples.pkl'
with open(data_path, 'wb') as data_file:
    joblib.dump(original_df, data_file)

## Load Test Data

In [None]:
# Directory holding the held-out test files. Backslashes are escaped
# explicitly so the literal contains no invalid escape sequence (an unescaped
# "\S" triggers a DeprecationWarning, and a SyntaxWarning on Python 3.12+).
# The resulting string is F:\Serum_Plasma_Testset\
TEST_SET_DIR = 'F:\\Serum_Plasma_Testset\\'

In [None]:
# Combine every file in the test directory into one dataframe.
test_paths = listdir(TEST_SET_DIR)
test_df = cu.combine_csvs(TEST_SET_DIR, test_paths)

# Derive the true label of each test column from its name: a column is
# labelled with the first tissue name it starts with.
# NOTE(review): a column matching no tissue contributes no label, which would
# leave test_labels shorter than the number of columns -- confirm all test
# columns carry a tissue prefix.
test_labels = []
for col in test_df.columns.values.tolist():
    for tissue in tissues:
        if col.startswith(tissue):
            test_labels.append(tissue)
            # Stop at the first match so a column is never labelled twice.
            break

## Test on Test Set

In [None]:
# Predict on the test set with every trained model and record each model's
# score against the true labels.
#
# NOTE(review): the models were fitted on df.T after the log2 step, imputation,
# median normalisation and feature selection, whereas test_df is passed here
# exactly as returned by cu.combine_csvs (no preprocessing, not transposed,
# not restricted to features_to_keep). Confirm the test data is brought into
# the same layout and feature space before trusting these scores.

# The SVC helper returns a sequence of models; use the first one, the same
# entry that is pickled as 'svc'.
svc = svc_models[0]

mnb_pred = mnb.predict(test_df)
mnb_result = mnb.score(test_df, test_labels)

lr_pred = lr.predict(test_df)
lr_result = lr.score(test_df, test_labels)

svc_pred = svc.predict(test_df)
svc_result = svc.score(test_df, test_labels)

gnb_pred = gnb.predict(test_df)
gnb_result = gnb.score(test_df, test_labels)

gbc_pred = gbc.predict(test_df)
gbc_result = gbc.score(test_df, test_labels)

rf_pred = rf.predict(test_df)
rf_result = rf.score(test_df, test_labels)

knn_pred = knn.predict(test_df)
knn_result = knn.score(test_df, test_labels)

In [None]:
# Show every model's test-set score, one per line, in the order:
# lr, mnb, rf, svc, gbc, gnb, knn.
print(
    lr_result,
    mnb_result,
    rf_result,
    svc_result,
    gbc_result,
    gnb_result,
    knn_result,
    sep='\n',
)

## Confusion Matrix

In [None]:
# Every label that occurs in either the predictions or the ground truth.
# Sorted so the label order handed to the helper is the same on every run;
# iterating a set of strings directly can yield a different order per session.
cm_labels = sorted(set(mnb_pred.tolist() + test_labels))
cu.show_confusion_matrices(test_labels, mnb_pred, cm_labels, 'Multinomial NB ')