# Comparing substantia nigra and temporal lobe data

In [11]:
import os
import sys
# Put the parent directory on the import path (once) so that modules located
# there can be imported by the cells below.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [12]:
import Classification_Utils as cu
import MaxQuant_Postprocessing_Functions as mq
import numpy as np
from os import listdir
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.externals import joblib
import time

## Load and clean data

In [13]:
# Directory holding the input CSV files. The backslashes are escaped
# explicitly: the earlier literal relied on '\B' being an unrecognised escape,
# which Python reports as an invalid escape sequence. The resulting string
# value is unchanged.
BRAIN_DERIVATIVES_DIR = 'F:\\Brain_Derivatives\\'

files_dir = BRAIN_DERIVATIVES_DIR
# NOTE: listdir returns bare entry names (not full paths), in arbitrary order.
file_paths = listdir(files_dir)

# Build a single table from the files in files_dir (presumably one DataFrame
# merged across all CSVs -- see Classification_Utils.combine_csvs to confirm).
df = cu.combine_csvs(files_dir, file_paths)
# Drop the row whose index label is a lone newline -- presumably an artifact
# of the CSV export; TODO confirm. pandas raises KeyError if the label is absent.
df = df.drop(['\n'])

In [14]:
# Log2-transform the data. The return value is discarded, so the helper is
# presumably expected to modify df in place -- TODO confirm.
mq.log2_normalize(df)

# Replace missing values with half of the smallest value found anywhere in
# the table (column-wise minima first, then the minimum of those).
df_min = df.min(axis=0).min()
impute_val = df_min / 2
df = df.fillna(value=impute_val)

# Median-normalize; as above, the result is not reassigned, so this relies on
# the helper acting on df in place -- TODO confirm.
mq.median_normalize(df)

  df.iloc[:,:] = np.log2(df.iloc[:,:])


## Map each column to a corresponding label

In [19]:
# Tissue names used as class labels for this comparison.
tissues = [
    'Substantia_Nigra',
    'Temporal_Lobe',
]

# Lookup from tissue to DataFrame columns (presumably matched by column
# name -- see Classification_Utils.map_tissues_to_columns to confirm).
tissues_to_columns = cu.map_tissues_to_columns(df, tissues)

In [20]:
# Plain Python list of the DataFrame's column labels, in column order.
column_names = list(df.columns.values)
# One label per column, derived from the tissue-to-columns mapping.
labels = cu.get_labels(column_names, tissues_to_columns)

In [21]:
# Feature selection: replace df with the subset returned by
# cu.keep_percentile_features for the given labels and the value 25
# (presumably a percentile threshold, matching the 'PCA 25 Percentile' plot
# titles used later -- TODO confirm against Classification_Utils).
df = cu.keep_percentile_features(df, labels, 25)

## PCA plot, Pearson

In [22]:
# Output directory for generated figures. The earlier raw-string literal ended
# in '\\', and because escapes are not processed in raw strings the path ended
# with TWO backslashes. Escaped backslashes in a normal string give the single
# trailing separator used by the data directory path.
image_dir = 'D:\\Images\\Brain_Derivatives\\'

# Color assignment per column for the plots below. The meaning of the trailing
# 4 is not visible here -- TODO confirm against mq.map_colors.
column_to_color = mq.map_colors(tissues, tissues_to_columns, 4)

In [23]:
# Draw per-column boxplots of the current df, colored via column_to_color.
# image_dir and 'Median_normalized_boxplots' are presumably the output
# directory and the figure title/file name -- TODO confirm in mq.
mq.make_seaborn_boxplot(df, image_dir, 'Median_normalized_boxplots', column_to_color)

In [24]:
# Transpose so that each original column (the labelled units) becomes a row.
# NOTE(review): despite the name, no scaling is applied here -- only a transpose.
scaled_data = df.transpose()

# PCA with all components: fit, then project the same data onto the components.
pca = PCA().fit(scaled_data)
pca_data = pca.transform(scaled_data)

# Independent three-component PCA, used for the 3D plot.
pca_3 = PCA(n_components=3).fit(scaled_data)
pca_data_3 = pca_3.transform(scaled_data)

# Scree plots; the values they return are passed on to the PCA plots below.
per_var, pca_labels = mq.make_scree_plot(pca, image_dir)
per_var_3, pca_labels_3 = mq.make_scree_plot(pca_3, image_dir, '3 Dimensional Scree')

# 2D PCA plot.
mq.draw_pca_graph2(
    column_names,
    pca_data,
    image_dir,
    column_to_color,
    per_var,
    pca_labels,
    tissues,
    tissues_to_columns,
    'PCA 25 Percentile',
)

# 3D PCA plot built from the three-component fit.
mq.draw_3d_pca(
    column_names,
    pca_data_3,
    image_dir,
    column_to_color,
    per_var_3,
    pca_labels_3,
    tissues,
    tissues_to_columns,
    '3D PCA 25 Percentile',
)

  "matplotlib is currently using a non-GUI backend, "


## Classify

In [25]:
# Fold count handed to the cross-validation helper.
NUM_FOLDS = 6
# Transposed table: one row per original column, matching the per-column labels.
transformed_df = df.transpose()

In [26]:
# Cross-validate a logistic-regression classifier on the transposed data with
# NUM_FOLDS folds; the helper prints the per-fold scores and the mean accuracy.
# The value kept in lr is presumably the model object -- TODO confirm in
# Classification_Utils.
lr = cu.logistic_regression_model_crossval(transformed_df, labels, NUM_FOLDS)

Scores: [ 0.80952381  0.95238095  0.95238095  0.95238095  1.          0.95238095]
accuracy: 0.94 (+/- 0.12)


## Find distinguishing peptides