In [8]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [9]:
import MaxQuant_Postprocessing_Functions as mq
import re

In [10]:
#########################
#
# Load and clean mouse data
#
#########################

file = "D:\proteinGroups.txt"

df = mq.load_df(file)
df = mq.clean_weakly_identified(df)
df = mq.remove_dup_proteinIDs(df)

iBAQ_df = mq.slice_by_column(df, 'protein', 'iBAQ ')

# Rename columns so that all instances "before" string are replaced with "after" string
def rename_columns(df, before, after):
    columns = df.columns.values.tolist()
    new_columns = []
    for column in columns:
        new_column = re.sub(before, after, column)
        new_columns.append(new_column)
        
    return new_columns

iBAQ_df.columns = rename_columns(iBAQ_df, 'Adult', 'Mouse')

groups = ['Brain', 'Heart', 'Kidney', 'Liver', 'Lung']
organ_columns = {} # 'Liver': ['iBAQ 04_Liver', 'iBAQ 05_Liver', ...]
organ_counts = {} # 'Liver': 
    
iBAQ_df = mq.filter_low_observed(iBAQ_df, groups, organ_columns, organ_counts)

In [11]:
#########################
#
# Load and clean peptide data
#
#########################
peptide_file = "D:\peptides.txt"

peptide_df = mq.load_df(peptide_file)
peptide_df = mq.slice_by_column(peptide_df, 'peptide', 'LFQ')
peptide_df.columns = rename_columns(peptide_df, 'Adult', 'Mouse')
peptide_df = mq.filter_low_observed(peptide_df, groups, {}, {})
mq.log2_normalize(peptide_df)
mq.median_normalize(peptide_df)

peptide_df.set_index('Sequence', inplace = True)
peptide_df = mq.impute_missing(peptide_df)
print(peptide_df.head())

  return lib.map_infer(x.asobject, func)


                        LFQ intensity Mouse_04_Liver  \
Sequence                                               
\nAAAAAAAAAAAAAAAGAAGK                      8.000505   
\nAAAAADLANR                               24.899619   
\nAAAADGEPLHNEEER                          22.192468   
\nAAAAEGARPLER                              8.000505   
\nAAAAGALAPGPLPDLAAR                        8.000505   

                        LFQ intensity Mouse_05_Liver  \
Sequence                                               
\nAAAAAAAAAAAAAAAGAAGK                      8.000505   
\nAAAAADLANR                               24.548848   
\nAAAADGEPLHNEEER                          21.916383   
\nAAAAEGARPLER                              8.000505   
\nAAAAGALAPGPLPDLAAR                        8.000505   

                        LFQ intensity Mouse_06_Liver  \
Sequence                                               
\nAAAAAAAAAAAAAAAGAAGK                      8.000505   
\nAAAAADLANR                               20.

In [12]:
#########################
#
# Normalize data and impute missing values with data frame minimum/2
#
#########################

mq.log2_normalize(iBAQ_df)
mq.median_normalize(iBAQ_df)

iBAQ_df['Majority protein IDs'] = iBAQ_df['Majority protein IDs'].str[:-6]
iBAQ_df.set_index('Majority protein IDs', inplace = True)
iBAQ_df = mq.impute_missing(iBAQ_df)

  return lib.map_infer(x.asobject, func)


In [13]:
#########################
#
# Map each column name to a corresponding label
#
#########################

"""
Input: dataframe
Output: List of strings representing the labels for each dataframe column
"""
def get_labels(df):
    columns = iBAQ_df.columns.values.tolist()
    labels = []

    for column in columns:
        key = next(key for key, value in organ_columns.items() if column in value)
        labels.append(key)
        
    return labels

In [14]:
iBAQ_df = iBAQ_df[organ_columns['Brain'] + organ_columns['Heart'] + organ_columns['Kidney'] + organ_columns['Liver'] + organ_columns['Lung']]

labels = get_labels(iBAQ_df)
print(iBAQ_df.columns.values.tolist())
print(labels)

['iBAQ Mouse_07_Brain', 'iBAQ Mouse_08_Brain', 'iBAQ Mouse_09_Brain', 'iBAQ Mouse_10_Brain', 'iBAQ Mouse_11_Brain', 'iBAQ Mouse_12_Brain', 'iBAQ Mouse_07_Heart', 'iBAQ Mouse_08_Heart', 'iBAQ Mouse_09_Heart', 'iBAQ Mouse_10_Heart', 'iBAQ Mouse_11_Heart', 'iBAQ Mouse_12_Heart', 'iBAQ Mouse_07_Kidney', 'iBAQ Mouse_08_Kidney', 'iBAQ Mouse_09_Kidney', 'iBAQ Mouse_10_Kidney', 'iBAQ Mouse_11_Kidney', 'iBAQ Mouse_12_Kidney', 'iBAQ Mouse_04_Liver', 'iBAQ Mouse_05_Liver', 'iBAQ Mouse_06_Liver', 'iBAQ Mouse_07_Liver', 'iBAQ Mouse_08_Liver', 'iBAQ Mouse_09_Liver', 'iBAQ Mouse_07_Lung', 'iBAQ Mouse_08_Lung', 'iBAQ Mouse_09_Lung', 'iBAQ Mouse_10_Lung', 'iBAQ Mouse_11_Lung', 'iBAQ Mouse_12_Lung']
['Brain', 'Brain', 'Brain', 'Brain', 'Brain', 'Brain', 'Heart', 'Heart', 'Heart', 'Heart', 'Heart', 'Heart', 'Kidney', 'Kidney', 'Kidney', 'Kidney', 'Kidney', 'Kidney', 'Liver', 'Liver', 'Liver', 'Liver', 'Liver', 'Liver', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung']


In [15]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn import preprocessing

# Transpose so that proteins are columns (components)
# Scale data
scaled_data = preprocessing.scale(iBAQ_df.T)

#########################
#
# Split data and labels into test and train groups
#
#########################

### Randomly split:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(iBAQ_df.T, labels, test_size=0.4, random_state=0, stratify=labels)

"""
X_train = np.concatenate([scaled_data[:4, :], scaled_data[6:10, :], scaled_data[12:16, :], scaled_data[18:22, :], scaled_data[24:28, :], scaled_data[30:34, :]], axis=0)

X_test = np.concatenate([scaled_data[4:6, :], scaled_data[10:12, :], scaled_data[16:18, :], scaled_data[22:24, :], scaled_data[28:30, :], scaled_data[34:, :]], axis=0)

y_train = labels[:4] + labels[6:10] + labels[12:16] + labels[18:22] + labels[24:28] + labels[30:34]
y_test = labels[4:6] + labels[10:12] + labels[16:18] + labels[22:24] + labels[28:30] + labels[34:]
"""
print(X_train.shape)
print(X_test.shape)

(18, 4399)
(12, 4399)




In [16]:
#########################
#
# Draw PCA graph of data
#
#########################

import pandas as pd
import matplotlib.pyplot as plt

base_dir = 'D:\\Images\\Classifier\\'
color_mapping = mq.map_colors(groups, organ_columns)
columns = iBAQ_df.columns.values.tolist()

pca, pca_data = mq.do_pca(iBAQ_df.copy())
per_var, labels = mq.make_scree_plot(pca, base_dir)
mq.draw_pca_graph(columns, pca_data, base_dir, color_mapping, per_var, labels)

In [17]:
pca = PCA(n_components=2)
pca.fit(X_train)
X_t_train = pca.transform(X_train)
X_t_test = pca.transform(X_test)

print(X_t_train.shape)
print(X_t_test.shape)
print(y_train)

(18, 2)
(12, 2)
['Lung', 'Liver', 'Heart', 'Heart', 'Kidney', 'Liver', 'Heart', 'Kidney', 'Kidney', 'Liver', 'Brain', 'Kidney', 'Brain', 'Brain', 'Brain', 'Lung', 'Lung', 'Heart']


## SVC

In [18]:
#########################
#
# Basic SVC Classification
#
#########################

from sklearn.metrics import accuracy_score

clf = SVC()
clf.fit(X_t_train, y_train)
y_pred = clf.predict(X_t_test)

print('score', accuracy_score(y_pred, y_test))
print('pred label', clf.predict(X_t_test))
print('actual', y_test)

score 0.166666666667
pred label ['Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart'
 'Heart' 'Heart' 'Heart']
actual ['Brain', 'Lung', 'Lung', 'Kidney', 'Heart', 'Liver', 'Liver', 'Heart', 'Liver', 'Kidney', 'Lung', 'Brain']


In [19]:
#########################
#
# SVC Variations
#
#########################


## K Neighbors

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_t_train, y_train)
y_pred = knn.predict(X_t_test)

print('score', accuracy_score(y_pred, y_test))
print('pred', y_pred)
print('actual', y_test)

score 1.0
pred ['Brain' 'Lung' 'Lung' 'Kidney' 'Heart' 'Liver' 'Liver' 'Heart' 'Liver'
 'Kidney' 'Lung' 'Brain']
actual ['Brain', 'Lung', 'Lung', 'Kidney', 'Heart', 'Liver', 'Liver', 'Heart', 'Liver', 'Kidney', 'Lung', 'Brain']


## Testing classification of human lung

In [22]:
#########################
#
# Load and clean human data
#
#########################

human_lung_file = r'F:\Human_Lung_Raw_Files\LungMAP\combined\txt\human_lung_proteinGroups.txt'
human_groups = ['Human_Lung']

human_lung_df = mq.load_df(human_lung_file)
human_lung_df = mq.clean_weakly_identified(human_lung_df)
human_lung_df = mq.remove_dup_proteinIDs(human_lung_df)
        
human_lung_iBAQ_df = mq.slice_by_column(human_lung_df, 'protein', 'iBAQ ') 
    
human_lung_organ_columns = {} # 'Liver': ['iBAQ 04_Liver', 'iBAQ 05_Liver', ...]
human_lung_organ_counts = {} # 'Liver': 
    
human_lung_iBAQ_df = mq.filter_low_observed(human_lung_iBAQ_df, human_groups, human_lung_organ_columns, human_lung_organ_counts)

human_lung_iBAQ_df['Majority protein IDs'] = human_lung_iBAQ_df['Majority protein IDs'].str[:-6]
mq.log2_normalize(human_lung_iBAQ_df)
mq.median_normalize(human_lung_iBAQ_df)

human_lung_iBAQ_df.set_index('Majority protein IDs', inplace = True)
human_lung_iBAQ_df = mq.impute_missing(human_lung_iBAQ_df)

  return lib.map_infer(x.asobject, func)


In [23]:
#########################
#
# Join mouse data to human data so that human_t_test has same shape as trained data
#
#########################

combined_df = iBAQ_df.join(human_lung_iBAQ_df)
combined_df = mq.impute_missing(combined_df)
all_columns = combined_df.columns.values.tolist()

combined_pca, combined_pca_data = mq.do_pca(combined_df)

# Add mapping of 'Human_Lung' to all human lung columns, and new organ group to groups
organ_columns.update(human_lung_organ_columns)
groups.append('Human_Lung')

# Draw scree plot of all human and mouse tissues
combined_dir = base_dir + 'Human_'
combined_color_mapping = mq.map_colors(groups, organ_columns)

combined_per_var, combined_labels = mq.make_scree_plot(combined_pca, combined_dir)
mq.draw_pca_graph(all_columns, combined_pca_data, combined_dir, combined_color_mapping, combined_per_var, combined_labels)

combined_df.drop(combined_df.columns[:30], axis=1, inplace=True) # Drop mouse data

In [24]:
#########################
#
# Make the prediction
#
#########################

scaled_human_data = preprocessing.scale(combined_df.T)
human_t_test = pca.transform(scaled_human_data)

human_pred = knn.predict(human_t_test)
print('pred', human_pred)

pred ['Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart'
 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart'
 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart'
 'Heart' 'Heart']




In [None]:
### Randomly select train and test groups

## Decision Tree

## Random Forest

## Naive Bayes/LDA