In [1641]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [1642]:
import Classification_Utils as cu
import MaxQuant_Postprocessing_Functions as mq
import pandas as pd
import re
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

In [1643]:
%%html
<style>
  table {margin-left: 0 !important;}
</style>

# Classify mouse tissue using protein and peptide abundance data
* Use proteinGroups.txt or peptides.txt outputted from MaxQuant
* SVC variations
* K neighbors
* Decision tree
* Logistic Regression
* Naive Bayes (Gaussian and Multinomial)
* Gradient Boosting

** With train-test split, test_size = 0.4:**

Algorithm | Accuracy Score
:-----:|:-----:
SVC kernel = linear | 1.0
LinearSVC | 0.75 \*
SVC kernel = rbf | 0.167
SVC kernel = poly | 1.0
KNN | 1.0
Decision Tree | 0.67 \*

\* varies

In [1644]:
PLOT_PCA = False

## Load and clean mouse data

In [1645]:
file = "D:\proteinGroups.txt"

df = mq.load_df(file)
df = mq.clean_weakly_identified(df)
df = mq.remove_dup_proteinIDs(df)

iBAQ_df = mq.slice_by_column(df, 'protein', 'iBAQ ')

# Rename columns so that all instances "before" string are replaced with "after" string
def rename_columns(df, before, after):
    columns = df.columns.values.tolist()
    new_columns = []
    for column in columns:
        new_column = re.sub(before, after, column)
        new_columns.append(new_column)
        
    return new_columns

iBAQ_df.columns = rename_columns(iBAQ_df, 'Adult', 'Mouse')

groups = ['Brain', 'Heart', 'Kidney', 'Liver', 'Lung']
organ_columns = {} # 'Liver': ['iBAQ 04_Liver', 'iBAQ 05_Liver', ...]
organ_counts = {} # 'Liver': 
    
iBAQ_df = mq.filter_low_observed(iBAQ_df, groups, organ_columns, organ_counts)

### Function to replace low abundance values with 0

In [1646]:
### Replace low abundance values with 0
def replace_with_zero(x):
    min_val = 7
    
    if type(x) is str:
        return x
    
    elif pd.isnull(x) or x < min_val:
        return 0
    
    else:
        return x
    
#iBAQ_df.applymap(replace_with_zero)

## Normalize data and impute missing values with (data frame minimum/2)

In [1647]:
mq.log2_normalize(iBAQ_df)
mq.median_normalize(iBAQ_df)

iBAQ_df['Majority protein IDs'] = iBAQ_df['Majority protein IDs'].str[:-6] # strip off '_Mouse'
iBAQ_df.set_index('Majority protein IDs', inplace = True)

df_min = iBAQ_df.min().min()
impute_val = df_min/2
iBAQ_df = iBAQ_df.fillna(impute_val)

#iBAQ_df = mq.impute_missing(iBAQ_df) TODO change method to work whether df is indexed or not

  df.iloc[:,1:] = np.log2(df.iloc[:,1:])


## Map each column name to a corresponding label

In [1648]:
"""
Args: 
    df (dataframe)
    columns (list of strings): list of all column names in df
    organ_to_columns (dict): mapping of each organ to its column names {str: list of str}
    
Returns: 
    List of strings representing the labels for each dataframe column
"""
def get_labels(df, columns, organ_to_columns):
    labels = []

    for column in columns:
        key = next(key for key, value in organ_to_columns.items() if column in value)
        labels.append(key)
        
    return labels

In [1649]:
iBAQ_df = iBAQ_df[organ_columns['Brain'] + organ_columns['Heart'] + organ_columns['Kidney'] + organ_columns['Liver'] + organ_columns['Lung']]

columns = iBAQ_df.columns.values.tolist()
col_labels = get_labels(iBAQ_df, columns, organ_columns)

print(columns)
print(col_labels)

['iBAQ Mouse_07_Brain', 'iBAQ Mouse_08_Brain', 'iBAQ Mouse_09_Brain', 'iBAQ Mouse_10_Brain', 'iBAQ Mouse_11_Brain', 'iBAQ Mouse_12_Brain', 'iBAQ Mouse_07_Heart', 'iBAQ Mouse_08_Heart', 'iBAQ Mouse_09_Heart', 'iBAQ Mouse_10_Heart', 'iBAQ Mouse_11_Heart', 'iBAQ Mouse_12_Heart', 'iBAQ Mouse_07_Kidney', 'iBAQ Mouse_08_Kidney', 'iBAQ Mouse_09_Kidney', 'iBAQ Mouse_10_Kidney', 'iBAQ Mouse_11_Kidney', 'iBAQ Mouse_12_Kidney', 'iBAQ Mouse_04_Liver', 'iBAQ Mouse_05_Liver', 'iBAQ Mouse_06_Liver', 'iBAQ Mouse_07_Liver', 'iBAQ Mouse_08_Liver', 'iBAQ Mouse_09_Liver', 'iBAQ Mouse_07_Lung', 'iBAQ Mouse_08_Lung', 'iBAQ Mouse_09_Lung', 'iBAQ Mouse_10_Lung', 'iBAQ Mouse_11_Lung', 'iBAQ Mouse_12_Lung']
['Brain', 'Brain', 'Brain', 'Brain', 'Brain', 'Brain', 'Heart', 'Heart', 'Heart', 'Heart', 'Heart', 'Heart', 'Kidney', 'Kidney', 'Kidney', 'Kidney', 'Kidney', 'Kidney', 'Liver', 'Liver', 'Liver', 'Liver', 'Liver', 'Liver', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung']


## Split data and labels into test and train groups
* X_train and X_test represent raw subsets of the original dataframe
* X_t_train and X_t_test represent data transformed by PCA

In [1650]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn import preprocessing

# Transpose df so that proteins are columns (components)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(iBAQ_df.T, col_labels, test_size=0.5, random_state=0, stratify=col_labels)

print(X_train.shape)
print(X_test.shape)

(15, 4399)
(15, 4399)


### Threshold all data to simulate lower-quality data with fewer attributes

In [1651]:
iBAQ_t = iBAQ_df.T

iBAQ_oneprotein_df = iBAQ_t.drop(X_test.columns[list(range(num_rows-1))], axis=1)
iBAQ_sixteenth_percent_df = iBAQ_t.drop(X_test.columns[list(range(math.floor(num_rows*(1599/1600))))], axis=1)
iBAQ_eighthpercent_df = iBAQ_t.drop(X_test.columns[list(range(math.floor(num_rows*(799/800))))], axis=1)
iBAQ_quarterpercent_df = iBAQ_t.drop(X_test.columns[list(range(math.floor(num_rows*(399/400))))], axis=1)
iBAQ_halfpercent_df = iBAQ_t.drop(X_test.columns[list(range(math.floor(num_rows*(199/200))))], axis=1)
iBAQ_onepercent_df = iBAQ_t.drop(X_test.columns[list(range(math.floor(num_rows*(99/100))))], axis=1)
iBAQ_twentieth_df = iBAQ_t.drop(X_test.columns[list(range(math.floor(num_rows*(95/100))))], axis=1)
iBAQ_tenth_df = iBAQ_t.drop(X_test.columns[list(range(math.floor(num_rows*(9/10))))], axis=1)
iBAQ_quarter_df = iBAQ_t.drop(X_test.columns[list(range(math.floor(num_rows*(3/4))))], axis=1)
iBAQ_half_df = iBAQ_t.drop(X_test.columns[list(range(num_rows//2))], axis=1)
iBAQ_three_quarters_df = iBAQ_t.drop(X_test.columns[list(range(num_rows//4))], axis=1)
iBAQ_nine_tenths_df = iBAQ_t.drop(X_test.columns[list(range(num_rows//10))], axis=1)

In [1652]:
iBAQ_df = iBAQ_df
iBAQ_df.shape

(4399, 30)

### Threshold Test data to simulate lower-quality data with fewer attributes; classification on mixed-quality data

In [1654]:
from sklearn.feature_selection import SelectPercentile
import math

num_rows = X_test.shape[1]

oneprotein_df = X_test.drop(X_test.columns[list(range(num_rows-1))], axis=1)
sixteenth_percent_df = X_test.drop(X_test.columns[list(range(math.floor(num_rows*(1599/1600))))], axis=1)
eighth_percent_df = X_test.drop(X_test.columns[list(range(math.floor(num_rows*(799/800))))], axis=1)
quarter_percent_df = X_test.drop(X_test.columns[list(range(math.floor(num_rows*(399/400))))], axis=1)
half_percent_df = X_test.drop(X_test.columns[list(range(math.floor(num_rows*(199/200))))], axis=1)
one_percent_df = X_test.drop(X_test.columns[list(range(math.floor(num_rows*(99/100))))], axis=1)
twentieth_df = X_test.drop(X_test.columns[list(range(math.floor(num_rows*(95/100))))], axis=1)
tenth_df = X_test.drop(X_test.columns[list(range(math.floor(num_rows*(9/10))))], axis=1)
quarter_df = X_test.drop(X_test.columns[list(range(math.floor(num_rows*(3/4))))], axis=1)
half_df = X_test.drop(X_test.columns[list(range(num_rows//2))], axis=1)
three_quarters_df = X_test.drop(X_test.columns[list(range(num_rows//4))], axis=1)
nine_tenths_df = X_test.drop(X_test.columns[list(range(num_rows//10))], axis=1)

### By Percentile
#X_test = SelectPercentile(percentile=10).fit_transform(X_test, y_test)

### Randomly
X_test = oneprotein_df

### Pairwise ratios
kbest_df = cu.keep_k_best_features(iBAQ_df, col_labels, 50)
pairwise_df = cu.pairwise_transform(kbest_df)

print(X_test.shape)

(15, 1)


In [None]:
print(tenth_df.shape)
print(quarter_df.shape)
print(half_df.shape)
print(three_quarters_df.shape)
print(nine_tenths_df.shape)
print(num_rows)

In [1655]:
### Create "filler" array containing the imputed value to stand in for attributes that have been removed

total_attributes = X_train.shape[1]

filler_array = np.full((X_test.shape[0], total_attributes - X_test.shape[1]), impute_val)
filler_array.shape

(15, 4398)

In [1656]:
### Combine filler array to reduced X_test to match training data shape for classification

X_test = np.concatenate((X_test, filler_array), axis=1)
X_test.shape

(15, 4399)

### Threshold portion of training data to simulate training on a mix of high- and low- quality data. Test with cross-validation

In [1657]:
## Combine X_test back with X_train

thresholded_df = np.concatenate((X_train.T, X_test.T), axis=1)

thresholded_labels = y_train + y_test
thresholded_df = pd.DataFrame(thresholded_df, columns = thresholded_labels)

thresholded_df.set_index(iBAQ_df.index, inplace=True)

print(thresholded_df.shape)

(4399, 30)


## Draw PCA plots

In [None]:
import matplotlib.pyplot as plt

if PLOT_PCA:
    base_dir = 'D:\\Images\\Mouse_Data_Thresholding\\Original_'
    color_mapping = mq.map_colors(groups, organ_columns)
    columns = iBAQ_df.columns.values.tolist()

    whole_df_pca, whole_df_pca_data = mq.do_pca(iBAQ_df.copy(), 'protein', scale=False)
    per_var, labels = mq.make_scree_plot(whole_df_pca, base_dir)
    mq.draw_pca_graph(columns, whole_df_pca_data, base_dir, color_mapping, per_var, labels)

In [None]:
pca = PCA(n_components=2)
#X_train = preprocessing.scale(X_train)
pca.fit(X_train)
X_t_train = pca.transform(X_train)
X_t_test = pca.transform(X_test)

print(X_t_train.shape)
print(X_t_test.shape)
print(y_train)

In [None]:
print(mq.top_n_loading_scores(pca, iBAQ_df, 5))

## Classify

### SVC and Variations

In [None]:
#########################
#
# Basic SVC Classification with train-test split
#
#########################

from sklearn.metrics import accuracy_score

clf = SVC(C=1)
clf.fit(X_t_train, y_train)
y_pred = clf.predict(X_t_test)

print('score', accuracy_score(y_pred, y_test))
print('pred label', clf.predict(X_t_test))
print('actual', y_test)

In [1661]:
#########################
#
# Basic SVC Classification with cross-validation
#
#########################
clf2 = SVC(C=1)
#scores = cross_val_score(clf2, iBAQ_df, col_labels, cv=4)                 # Whole df
scores = cross_val_score(clf2, thresholded_df.T, thresholded_labels, cv=4)  # Thresholded
#scores = cross_val_score(clf2, pairwise_df.T, col_labels, cv=4)              # Pairwise
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 1.   0.6  0.4  0.4]
Accuracy: 0.60 (+/- 0.49)


In [None]:
from sklearn.svm import LinearSVC

#########################
#
# SVC variations with train-test split
#
#########################

def try_SVC_models(X_train, y_train, X_test, y_test):
    C = 1.0  # SVM regularization parameter
    models = (SVC(kernel='linear', C=C),
              LinearSVC(C=C),
              SVC(kernel='rbf', gamma=0.7, C=C),
              SVC(kernel='poly', degree=3, C=C))

    # Fit all the models
    models = (clf.fit(X_train, y_train) for clf in models)

    for model in models:
        model_y_pred = model.predict(X_test)
        print('\n*** Model: ', model, '\n')
        print('score', accuracy_score(model_y_pred, y_test))
        print('pred label', model_y_pred)
        print('actual', y_test)

In [None]:
try_SVC_models(X_t_train, y_train, X_t_test, y_test)

### K Neighbors

#### Train-test split 

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_t_train, y_train)
y_pred = knn.predict(X_t_test)

print('score', accuracy_score(y_pred, y_test))
print('pred', y_pred)
print('actual', y_test)

#### Cross-validation

In [1665]:
knn2 = KNeighborsClassifier()
#scores = cross_val_score(knn2, iBAQ_df, col_labels, cv=4)
scores = cross_val_score(knn2, thresholded_df.T, thresholded_labels, cv=4)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

knn2.fit(iBAQ_df.T, col_labels)

[ 0.2  0.4  0.2  0.4]
Accuracy: 0.30 (+/- 0.20)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

#### SKLearn Pipeline: Train-test split, PCA transformation

In [None]:
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([('scaling', StandardScaler()),
                 ('pca', PCA()),
                 ('knn', KNeighborsClassifier())])

pipe.fit(X_train, y_train)

#print(cross_val_score(pipe, iBAQ_df.T, col_labels))

pipe_pred = pipe.predict(X_test)
print('score', accuracy_score(pipe_pred, y_test))
print('pred', pipe_pred)
print('actual', y_test)

### Decision Tree

In [None]:
decision_tree_clf = tree.DecisionTreeClassifier()
decision_tree_clf = decision_tree_clf.fit(X_t_train, y_train)
dt_pred = decision_tree_clf.predict(X_t_test)

print('score', accuracy_score(dt_pred, y_test))
print('pred', dt_pred)
print('actual', y_test)

In [1708]:
dt2 = tree.DecisionTreeClassifier()
#scores= cross_val_score(dt2, iBAQ_df, col_labels, cv=4)
scores = cross_val_score(dt2, thresholded_df.T, thresholded_labels, cv=4)
print(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.8  0.7  0.4  0.8]
Accuracy: 0.68 (+/- 0.33)


### Logistic Regression

In [1709]:
#lr = cu.logistic_regression_model_crossval(iBAQ_df, col_labels, 4)
lr = cu.logistic_regression_model_crossval(thresholded_df.T, thresholded_labels, 4)

Scores: [ 1.   0.6  0.2  0.4]
accuracy: 0.55 (+/- 0.59)


### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb = gnb.fit(X_t_train, y_train)
gnb_pred = gnb.predict(X_t_test)

print('score', accuracy_score(gnb_pred, y_test))
print('pred', gnb_pred)
print('actual', y_test)

In [1710]:
#gnb2 = cu.bayes_gaussian_model_crossval(iBAQ_df, col_labels, 4)
gnb2 = cu.bayes_gaussian_model_crossval(thresholded_df.T, thresholded_labels, 4)

Scores: [ 0.8  0.5  0.2  0.2]
accuracy: 0.42 (+/- 0.50)


In [None]:
mnb = cu.bayes_multinomial_model_crossval(thresholded_df.T, thresholded_labels, 4)

### Gradient Boosting

In [None]:
gbc = cu.gradient_boosting_crossval(thresholded_df.T, thresholded_labels, 4)

In [None]:
gbc_grid = cu.gbc_grid_search(4, 1)

gbc_grid.fit(thresholded_df.T, thresholded_labels)

print('Best Gradient Boosting parameters:\n', gbc_grid.best_params_)
print('\nBest Cross-Validation score:\n', gbc_grid.best_score_)
#print('\nBest F1-score:\n', rf_grid.best_score_)

In [None]:
mlp_grid = cu.mlp_grid_search(4, 1)

mlp_grid.fit(thresholded_df.T, thresholded_labels)

print('Best MLPClassifier parameters:\n', mlp_grid.best_params_)
print('\nBest Cross-Validation score:\n', mlp_grid.best_score_)

# Classify using Peptide Data
* Use peptides.txt output from MaxQuant
* SVC varations
* K nearest neighbors
* Decision tree

** With train-test split, test_size = 0.4: **

Algorithm | Accuracy Score
:-----:|:-----:
SVC kernel = linear | 1.0
LinearSVC | 1.0 \*
SVC kernel = rbf | 0.167
SVC kernel = poly | 0.917
KNN | 1.0
Decision Tree | 1.0 \*

\* varies

## Load and clean peptide data

In [None]:
peptide_file = "D:\peptides.txt"

peptide_df = mq.load_df(peptide_file)
peptide_df = mq.slice_by_column(peptide_df, 'peptide', 'LFQ')
peptide_df.columns = rename_columns(peptide_df, 'Adult', 'Mouse')

peptide_organ_columns = {}
peptide_organ_counts = {}
peptide_df = mq.filter_low_observed(peptide_df, groups, peptide_organ_columns, peptide_organ_counts)
mq.log2_normalize(peptide_df)
mq.median_normalize(peptide_df)

peptide_df = peptide_df.replace(r'\n','', regex=True)
peptide_df.set_index('Sequence', inplace = True)
peptide_df = mq.impute_missing(peptide_df)

peptide_columns = peptide_df.columns.values.tolist()
peptide_labels = get_labels(peptide_df, peptide_columns, peptide_organ_columns)
print(peptide_labels)

## Split data and labels into test and train groups

In [None]:
scaled_peptide_data = preprocessing.scale(peptide_df.T)

### Randomly split:
peptide_X_train, peptide_X_test, peptide_y_train, peptide_y_test = cross_validation.train_test_split(peptide_df.T, peptide_labels, test_size=0.4, random_state=0, stratify=peptide_labels)

peptide_pca = PCA(n_components=4)
peptide_pca.fit(peptide_X_train)
peptide_X_t_train = peptide_pca.transform(peptide_X_train)
peptide_X_t_test = peptide_pca.transform(peptide_X_test)

print(peptide_X_t_train.shape)
print(peptide_X_t_test.shape)
print(peptide_y_train)

## Draw PCA plots for peptide data

In [None]:
peptide_dir = base_dir + 'Mouse_Peptide_'
peptide_color_mapping = mq.map_colors(groups, peptide_organ_columns)
columns = peptide_df.columns.values.tolist()

peptide_pca, peptide_pca_data = mq.do_pca(peptide_df.copy(), scale=False)
peptide_per_var, peptide_labels = mq.make_scree_plot(peptide_pca, peptide_dir)
mq.draw_pca_graph(columns, peptide_pca_data, peptide_dir, peptide_color_mapping, peptide_per_var, peptide_labels)


## SVC

In [None]:
peptide_clf = SVC()
peptide_clf.fit(peptide_X_t_train, peptide_y_train)
peptide_y_pred = peptide_clf.predict(peptide_X_t_test)

print('score', accuracy_score(peptide_y_pred, peptide_y_test))
print('pred label', peptide_y_pred)
print('actual', peptide_y_test)

In [None]:
try_SVC_models(peptide_X_t_train, peptide_y_train, peptide_X_t_test, peptide_y_test)

## K Nearest Neighbors

In [None]:
peptide_knn = KNeighborsClassifier()
peptide_knn.fit(peptide_X_t_train, peptide_y_train)
peptide_y_pred = peptide_knn.predict(peptide_X_t_test)

print('score', accuracy_score(peptide_y_pred, peptide_y_test))
print('pred', peptide_y_pred)
print('actual', peptide_y_test)

## Decision Tree

In [None]:
peptide_dt_clf = tree.DecisionTreeClassifier()
peptide_dt_clf = peptide_dt_clf.fit(peptide_X_t_train, peptide_y_train)
peptide_dt_pred = peptide_dt_clf.predict(peptide_X_t_test)

print('score', accuracy_score(peptide_dt_pred, peptide_y_test))
print('pred', peptide_dt_pred)
print('actual', peptide_y_test)

In [None]:
### Confusion matrix shows which labels are being misclassified
from sklearn.metrics import confusion_matrix
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compute confusion matrix
cnf_matrix = confusion_matrix(peptide_y_test, peptide_dt_pred)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=groups,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=groups, normalize=True,
                      title='Normalized confusion matrix')

plt.show()