# Using unlabelled, unfractionated datasets obtained from QExact and VOrbi instruments
* Datasets were searched against H_sapiens_Uniprot_SPROT_2017-04-12, Tryp_Pig_Bov sequence files using MSGFPlus
* Combined results with MASIC results (q <= 0.01) to get quantitation data

In [None]:
import Classification_Utils as cu
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Load and combine data from all tissues

In [None]:
# Read the tab-separated quantitation table; the 'Peptide' column becomes the row index.
df = pd.read_csv(
    'FullPeptideQuant.txt',
    index_col='Peptide',
    sep='\t',
)
# Report the table dimensions as a quick sanity check on the load.
print(df.shape)

## Map each column to a corresponding label

In [None]:
# Tissue names that serve as the class labels for classification.
tissues = [
    'Blood_Plasma',
    'Blood_Serum',
    'CSF',
    'Liver',
    'Monocyte',
    'Ovary',
    'Pancreas',
    'Substantia_Nigra',
    'Temporal_Lobe',
]

# Associate each tissue with columns of df (the matching rule lives in
# Classification_Utils.map_tissues_to_columns -- presumably by column name; confirm there).
tissues_to_columns = cu.map_tissues_to_columns(df, tissues)

In [None]:
# Build the label sequence from the column names of df. It is later passed alongside
# df.T to train_test_split, so it must hold one entry per column.
column_names = df.columns.tolist()
labels = cu.get_labels(column_names, tissues_to_columns)

In [None]:
# Preview the first rows of the quantitation table (shown as the cell output).
df.head()

## Make train-test split

In [None]:
# Split the samples into a training set and a held-out test set.
# df is transposed first so that each row handed to the splitter is one column of df.
#   test_size=0.30   -> 30% of the data is held out in the test set
#   random_state=0   -> fixed seed, so the same train/test split occurs on every run
#   stratify=labels  -> the ratio of tissues is maintained in both sets
(train_df, test_df,
 train_labels, test_labels) = train_test_split(
    df.T,
    labels,
    stratify=labels,
    random_state=0,
    test_size=0.30,
)

## Train various classifiers, using cross-validation to produce an accuracy score

In [None]:
# Passed as the num_splits argument to every cu.*_crossval helper called in this notebook.
NUM_SPLITS = 100 # number of train/test splits in cross validation

In [None]:
### For demonstration purposes: classification_utils functions being referenced
# NOTE: the triple-quoted block below is a plain string, not executed code. It is an
# excerpt of helper functions from Classification_Utils, kept here for reference only:
# fit_model prints a cross-validated score and returns the model fitted on all the data
# it was given; knn_model_crossval applies it to a default KNeighborsClassifier.
"""
def fit_model(model, data, labels, num_splits, scoring):
    
    cv = StratifiedShuffleSplit(n_splits=num_splits, test_size=0.3, random_state=0)
    scores = cross_val_score(model, data, labels, cv=cv, scoring=scoring)
    
    print('%s: %0.2f (+/- %0.2f)' % (scoring, scores.mean(), scores.std() * 2))
    return model.fit(data, labels)
    
    
def knn_model_crossval(data, labels, num_splits, scoring='accuracy'):
    knn = KNeighborsClassifier()
    return fit_model(knn, data, labels, num_splits, scoring)
"""

### KNN

In [None]:
# Cross-validate a KNN classifier on the training split (the score is printed), then keep
# the model fitted on the whole training split -- see the fit_model excerpt shown earlier.
knn = cu.knn_model_crossval(train_df, train_labels, NUM_SPLITS)

### Logistic Regression

In [None]:
# Logistic regression via the Classification_Utils helper; presumably follows the same
# cross-validate-then-fit pattern as knn_model_crossval -- confirm in Classification_Utils.
lr = cu.logistic_regression_model_crossval(train_df, train_labels, NUM_SPLITS)

### Naive Bayes
* Gaussian
* Multinomial

In [None]:
# Gaussian Naive Bayes via the Classification_Utils helper; presumably follows the same
# cross-validate-then-fit pattern as knn_model_crossval -- confirm in Classification_Utils.
gnb = cu.bayes_gaussian_model_crossval(train_df, train_labels, NUM_SPLITS)

In [None]:
# Multinomial Naive Bayes via the Classification_Utils helper; presumably follows the same
# cross-validate-then-fit pattern as knn_model_crossval -- confirm in Classification_Utils.
mnb = cu.bayes_multinomial_model_crossval(train_df, train_labels, NUM_SPLITS)

### SVC 

In [None]:
# Support vector classifier via the Classification_Utils helper; presumably follows the same
# cross-validate-then-fit pattern as knn_model_crossval -- confirm in Classification_Utils.
svc = cu.SVC_model_crossval(train_df, train_labels, NUM_SPLITS)

In [None]:
# Evaluate the fitted SVC on the held-out test set: svc_result is the value returned by
# .score() (mean accuracy for scikit-learn classifiers), svc_pred the predicted labels.
svc_result = svc.score(test_df, test_labels)
svc_pred = svc.predict(test_df)

print(svc_result)

### Aggregations
* Random Forest
* Gradient Boosting

In [None]:
# Random forest via the Classification_Utils helper; presumably follows the same
# cross-validate-then-fit pattern as knn_model_crossval -- confirm in Classification_Utils.
rf = cu.randomforest_model_crossval(train_df, train_labels, NUM_SPLITS)

In [None]:
# Gradient boosting via the Classification_Utils helper; presumably follows the same
# cross-validate-then-fit pattern as knn_model_crossval -- confirm in Classification_Utils.
gbc = cu.gradient_boosting_crossval(train_df, train_labels, NUM_SPLITS)

## Classify Test Set

### Use models from notebook to predict new data

In [None]:
# Evaluate every fitted model on the held-out test set.
# For each model, <name>_pred holds the predicted labels and <name>_result the value
# returned by the model's .score() on the test set.
lr_pred, lr_result = lr.predict(test_df), lr.score(test_df, test_labels)

mnb_pred, mnb_result = mnb.predict(test_df), mnb.score(test_df, test_labels)

rf_pred, rf_result = rf.predict(test_df), rf.score(test_df, test_labels)

svc_pred, svc_result = svc.predict(test_df), svc.score(test_df, test_labels)

gbc_pred, gbc_result = gbc.predict(test_df), gbc.score(test_df, test_labels)

gnb_pred, gnb_result = gnb.predict(test_df), gnb.score(test_df, test_labels)

knn_pred, knn_result = knn.predict(test_df), knn.score(test_df, test_labels)

In [None]:
# Test-set scores, one per line, in the order:
# logistic regression, multinomial NB, random forest, SVC, gradient boosting, Gaussian NB, KNN.
print(
    lr_result,
    mnb_result,
    rf_result,
    svc_result,
    gbc_result,
    gnb_result,
    knn_result,
    sep='\n',
)

## Confusion matrices of the Gaussian Naive Bayes predictions on the held-out test set

In [None]:
# Axis labels for the confusion matrices: every tissue that appears in either the
# Gaussian NB predictions or the true test labels.
# sorted() pins the axis order -- iterating a bare set of strings can yield a different
# order on each run, which made the plots non-reproducible. Taking the union of two sets
# (instead of concatenating lists) also works if test_labels is not a plain list.
cm_labels = sorted(set(gnb_pred.tolist()) | set(test_labels))

cu.show_confusion_matrices(test_labels, gnb_pred, cm_labels)

## Classify Liver Cell Line Data

In [None]:
# Load the combined quantitation table and keep only the cell-line samples
# (columns whose name contains 'Cell_Line').
cell_line_df = pd.read_csv('TrainTestCellLineQuant.txt', sep='\t', index_col='Peptide')
cell_line_df = cell_line_df.filter(like='Cell_Line', axis=1)

# Transpose so each row is one sample, then select the training features in the same
# order the models were fitted on (raises KeyError if a training peptide is missing).
train_features = train_df.columns.values.tolist()
cell_line_df = cell_line_df.T[train_features]

# Every cell-line sample is expected to be classified as 'Liver'. The label count is
# derived from the data rather than hard-coded, so it always matches the number of
# samples passed to .score().
cell_line_labels = ['Liver'] * len(cell_line_df)
cell_line_df.shape

In [None]:
# Apply every model fitted on the tissue training split to the liver cell-line samples.
# For each model, <name>_cell_line_pred holds the predicted labels and
# <name>_cell_line_result the value returned by .score() against the all-'Liver' labels.
lr_cell_line_pred = lr.predict(cell_line_df)
lr_cell_line_result = lr.score(cell_line_df, cell_line_labels)

mnb_cell_line_pred = mnb.predict(cell_line_df)
mnb_cell_line_result = mnb.score(cell_line_df, cell_line_labels)

rf_cell_line_pred = rf.predict(cell_line_df)
rf_cell_line_result = rf.score(cell_line_df, cell_line_labels)

# Use the SVC fitted earlier in this notebook. The previous `svc = svc_models[0]`
# referenced a name that is never defined here and raised NameError.
svc_cell_line_pred = svc.predict(cell_line_df)
svc_cell_line_result = svc.score(cell_line_df, cell_line_labels)

gbc_cell_line_pred = gbc.predict(cell_line_df)
gbc_cell_line_result = gbc.score(cell_line_df, cell_line_labels)

gnb_cell_line_pred = gnb.predict(cell_line_df)
gnb_cell_line_result = gnb.score(cell_line_df, cell_line_labels)

knn_cell_line_pred = knn.predict(cell_line_df)
knn_cell_line_result = knn.score(cell_line_df, cell_line_labels)

In [None]:
# Cell-line scores, one per line, in the order:
# logistic regression, multinomial NB, random forest, SVC, gradient boosting, Gaussian NB, KNN.
print(
    lr_cell_line_result,
    mnb_cell_line_result,
    rf_cell_line_result,
    svc_cell_line_result,
    gbc_cell_line_result,
    gnb_cell_line_result,
    knn_cell_line_result,
    sep='\n',
)

## Confusion matrices of the Gaussian Naive Bayes cell line predictions

In [None]:
# Display the KNN predictions for the cell-line samples (shown as the cell output).
knn_cell_line_pred

In [None]:
# Axis labels for the confusion matrices: 'Liver' (the only true label) plus every
# tissue the Gaussian NB model predicted for the cell-line samples.
# sorted() pins the axis order -- iterating a bare set of strings can yield a different
# order on each run, which made the plots non-reproducible.
cellline_cm_labels = sorted({'Liver'} | set(gnb_cell_line_pred.tolist()))

cu.show_confusion_matrices(cell_line_labels, gnb_cell_line_pred, cellline_cm_labels)