In [514]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [515]:
import MaxQuant_Postprocessing_Functions as mq
import re


# Classify using Protein Data

In [516]:
#########################
#
# Load and clean mouse data
#
#########################

file = "D:\proteinGroups.txt"

df = mq.load_df(file)
df = mq.clean_weakly_identified(df)
df = mq.remove_dup_proteinIDs(df)

iBAQ_df = mq.slice_by_column(df, 'protein', 'iBAQ ')

# Rename columns so that all instances "before" string are replaced with "after" string
def rename_columns(df, before, after):
    columns = df.columns.values.tolist()
    new_columns = []
    for column in columns:
        new_column = re.sub(before, after, column)
        new_columns.append(new_column)
        
    return new_columns

iBAQ_df.columns = rename_columns(iBAQ_df, 'Adult', 'Mouse')

groups = ['Brain', 'Heart', 'Kidney', 'Liver', 'Lung']
organ_columns = {} # 'Liver': ['iBAQ 04_Liver', 'iBAQ 05_Liver', ...]
organ_counts = {} # 'Liver': 
    
iBAQ_df = mq.filter_low_observed(iBAQ_df, groups, organ_columns, organ_counts)

In [517]:
#########################
#
# Normalize data and impute missing values with data frame minimum/2
#
#########################

mq.log2_normalize(iBAQ_df)
mq.median_normalize(iBAQ_df)

iBAQ_df['Majority protein IDs'] = iBAQ_df['Majority protein IDs'].str[:-6] # strip off '_Mouse'
iBAQ_df.set_index('Majority protein IDs', inplace = True)
iBAQ_df = mq.impute_missing(iBAQ_df)

  return lib.map_infer(x.asobject, func)


In [518]:
#########################
#
# Map each column name to a corresponding label
#
#########################

"""
Input: dataframe
Output: List of strings representing the labels for each dataframe column
"""
def get_labels(df):
    columns = iBAQ_df.columns.values.tolist()
    labels = []

    for column in columns:
        key = next(key for key, value in organ_columns.items() if column in value)
        labels.append(key)
        
    return labels

In [519]:
iBAQ_df = iBAQ_df[organ_columns['Brain'] + organ_columns['Heart'] + organ_columns['Kidney'] + organ_columns['Liver'] + organ_columns['Lung']]

labels = get_labels(iBAQ_df)
print(iBAQ_df.columns.values.tolist())
print(labels)

['iBAQ Mouse_07_Brain', 'iBAQ Mouse_08_Brain', 'iBAQ Mouse_09_Brain', 'iBAQ Mouse_10_Brain', 'iBAQ Mouse_11_Brain', 'iBAQ Mouse_12_Brain', 'iBAQ Mouse_07_Heart', 'iBAQ Mouse_08_Heart', 'iBAQ Mouse_09_Heart', 'iBAQ Mouse_10_Heart', 'iBAQ Mouse_11_Heart', 'iBAQ Mouse_12_Heart', 'iBAQ Mouse_07_Kidney', 'iBAQ Mouse_08_Kidney', 'iBAQ Mouse_09_Kidney', 'iBAQ Mouse_10_Kidney', 'iBAQ Mouse_11_Kidney', 'iBAQ Mouse_12_Kidney', 'iBAQ Mouse_04_Liver', 'iBAQ Mouse_05_Liver', 'iBAQ Mouse_06_Liver', 'iBAQ Mouse_07_Liver', 'iBAQ Mouse_08_Liver', 'iBAQ Mouse_09_Liver', 'iBAQ Mouse_07_Lung', 'iBAQ Mouse_08_Lung', 'iBAQ Mouse_09_Lung', 'iBAQ Mouse_10_Lung', 'iBAQ Mouse_11_Lung', 'iBAQ Mouse_12_Lung']
['Brain', 'Brain', 'Brain', 'Brain', 'Brain', 'Brain', 'Heart', 'Heart', 'Heart', 'Heart', 'Heart', 'Heart', 'Kidney', 'Kidney', 'Kidney', 'Kidney', 'Kidney', 'Kidney', 'Liver', 'Liver', 'Liver', 'Liver', 'Liver', 'Liver', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung']


In [520]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn import preprocessing

# Transpose so that proteins are columns (components)
# Scale data
scaled_data = preprocessing.scale(iBAQ_df.T)

#########################
#
# Split data and labels into test and train groups
#
#########################

### Randomly split:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(iBAQ_df.T, labels, test_size=0.4, random_state=0, stratify=labels)

"""
X_train = np.concatenate([scaled_data[:4, :], scaled_data[6:10, :], scaled_data[12:16, :], scaled_data[18:22, :], scaled_data[24:28, :], scaled_data[30:34, :]], axis=0)

X_test = np.concatenate([scaled_data[4:6, :], scaled_data[10:12, :], scaled_data[16:18, :], scaled_data[22:24, :], scaled_data[28:30, :], scaled_data[34:, :]], axis=0)

y_train = labels[:4] + labels[6:10] + labels[12:16] + labels[18:22] + labels[24:28] + labels[30:34]
y_test = labels[4:6] + labels[10:12] + labels[16:18] + labels[22:24] + labels[28:30] + labels[34:]
"""
print(X_train.shape)
print(X_test.shape)

(18, 4399)
(12, 4399)


In [521]:
#########################
#
# Draw PCA graph of data
#
#########################

import pandas as pd
import matplotlib.pyplot as plt

base_dir = 'D:\\Images\\Classifier\\'
color_mapping = mq.map_colors(groups, organ_columns)
columns = iBAQ_df.columns.values.tolist()

pca, pca_data = mq.do_pca(iBAQ_df.copy())
per_var, labels = mq.make_scree_plot(pca, base_dir)
mq.draw_pca_graph(columns, pca_data, base_dir, color_mapping, per_var, labels)

In [522]:
pca = PCA(n_components=2)
pca.fit(X_train)
X_t_train = pca.transform(X_train)
X_t_test = pca.transform(X_test)

print(X_t_train.shape)
print(X_t_test.shape)
print(y_train)

(18, 2)
(12, 2)
['Lung', 'Liver', 'Heart', 'Heart', 'Kidney', 'Liver', 'Heart', 'Kidney', 'Kidney', 'Liver', 'Brain', 'Kidney', 'Brain', 'Brain', 'Brain', 'Lung', 'Lung', 'Heart']


## SVC

In [523]:
#########################
#
# Basic SVC Classification
#
#########################

from sklearn.metrics import accuracy_score

clf = SVC()
clf.fit(X_t_train, y_train)
y_pred = clf.predict(X_t_test)

print('score', accuracy_score(y_pred, y_test))
print('pred label', clf.predict(X_t_test))
print('actual', y_test)

score 0.166666666667
pred label ['Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart'
 'Heart' 'Heart' 'Heart']
actual ['Brain', 'Lung', 'Lung', 'Kidney', 'Heart', 'Liver', 'Liver', 'Heart', 'Liver', 'Kidney', 'Lung', 'Brain']


In [524]:
from sklearn.svm import LinearSVC

#########################
#
# SVC variations
#
#########################

def try_SVC_models(X_train, y_train, X_test, y_test):
    C = 1.0  # SVM regularization parameter
    models = (SVC(kernel='linear', C=C),
              LinearSVC(C=C),
              SVC(kernel='rbf', gamma=0.7, C=C),
              SVC(kernel='poly', degree=3, C=C))

    # Fit all the models
    models = (clf.fit(X_train, y_train) for clf in models)

    for model in models:
        model_y_pred = model.predict(X_test)
        print('\n*** Model: ', model, '\n')
        print('score', accuracy_score(model_y_pred, y_test))
        print('pred label', model_y_pred)
        print('actual', y_test)

In [525]:
try_SVC_models(X_t_train, y_train, X_t_test, y_test)


*** Model:  SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

score 1.0
pred label ['Brain' 'Lung' 'Lung' 'Kidney' 'Heart' 'Liver' 'Liver' 'Heart' 'Liver'
 'Kidney' 'Lung' 'Brain']
actual ['Brain', 'Lung', 'Lung', 'Kidney', 'Heart', 'Liver', 'Liver', 'Heart', 'Liver', 'Kidney', 'Lung', 'Brain']

*** Model:  LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0) 

score 0.583333333333
pred label ['Brain' 'Lung' 'Lung' 'Lung' 'Heart' 'Lung' 'Lung' 'Heart' 'Lung' 'Lung'
 'Lung' 'Brain']
actual ['Brain', 'Lung', 'Lung', 'Kidney', 'Heart', 'Liver', 'Liver', 'Heart', 'Liver', 'Kidney', 'Lung', 'Brain']

*** Model:  SVC(C=1.0, cache_size=200, class_weight=No

## K Neighbors

In [526]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_t_train, y_train)
y_pred = knn.predict(X_t_test)

print('score', accuracy_score(y_pred, y_test))
print('pred', y_pred)
print('actual', y_test)

score 1.0
pred ['Brain' 'Lung' 'Lung' 'Kidney' 'Heart' 'Liver' 'Liver' 'Heart' 'Liver'
 'Kidney' 'Lung' 'Brain']
actual ['Brain', 'Lung', 'Lung', 'Kidney', 'Heart', 'Liver', 'Liver', 'Heart', 'Liver', 'Kidney', 'Lung', 'Brain']


## Decision Tree

In [527]:
from sklearn import tree

decision_tree_clf = tree.DecisionTreeClassifier()
decision_tree_clf = decision_tree_clf.fit(X_t_train, y_train)
dt_pred = decision_tree_clf.predict(X_t_test)

print('score', accuracy_score(dt_pred, y_test))
print('pred', dt_pred)
print('actual', y_test)

score 0.583333333333
pred ['Brain' 'Lung' 'Lung' 'Kidney' 'Kidney' 'Kidney' 'Liver' 'Kidney' 'Kidney'
 'Kidney' 'Kidney' 'Brain']
actual ['Brain', 'Lung', 'Lung', 'Kidney', 'Heart', 'Liver', 'Liver', 'Heart', 'Liver', 'Kidney', 'Lung', 'Brain']


# Testing classification of human lung
* TODO: map human and mouse proteins

In [528]:
#########################
#
# Map mouse proteinIDs to corresponding human ID
#
#########################

### Load correspondance data
import pandas as pd

mapping_file = r'D:\Human_Mouse_Mapping.txt'
mapping_df = pd.read_csv(mapping_file, usecols=['Matched Term', 'Symbol', 'Species'], sep='\t', lineterminator='\r', encoding = 'latin1')
mapping_df = mapping_df.replace(r'\n','', regex=True)

### Filter out entries not containing human in the "Species" column
mapping_df = mapping_df[mapping_df['Species'].isnull() | mapping_df['Species'].str.contains('Human')]
mapping_df.set_index('Matched Term', inplace=True)
print(mapping_df.head())

             Symbol          Species
Matched Term                        
ABL1_MOUSE     ABL1  Human,Mouse,Rat
AKT1_MOUSE     AKT1  Human,Mouse,Rat
HEM2_MOUSE     ALAD  Human,Mouse,Rat
HEM0_MOUSE    ALAS2  Human,Mouse,Rat
A4_MOUSE        APP  Human,Mouse,Rat


In [529]:
#########################
#
# Change mouse proteinIDs to common symbol
#
#########################

mouse_proteins = iBAQ_df.index.values.tolist()
mappings = mapping_df.to_dict('index')

iBAQ_df.reset_index(inplace=True)

for protein in mouse_proteins:
    to_replace = protein + '_MOUSE'
    if to_replace in mappings:
        mapping = mappings[to_replace]
        new_sym = mapping['Symbol']
        #print('Found one!', to_replace, new_sym)
        iBAQ_df.replace(protein, new_sym, inplace=True)
        
iBAQ_df.set_index('Majority protein IDs', inplace=True)
        

Found one! 1433B_MOUSE YWHAB
Found one! 1433E_MOUSE YWHAE
Found one! 1433F_MOUSE YWHAH
Found one! 1433G_MOUSE YWHAG
Found one! 1433S_MOUSE SFN
Found one! 1433T_MOUSE YWHAQ
Found one! 1433Z_MOUSE YWHAZ
Found one! 2A5A_MOUSE PPP2R5A
Found one! 2A5E_MOUSE PPP2R5E
Found one! 2AAA_MOUSE PPP2R1A
Found one! 2ABA_MOUSE PPP2R2A
Found one! 2ABD_MOUSE PPP2R2D
Found one! 2ABG_MOUSE PPP2R2C
Found one! 3BHS7_MOUSE HSD3B7
Found one! 3HAO_MOUSE HAAO
Found one! 3HIDH_MOUSE HIBADH
Found one! 41_MOUSE EPB41
Found one! 4F2_MOUSE SLC3A2
Found one! 5NT3A_MOUSE NT5C3A
Found one! 5NT3B_MOUSE NT5C3B
Found one! 5NTC_MOUSE NT5C2
Found one! 5NTD_MOUSE NT5E
Found one! 6PGD_MOUSE PGD
Found one! 6PGL_MOUSE PGLS
Found one! A16A1_MOUSE ALDH16A1
Found one! A16L1_MOUSE ATG16L1
Found one! A1CF_MOUSE A1CF
Found one! A2AP_MOUSE SERPINF2
Found one! A4_MOUSE APP
Found one! AAAD_MOUSE AADAC
Found one! AACS_MOUSE AACS
Found one! AADAT_MOUSE AADAT
Found one! AAK1_MOUSE AAK1
Found one! AAKB1_MOUSE PRKAB1
Found one! AAKG1_MOUSE P

Found one! BODG_MOUSE BBOX1
Found one! BOLA1_MOUSE BOLA1
Found one! BOLA3_MOUSE BOLA3
Found one! BORC6_MOUSE BORCS6
Found one! BORG4_MOUSE CDC42EP4
Found one! BORG5_MOUSE CDC42EP1
Found one! BPHL_MOUSE BPHL
Found one! BPNT1_MOUSE BPNT1
Found one! BRAF_MOUSE BRAF
Found one! BRE_MOUSE BRE
Found one! BRE1A_MOUSE RNF20
Found one! BRK1_MOUSE BRK1
Found one! BROX_MOUSE BROX
Found one! BRSK1_MOUSE BRSK1
Found one! BRSK2_MOUSE BRSK2
Found one! BSDC1_MOUSE BSDC1
Found one! BSN_MOUSE BSN
Found one! BSND_MOUSE BSND
Found one! BT3L4_MOUSE BTF3L4
Found one! BTBDH_MOUSE BTBD17
Found one! BTD_MOUSE BTD
Found one! BTF3_MOUSE BTF3
Found one! BUB3_MOUSE BUB3
Found one! BUP1_MOUSE UPB1
Found one! BZW1_MOUSE BZW1
Found one! C10_MOUSE C12orf57
Found one! C170B_MOUSE CEP170B
Found one! C1QB_MOUSE C1QB
Found one! C1QBP_MOUSE C1QBP
Found one! C1QR1_MOUSE CD93
Found one! C1TC_MOUSE MTHFD1
Found one! C1TM_MOUSE MTHFD1L
Found one! C2C2L_MOUSE C2CD2L
Found one! C2CD5_MOUSE C2CD5
Found one! C2D1A_MOUSE CC2D1A
Foun

Found one! CTL2_MOUSE SLC44A2
Found one! CTNA1_MOUSE CTNNA1
Found one! CTNA2_MOUSE CTNNA2
Found one! CTNA3_MOUSE CTNNA3
Found one! CTNB1_MOUSE CTNNB1
Found one! CTND1_MOUSE CTNND1
Found one! CTND2_MOUSE CTNND2
Found one! CTRO_MOUSE CIT
Found one! CTTB2_MOUSE CTTNBP2
Found one! CUBN_MOUSE CUBN
Found one! CUL1_MOUSE CUL1
Found one! CUL2_MOUSE CUL2
Found one! CUL3_MOUSE CUL3
Found one! CUL4A_MOUSE CUL4A
Found one! CUL4B_MOUSE CUL4B
Found one! CUL5_MOUSE CUL5
Found one! CUTA_MOUSE CUTA
Found one! CUTC_MOUSE CUTC
Found one! CX6A1_MOUSE COX6A1
Found one! CX6B1_MOUSE COX6B1
Found one! CX7A1_MOUSE COX7A1
Found one! CX7A2_MOUSE COX7A2
Found one! CXA1_MOUSE GJA1
Found one! CXAR_MOUSE CXADR
Found one! CXB1_MOUSE GJB1
Found one! CY1_MOUSE CYC1
Found one! CYB5_MOUSE CYB5A
Found one! CYB5B_MOUSE CYB5B
Found one! CYBP_MOUSE CACYBP
Found one! CYC_MOUSE CYCS
Found one! CYFP1_MOUSE CYFIP1
Found one! CYFP2_MOUSE CYFIP2
Found one! CYGB_MOUSE CYGB
Found one! CYH2_MOUSE CYTH2
Found one! CYH3_MOUSE CYTH3
Fou

Found one! F131B_MOUSE FAM131B
Found one! F136A_MOUSE FAM136A
Found one! F13A_MOUSE F13A1
Found one! F151A_MOUSE FAM151A
Found one! F162A_MOUSE FAM162A
Found one! F169A_MOUSE FAM169A
Found one! F16P1_MOUSE FBP1
Found one! F16P2_MOUSE FBP2
Found one! F175B_MOUSE FAM175B
Found one! F213A_MOUSE FAM213A
Found one! F234B_MOUSE FAM234B
Found one! F261_MOUSE PFKFB1
Found one! F8I2_MOUSE F8A1 (includes others)
Found one! F92A1_MOUSE FAM92A
Found one! FA49A_MOUSE FAM49A
Found one! FA49B_MOUSE FAM49B
Found one! FA65A_MOUSE FAM65A
Found one! FA98B_MOUSE FAM98B
Found one! FAAA_MOUSE FAH
Found one! FAAH1_MOUSE FAAH
Found one! FABD_MOUSE MCAT
Found one! FABP4_MOUSE FABP4
Found one! FABP5_MOUSE FABP5
Found one! FABP7_MOUSE FABP7
Found one! FABPH_MOUSE FABP3
Found one! FABPI_MOUSE FABP2
Found one! FABPL_MOUSE FABP1
Found one! FACE1_MOUSE ZMPSTE24
Found one! FAD1_MOUSE FLAD1
Found one! FADS1_MOUSE FADS1
Found one! FADS2_MOUSE FADS2
Found one! FAF1_MOUSE FAF1
Found one! FAF2_MOUSE FAF2
Found one! FAHD1_

Found one! ICT1_MOUSE MRPL58
Found one! IDE_MOUSE IDE
Found one! IDH3A_MOUSE IDH3A
Found one! IDHC_MOUSE IDH1
Found one! IDHG1_MOUSE IDH3G
Found one! IDHP_MOUSE IDH2
Found one! IF1A_MOUSE EIF1AX
Found one! IF1AX_MOUSE EIF1AY
Found one! IF2A_MOUSE EIF2S1
Found one! IF2B_MOUSE EIF2S2
Found one! IF2M_MOUSE MTIF2
Found one! IF2P_MOUSE EIF5B
Found one! IF4A1_MOUSE EIF4A1
Found one! IF4A2_MOUSE EIF4A2
Found one! IF4A3_MOUSE EIF4A3
Found one! IF4B_MOUSE EIF4B
Found one! IF4E_MOUSE EIF4E
Found one! IF4G1_MOUSE EIF4G1
Found one! IF4G2_MOUSE EIF4G2
Found one! IF4G3_MOUSE EIF4G3
Found one! IF4H_MOUSE EIF4H
Found one! IF5_MOUSE EIF5
Found one! IF5A1_MOUSE EIF5A
Found one! IF6_MOUSE EIF6
Found one! IGBP1_MOUSE IGBP1
Found one! IGHM_MOUSE IGHM
Found one! IGKC_MOUSE IGKC
Found one! IGLO5_MOUSE IGLON5
Found one! IGS21_MOUSE IGSF21
Found one! IGSF8_MOUSE IGSF8
Found one! ILEUA_MOUSE SERPINB1
Found one! ILF2_MOUSE ILF2
Found one! ILF3_MOUSE ILF3
Found one! ILK_MOUSE ILK
Found one! ILKAP_MOUSE ILKAP
Foun

Found one! MAT2B_MOUSE MAT2B
Found one! MATR3_MOUSE MATR3
Found one! MAVS_MOUSE MAVS
Found one! MBB1A_MOUSE MYBBP1A
Found one! MBD2_MOUSE MBD2
Found one! MBL2_MOUSE MBL2
Found one! MBLC2_MOUSE MBLAC2
Found one! MBP_MOUSE MBP
Found one! MBPHL_MOUSE MYBPHL
Found one! MCA3_MOUSE EEF1E1
Found one! MCAT_MOUSE SLC25A20
Found one! MCCA_MOUSE MCCC1
Found one! MCCB_MOUSE MCCC2
Found one! MCEE_MOUSE MCEE
Found one! MCES_MOUSE RNMT
Found one! MCTS1_MOUSE MCTS1
Found one! MCU_MOUSE MCU
Found one! MCUR1_MOUSE MCUR1
Found one! MD1L1_MOUSE MAD1L1
Found one! MDHC_MOUSE MDH1
Found one! MDHM_MOUSE MDH2
Found one! MDR3_MOUSE ABCB4
Found one! MECP2_MOUSE MECP2
Found one! MECR_MOUSE MECR
Found one! MEMO1_MOUSE MEMO1
Found one! MEP1A_MOUSE MEP1A
Found one! MEP1B_MOUSE MEP1B
Found one! MEP50_MOUSE WDR77
Found one! MESD_MOUSE MESDC2
Found one! MET7B_MOUSE METTL7B
Found one! METK1_MOUSE MAT1A
Found one! METK2_MOUSE MAT2A
Found one! MFAP4_MOUSE MFAP4
Found one! MFAP5_MOUSE MFAP5
Found one! MFGM_MOUSE MFGE8
Foun

Found one! NU155_MOUSE NUP155
Found one! NU1M_MOUSE MT-ND1
Found one! NU214_MOUSE NUP214
Found one! NU4M_MOUSE MT-ND4
Found one! NU5M_MOUSE MT-ND5
Found one! NUB1_MOUSE NUB1
Found one! NUBPL_MOUSE NUBPL
Found one! NUCB1_MOUSE NUCB1
Found one! NUCB2_MOUSE NUCB2
Found one! NUCG_MOUSE ENDOG
Found one! NUCKS_MOUSE NUCKS1
Found one! NUCL_MOUSE NCL
Found one! NUD12_MOUSE NUDT12
Found one! NUD13_MOUSE NUDT13
Found one! NUD14_MOUSE NUDT14
Found one! NUD16_MOUSE NUDT16
Found one! NUD19_MOUSE NUDT19
Found one! NUDC_MOUSE NUDC
Found one! NUDC2_MOUSE NUDCD2
Found one! NUDC3_MOUSE NUDCD3
Found one! NUDT3_MOUSE NUDT3
Found one! NUDT5_MOUSE NUDT5
Found one! NUDT7_MOUSE NUDT7
Found one! NUDT9_MOUSE NUDT9
Found one! NUFP2_MOUSE NUFIP2
Found one! NUMB_MOUSE NUMB
Found one! NUMBL_MOUSE NUMBL
Found one! NUP58_MOUSE NUP58
Found one! NUP62_MOUSE NUP62
Found one! NUP85_MOUSE NUP85
Found one! NUP93_MOUSE NUP93
Found one! NUP98_MOUSE NUP98
Found one! NXF1_MOUSE NXF1
Found one! OAT_MOUSE OAT
Found one! OBSCN_MO

Found one! PPP5_MOUSE PPP5C
Found one! PPP6_MOUSE PPP6C
Found one! PPR18_MOUSE PPP1R18
Found one! PPR1A_MOUSE PPP1R1A
Found one! PPR1B_MOUSE PPP1R1B
Found one! PPR21_MOUSE PPP1R21
Found one! PPR3A_MOUSE PPP1R3A
Found one! PPT1_MOUSE PPT1
Found one! PPTC7_MOUSE PPTC7
Found one! PR40A_MOUSE PRPF40A
Found one! PRAF3_MOUSE ARL6IP5
Found one! PRAX_MOUSE PRX
Found one! PRC2A_MOUSE PRRC2A
Found one! PRC2C_MOUSE PRRC2C
Found one! PRDBP_MOUSE PRKCDBP
Found one! PRDX1_MOUSE PRDX1
Found one! PRDX2_MOUSE PRDX2
Found one! PRDX3_MOUSE PRDX3
Found one! PRDX4_MOUSE PRDX4
Found one! PRDX5_MOUSE PRDX5
Found one! PRDX6_MOUSE PRDX6
Found one! PREB_MOUSE PREB
Found one! PRELP_MOUSE PRELP
Found one! PREP_MOUSE PITRM1
Found one! PREX1_MOUSE PREX1
Found one! PRIC2_MOUSE PRICKLE2
Found one! PRIO_MOUSE PRNP
Found one! PRKDC_MOUSE PRKDC
Found one! PRKRA_MOUSE PRKRA
Found one! PROD_MOUSE LOC102724788/PRODH
Found one! PROD2_MOUSE PRODH2
Found one! PROF1_MOUSE PFN1
Found one! PROF2_MOUSE PFN2
Found one! PROM1_MOUSE

Found one! RS28_MOUSE RPS28
Found one! RS29_MOUSE RPS29
Found one! RS3_MOUSE RPS3
Found one! RS30_MOUSE FAU
Found one! RS4X_MOUSE RPS4Y1
Found one! RS5_MOUSE RPS5
Found one! RS6_MOUSE RPS6
Found one! RS7_MOUSE RPS7
Found one! RS8_MOUSE RPS8
Found one! RS9_MOUSE RPS9
Found one! RSSA_MOUSE RPSA
Found one! RSU1_MOUSE RSU1
Found one! RT02_MOUSE MRPS2
Found one! RT05_MOUSE MRPS5
Found one! RT06_MOUSE MRPS6
Found one! RT07_MOUSE MRPS7
Found one! RT09_MOUSE MRPS9
Found one! RT18B_MOUSE MRPS18B
Found one! RT22_MOUSE MRPS22
Found one! RT23_MOUSE MRPS23
Found one! RT27_MOUSE MRPS27
Found one! RT28_MOUSE MRPS28
Found one! RT29_MOUSE DAP3
Found one! RT30_MOUSE MRPS30
Found one! RT31_MOUSE MRPS31
Found one! RT34_MOUSE MRPS34
Found one! RT35_MOUSE MRPS35
Found one! RT36_MOUSE MRPS36
Found one! RT4I1_MOUSE RTN4IP1
Found one! RTCA_MOUSE RTCA
Found one! RTCB_MOUSE RTCB
Found one! RTN1_MOUSE RTN1
Found one! RTN2_MOUSE RTN2
Found one! RTN3_MOUSE RTN3
Found one! RTN4_MOUSE RTN4
Found one! RU17_MOUSE SNRNP

Found one! SRS10_MOUSE SRSF10
Found one! SRSF1_MOUSE SRSF1
Found one! SRSF3_MOUSE SRSF3
Found one! SRSF4_MOUSE SRSF4
Found one! SRSF6_MOUSE SRSF6
Found one! SRSF7_MOUSE SRSF7
Found one! SRSF9_MOUSE SRSF9
Found one! SSBP_MOUSE SSBP1
Found one! SSDH_MOUSE ALDH5A1
Found one! SSFA2_MOUSE SSFA2
Found one! SSRA_MOUSE SSR1
Found one! SSRD_MOUSE SSR4
Found one! SSRP1_MOUSE SSRP1
Found one! ST1B1_MOUSE SULT1B1
Found one! ST1C2_MOUSE SULT1C2
Found one! ST2A1_MOUSE SULT2A1
Found one! ST32C_MOUSE STK32C
Found one! ST38L_MOUSE STK38L
Found one! ST4A1_MOUSE SULT4A1
Found one! STA13_MOUSE STARD13
Found one! STAG2_MOUSE STAG2
Found one! STAM1_MOUSE STAM
Found one! STAR5_MOUSE STARD5
Found one! STAT1_MOUSE STAT1
Found one! STAT3_MOUSE STAT3
Found one! STAU1_MOUSE STAU1
Found one! STB5L_MOUSE STXBP5L
Found one! STBD1_MOUSE STBD1
Found one! STEA4_MOUSE STEAP4
Found one! STIM1_MOUSE STIM1
Found one! STIM2_MOUSE STIM2
Found one! STIP1_MOUSE STIP1
Found one! STK24_MOUSE STK24
Found one! STK3_MOUSE STK3
Foun

Found one! VDAC1_MOUSE VDAC1
Found one! VDAC2_MOUSE VDAC2
Found one! VDAC3_MOUSE VDAC3
Found one! VGFR2_MOUSE KDR
Found one! VGLU1_MOUSE SLC17A7
Found one! VGLU2_MOUSE SLC17A6
Found one! VIAAT_MOUSE SLC32A1
Found one! VIGLN_MOUSE HDLBP
Found one! VILI_MOUSE VIL1
Found one! VILL_MOUSE VILL
Found one! VIME_MOUSE VIM
Found one! VINC_MOUSE VCL
Found one! VINEX_MOUSE SORBS3
Found one! VISL1_MOUSE VSNL1
Found one! VKGC_MOUSE GGCX
Found one! VMA5A_MOUSE VWA5A
Found one! VNN1_MOUSE VNN1
Found one! VP13A_MOUSE VPS13A
Found one! VP13C_MOUSE VPS13C
Found one! VP26A_MOUSE VPS26A
Found one! VP26B_MOUSE VPS26B
Found one! VP33A_MOUSE VPS33A
Found one! VP33B_MOUSE VPS33B
Found one! VP37C_MOUSE VPS37C
Found one! VPP1_MOUSE ATP6V0A1
Found one! VPP4_MOUSE ATP6V0A4
Found one! VPS11_MOUSE VPS11
Found one! VPS16_MOUSE VPS16
Found one! VPS18_MOUSE VPS18
Found one! VPS25_MOUSE VPS25
Found one! VPS28_MOUSE VPS28
Found one! VPS29_MOUSE VPS29
Found one! VPS35_MOUSE VPS35
Found one! VPS36_MOUSE VPS36
Found one! V

In [65]:
#########################
#
# Load and clean human data
#
#########################

human_lung_file = r'F:\Human_Lung_Raw_Files\LungMAP\combined\txt\human_lung_proteinGroups.txt'
human_groups = ['Human_Lung']

human_lung_df = mq.load_df(human_lung_file)
human_lung_df = mq.clean_weakly_identified(human_lung_df)
human_lung_df = mq.remove_dup_proteinIDs(human_lung_df)
        
human_lung_iBAQ_df = mq.slice_by_column(human_lung_df, 'protein', 'iBAQ ') 
    
human_lung_organ_columns = {} # 'Liver': ['iBAQ 04_Liver', 'iBAQ 05_Liver', ...]
human_lung_organ_counts = {} # 'Liver': 
    
human_lung_iBAQ_df = mq.filter_low_observed(human_lung_iBAQ_df, human_groups, human_lung_organ_columns, human_lung_organ_counts)

human_lung_iBAQ_df['Majority protein IDs'] = human_lung_iBAQ_df['Majority protein IDs'].str[:-6]
mq.log2_normalize(human_lung_iBAQ_df)
mq.median_normalize(human_lung_iBAQ_df)

human_lung_iBAQ_df.set_index('Majority protein IDs', inplace = True)
human_lung_iBAQ_df = mq.impute_missing(human_lung_iBAQ_df)

  return lib.map_infer(x.asobject, func)


In [66]:
#########################
#
# Join mouse data to human data so that human_t_test has same shape as trained data
#
#########################

combined_df = iBAQ_df.join(human_lung_iBAQ_df)
combined_df = mq.impute_missing(combined_df)
all_columns = combined_df.columns.values.tolist()

combined_pca, combined_pca_data = mq.do_pca(combined_df)

# Add mapping of 'Human_Lung' to all human lung columns, and new organ group to groups
organ_columns.update(human_lung_organ_columns)
groups.append('Human_Lung')

# Draw scree plot of all human and mouse tissues
combined_dir = base_dir + 'Human_'
combined_color_mapping = mq.map_colors(groups, organ_columns)

combined_per_var, combined_labels = mq.make_scree_plot(combined_pca, combined_dir)
mq.draw_pca_graph(all_columns, combined_pca_data, combined_dir, combined_color_mapping, combined_per_var, combined_labels)

combined_df.drop(combined_df.columns[:30], axis=1, inplace=True) # Drop mouse data

In [67]:
#########################
#
# Make the prediction
#
#########################

scaled_human_data = preprocessing.scale(combined_df.T)
human_t_test = pca.transform(scaled_human_data)

human_pred = knn.predict(human_t_test)
print('pred', human_pred)

pred ['Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart'
 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart'
 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart'
 'Heart' 'Heart']




In [68]:
### Randomly select train and test groups

## Random Forest

## Naive Bayes/LDA

# Classify using Peptide Data

In [69]:
#########################
#
# Load and clean peptide data
#
#########################

peptide_file = "D:\peptides.txt"

peptide_df = mq.load_df(peptide_file)
peptide_df = mq.slice_by_column(peptide_df, 'peptide', 'LFQ')
peptide_df.columns = rename_columns(peptide_df, 'Adult', 'Mouse')

peptide_organ_columns = {}
peptide_organ_counts = {}
peptide_df = mq.filter_low_observed(peptide_df, groups, peptide_organ_columns, peptide_organ_counts)
mq.log2_normalize(peptide_df)
mq.median_normalize(peptide_df)

peptide_df = peptide_df.replace(r'\n','', regex=True)
peptide_df.set_index('Sequence', inplace = True)
peptide_df = mq.impute_missing(peptide_df)

peptide_labels = get_labels(peptide_df)

  return lib.map_infer(x.asobject, func)


In [70]:
#########################
#
# Split data and labels into test and train groups
#
#########################

scaled_peptide_data = preprocessing.scale(peptide_df.T)

### Randomly split:
peptide_X_train, peptide_X_test, peptide_y_train, peptide_y_test = cross_validation.train_test_split(peptide_df.T, peptide_labels, test_size=0.4, random_state=0, stratify=peptide_labels)

peptide_pca = PCA(n_components=4)
peptide_pca.fit(peptide_X_train)
peptide_X_t_train = peptide_pca.transform(peptide_X_train)
peptide_X_t_test = peptide_pca.transform(peptide_X_test)

print(peptide_X_t_train.shape)
print(peptide_X_t_test.shape)
print(peptide_y_train)

(18, 4)
(12, 4)
['Lung', 'Liver', 'Heart', 'Heart', 'Kidney', 'Liver', 'Heart', 'Kidney', 'Kidney', 'Liver', 'Brain', 'Kidney', 'Brain', 'Brain', 'Brain', 'Lung', 'Lung', 'Heart']


In [71]:
#########################
#
# Draw PCA plots for peptide data
#
#########################

peptide_dir = base_dir + 'Mouse_Peptide_'
peptide_color_mapping = mq.map_colors(groups, peptide_organ_columns)
columns = peptide_df.columns.values.tolist()

peptide_pca, peptide_pca_data = mq.do_pca(peptide_df.copy())
peptide_per_var, peptide_labels = mq.make_scree_plot(peptide_pca, peptide_dir)
mq.draw_pca_graph(columns, peptide_pca_data, peptide_dir, peptide_color_mapping, peptide_per_var, peptide_labels)


In [72]:
#########################
#
# Attempt to classify using SVC algorithm
#
#########################

peptide_clf = SVC()
peptide_clf.fit(peptide_X_t_train, peptide_y_train)
peptide_y_pred = peptide_clf.predict(peptide_X_t_test)

print('score', accuracy_score(peptide_y_pred, peptide_y_test))
print('pred label', peptide_y_pred)
print('actual', peptide_y_test)

score 0.166666666667
pred label ['Kidney' 'Kidney' 'Kidney' 'Kidney' 'Kidney' 'Kidney' 'Kidney' 'Kidney'
 'Kidney' 'Kidney' 'Kidney' 'Kidney']
actual ['Brain', 'Lung', 'Lung', 'Kidney', 'Heart', 'Liver', 'Liver', 'Heart', 'Liver', 'Kidney', 'Lung', 'Brain']


In [73]:
#########################
#
# Attempt to classify using 4 SVC variations 
#
#########################

try_SVC_models(peptide_X_t_train, peptide_y_train, peptide_X_t_test, peptide_y_test)


*** Model:  SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

score 0.166666666667
pred label ['Kidney' 'Kidney' 'Kidney' 'Kidney' 'Kidney' 'Kidney' 'Kidney' 'Kidney'
 'Kidney' 'Kidney' 'Kidney' 'Kidney']
actual ['Brain', 'Lung', 'Lung', 'Kidney', 'Heart', 'Liver', 'Liver', 'Heart', 'Liver', 'Kidney', 'Lung', 'Brain']

*** Model:  LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0) 

score 0.0
pred label ['Kidney' 'Heart' 'Liver' 'Heart' 'Kidney' 'Kidney' 'Kidney' 'Brain'
 'Kidney' 'Brain' 'Heart' 'Liver']
actual ['Brain', 'Lung', 'Lung', 'Kidney', 'Heart', 'Liver', 'Liver', 'Heart', 'Liver', 'Kidney', 'Lung', 'Brain']

*** Model:  SVC(C=1.0, cache

In [74]:
#########################
#
# Attempt to classify using K Neighbors algorithm
#
#########################

peptide_knn = KNeighborsClassifier()
peptide_knn.fit(peptide_X_t_train, peptide_y_train)
peptide_y_pred = peptide_knn.predict(peptide_X_t_test)

print('score', accuracy_score(peptide_y_pred, peptide_y_test))
print('pred', peptide_y_pred)
print('actual', peptide_y_test)

score 0.0833333333333
pred ['Lung' 'Liver' 'Brain' 'Liver' 'Lung' 'Kidney' 'Lung' 'Brain' 'Brain'
 'Brain' 'Heart' 'Brain']
actual ['Brain', 'Lung', 'Lung', 'Kidney', 'Heart', 'Liver', 'Liver', 'Heart', 'Liver', 'Kidney', 'Lung', 'Brain']


In [90]:
#########################
#
# Attempt to classify using decision tree
#
#########################

peptide_dt_clf = tree.DecisionTreeClassifier()
peptide_dt_clf = peptide_dt_clf.fit(peptide_X_t_train, peptide_y_train)
peptide_dt_pred = peptide_dt_clf.predict(peptide_X_t_test)

print('score', accuracy_score(peptide_dt_pred, peptide_y_test))
print('pred', peptide_dt_pred)
print('actual', peptide_y_test)

score 0.166666666667
pred ['Lung' 'Liver' 'Lung' 'Liver' 'Lung' 'Lung' 'Lung' 'Heart' 'Lung' 'Heart'
 'Liver' 'Kidney']
actual ['Brain', 'Lung', 'Lung', 'Kidney', 'Heart', 'Liver', 'Liver', 'Heart', 'Liver', 'Kidney', 'Lung', 'Brain']
