In [157]:
import MaxQuant_Postprocessing_Functions as mq

In [158]:
#########################
#
# Load and clean data
#
#########################

file = "D:\proteinGroups.txt"

df = mq.load_df(file)
df = mq.clean_weakly_identified(df)
df = mq.remove_dup_proteinIDs(df)

iBAQ_df = mq.slice_by_column(df, 'iBAQ ')

groups = ['Brain', 'Heart', 'Kidney', 'Liver', 'Lung']
organ_columns = {} # 'Liver': ['iBAQ 04_Liver', 'iBAQ 05_Liver', ...]
organ_counts = {} # 'Liver': 
    
iBAQ_df = mq.filter_low_observed(iBAQ_df, groups, organ_columns, organ_counts)

In [159]:
#########################
#
# Normalize data and impute missing values with data frame minimum/2
#
#########################

mq.log2_normalize(iBAQ_df)
mq.median_normalize(iBAQ_df)

iBAQ_df.set_index('Majority protein IDs', inplace = True)
iBAQ_df = mq.impute_missing(iBAQ_df)

  return lib.map_infer(x.asobject, func)


In [160]:
#########################
#
# Map each column name to a corresponding label
#
#########################

def get_labels(df):
    columns = iBAQ_df.columns.values.tolist()
    labels = []

    for column in columns:
        key = next(key for key, value in organ_columns.items() if column in value)
        labels.append(key)
        
    return labels

In [161]:
iBAQ_df = iBAQ_df[organ_columns['Brain'] + organ_columns['Heart'] + organ_columns['Liver'] + organ_columns['Lung']]

labels = get_labels(iBAQ_df)
print(iBAQ_df.columns.values.tolist())
print(labels)

['iBAQ Adult_07_Brain', 'iBAQ Adult_08_Brain', 'iBAQ Adult_09_Brain', 'iBAQ Adult_10_Brain', 'iBAQ Adult_11_Brain', 'iBAQ Adult_12_Brain', 'iBAQ Adult_07_Heart', 'iBAQ Adult_08_Heart', 'iBAQ Adult_09_Heart', 'iBAQ Adult_10_Heart', 'iBAQ Adult_11_Heart', 'iBAQ Adult_12_Heart', 'iBAQ Adult_04_Liver', 'iBAQ Adult_05_Liver', 'iBAQ Adult_06_Liver', 'iBAQ Adult_07_Liver', 'iBAQ Adult_08_Liver', 'iBAQ Adult_09_Liver', 'iBAQ Adult_07_Lung', 'iBAQ Adult_08_Lung', 'iBAQ Adult_09_Lung', 'iBAQ Adult_10_Lung', 'iBAQ Adult_11_Lung', 'iBAQ Adult_12_Lung']
['Brain', 'Brain', 'Brain', 'Brain', 'Brain', 'Brain', 'Heart', 'Heart', 'Heart', 'Heart', 'Heart', 'Heart', 'Liver', 'Liver', 'Liver', 'Liver', 'Liver', 'Liver', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung', 'Lung']


In [162]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn import cross_validation
from sklearn import preprocessing

# Transpose so that proteins are columns (components)
# Scale data
scaled_data = preprocessing.scale(iBAQ_df.T)

### Split data and labels into test and train groups
#X_train, X_test, y_train, y_test = cross_validation.train_test_split(iBAQ_df.T, labels, test_size=0.4, random_state=0)
X_train = np.concatenate([scaled_data[:4, :], scaled_data[6:10, :], scaled_data[12:16, :], scaled_data[18:22, :], scaled_data[24:28, :], scaled_data[30:34, :]], axis=0)

X_test = np.concatenate([scaled_data[4:6, :], scaled_data[10:12, :], scaled_data[16:18, :], scaled_data[22:24, :], scaled_data[28:30, :], scaled_data[34:, :]], axis=0)

y_train = labels[:4] + labels[6:10] + labels[12:16] + labels[18:22] + labels[24:28] + labels[30:34]
y_test = labels[4:6] + labels[10:12] + labels[16:18] + labels[22:24] + labels[28:30] + labels[34:]

print(y_train)
print(y_test)

['Brain', 'Brain', 'Brain', 'Brain', 'Heart', 'Heart', 'Heart', 'Heart', 'Liver', 'Liver', 'Liver', 'Liver', 'Lung', 'Lung', 'Lung', 'Lung']
['Brain', 'Brain', 'Heart', 'Heart', 'Liver', 'Liver', 'Lung', 'Lung']




In [163]:
pca = PCA(n_components=2)
pca.fit(X_train)
X_t_train = pca.transform(X_train)
X_t_test = pca.transform(X_test)

print(X_t_train)
print(X_t_test)

[[-44.02830957 -32.38468976]
 [-42.64874034 -30.8399149 ]
 [-43.47990723 -31.43040131]
 [-42.54619068 -30.24432276]
 [ 53.82209977 -30.87401567]
 [ 57.40606365 -32.13785411]
 [ 52.85921902 -30.67684518]
 [ 48.7881994  -28.69153182]
 [ 15.01460365  48.05255756]
 [ 14.89971339  46.92326144]
 [ 11.97137624  46.66677303]
 [ 12.02444951  46.22532149]
 [-25.02745466  15.76927376]
 [-20.23481461  14.20228527]
 [-25.54897231  15.14767983]
 [-23.27133522  14.29242312]]
[[-41.70952987 -28.27218186]
 [-40.89201608 -27.47418082]
 [ 41.92079718 -25.36239839]
 [ 41.90004202 -24.23798199]
 [ 10.30863459  44.59129202]
 [ 11.51147487  43.12494578]
 [-23.36283839  12.74249821]
 [-20.27771134  13.08733572]]


In [150]:
clf = SVC()
clf.fit(X_t_train, y_train)
print('score', clf.score(X_t_test, y_test))
print('pred label', clf.predict(X_t_test))

score 0.5
pred label ['Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Heart' 'Lung' 'Lung']
