In [1]:
from glycan import glycan
import numpy as np
import pandas as pd

from sklearn import svm
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

In [2]:
def read_egg_data():
    g = glycan()
    # Read data (CSV files) as pandas data frames. Each file is a new line.
    # Note that each file can have multiple experiments, called "tabs" here.
    # You can specify how many tabs you want to read using the totalTabs variable.

    # The "wh" parameter simply controls the format of the CSV file.
    # The 'old' format is slightly different from the 'new' format, but has the same information
    numberedExcel1, b = g.getData('Data/H1N1_EGG/old_sheet1.csv', wh='old', totalTabs=3, startExp=1)
    numberedExcel2, b = g.getData('Data/H1N1_EGG/old_sheet2.csv', wh='old', totalTabs=4, startExp=(b+1))
    numberedExcel3, b = g.getData('Data/H1N1_EGG/old_sheet3.csv', wh='old', totalTabs=4, startExp=(b+1))
    numberedExcel4, b = g.getData('Data/H1N1_EGG/new_sheet1.csv', totalTabs=2, startExp=(b+1))
    numberedExcel5, b = g.getData('Data/H1N1_EGG/new_sheet2.csv', totalTabs=2, startExp=(b+1))
    numberedExcel6, b = g.getData('Data/H1N1_EGG/new_sheet3.csv', totalTabs=2, startExp=(b+1))
    numberedExcel7, b = g.getData('Data/H1N1_EGG/new_sheet4.csv', totalTabs=4, startExp=(b+1))
    numberedExcel8, b = g.getData('Data/H1N1_EGG/new_sheet5.csv', totalTabs=4, startExp=(b+1))
    numberedExcel9, b = g.getData('Data/H1N1_EGG/new_sheet6.csv', totalTabs=4, startExp=(b+1))
    numberedExcel10, b = g.getData('Data/H1N1_EGG/egg_virus_dilutions_secondTab_updated.csv', totalTabs=2, startExp=(b+1))
    
    
    # Now combine all the data frames
    numberedExcel_temp = pd.concat([numberedExcel1, numberedExcel2, numberedExcel3, numberedExcel4, numberedExcel5, numberedExcel6, numberedExcel7, numberedExcel8, numberedExcel9, numberedExcel10], axis=0)
    # Removing the blocks with bad data. These were hand-picked by visual inspection of experimental results
    egg = numberedExcel_temp[~((numberedExcel_temp.SubArr == 23) | (numberedExcel_temp.SubArr == 25) | (numberedExcel_temp.SubArr == 26))]
    egg.reset_index(drop=True, inplace=True)
    # Scaling the mean viral fluorescence by glycan density.
    egg['MVF'] = egg.MVF/egg.GlycDen
    return egg

In [3]:
# This function identifies true binders, true non-binders, false binders, and false non-binders,
#       given the train, test, and predictions.

def getResults(X_train, Y_train, X_test, Y_test, preds_train, preds_test): 
    X_train_2 = X_train.copy()
    X_test_2 = X_test.copy()
    
    X_train_2['Label'] = Y_train
    X_train_2['Pred'] = preds_train
    X_train_2['Correct'] = X_train_2.Label == X_train_2.Pred
    X_train_2['Correct_2'] = X_train_2['Correct']
    X_train_2.loc[(X_train_2.Correct) & (X_train_2.Label==1.0), 'Correct_2'] = 'True binder'
    X_train_2.loc[(X_train_2.Correct) & (X_train_2.Label==0.0), 'Correct_2'] = 'True non-binder'
    X_train_2.loc[~(X_train_2.Correct) & (X_train_2.Label==1.0), 'Correct_2'] = 'False non-binder'
    X_train_2.loc[~(X_train_2.Correct) & (X_train_2.Label==0.0), 'Correct_2'] = 'False binder'

    X_test_2['Label'] = Y_test
    X_test_2['Pred'] = preds_test
    X_test_2['Correct'] = X_test_2.Label == X_test_2.Pred
    X_test_2['Correct_2'] = X_test_2['Correct']
    X_test_2.loc[(X_test_2.Correct) & (X_test_2.Label==1.0), 'Correct_2'] = 'True binder'
    X_test_2.loc[(X_test_2.Correct) & (X_test_2.Label==0.0), 'Correct_2'] = 'True non-binder'
    X_test_2.loc[~(X_test_2.Correct) & (X_test_2.Label==1.0), 'Correct_2'] = 'False non-binder'
    X_test_2.loc[~(X_test_2.Correct) & (X_test_2.Label==0.0), 'Correct_2'] = 'False binder'
    
    X_all = pd.concat([X_train_2, X_test_2])
    return X_all

In [4]:
egg = read_egg_data()

g = glycan()
X_train, Y_train, X_test, Y_test, bindData = g.getTrainTest(egg, cutoff = 0.05, test_size=0.33, dropDP=True)
clf_egg = svm.SVC(verbose=False)
clf_egg.fit(X_train, Y_train)
preds_train = clf_egg.predict(X_train)
preds_test = clf_egg.predict(X_test)

egg_self = getResults(X_train, Y_train, X_test, Y_test, preds_train, preds_test)

For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [5]:
def read_MDCK_data():
    # Create an instance of the glycan class
    g2 = glycan()

    # Read data (CSV files) as pandas data frames. Each file is a new line.
    # Note that each file can have multiple experiments, called "tabs" here.
    # You can specify how many tabs you want to read using the totalTabs variable.
    numberedExcel2, b = g2.getData('Data/H1N1_MDCK/MDCK_sheet2.csv', totalTabs=1, startExp=1)
    numberedExcel3, b = g2.getData('Data/H1N1_MDCK/MDCK_sheet3.csv', totalTabs=1, startExp=(b+1))
    numberedExcel4, b = g2.getData('Data/H1N1_MDCK/MDCK_sheet4.csv', totalTabs=2, startExp=(b+1))

    # Now combine all the data frames
    numberedExcel_temp = pd.concat([numberedExcel2, numberedExcel3, numberedExcel4], axis=0)
    mdck = numberedExcel_temp

    mdck.reset_index(drop=True, inplace=True)
    # Scaling the mean viral fluorescence by glycan density.
    mdck['MVF'] = mdck.MVF/mdck.GlycDen
    return mdck

In [6]:
mdck = read_MDCK_data()

g2 = glycan()
X_train_mdck, Y_train_mdck, X_test_mdck, Y_test_mdck, bindData_mdck = g2.getTrainTest(mdck, cutoff = 0.03, dropDP=True, test_size=0.33)
clf_mdck = svm.SVC(verbose=False)
clf_mdck.fit(X_train_mdck, Y_train_mdck)
preds_train_mdck = clf_mdck.predict(X_train_mdck)
preds_test_mdck = clf_mdck.predict(X_test_mdck)

mdck_self = getResults(X_train_mdck, Y_train_mdck, X_test_mdck, Y_test_mdck, preds_train_mdck, preds_test_mdck)

For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header
For now, we are skipping the header




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [7]:
# Now, we do cross-model for egg
preds_train_cross_egg = clf_mdck.predict(X_train)
preds_test_cross_egg = clf_mdck.predict(X_test)
egg_cross = getResults(X_train, Y_train, X_test, Y_test, preds_train_cross_egg, preds_test_cross_egg)

In [8]:
# This creates a data frame which has results for both self-model as well as cross-model for egg
see1 = egg_cross[['Pred']]
see1.rename({'Pred': 'Pred_cross'}, axis=1, inplace=True)
egg_final = egg_self.join(see1)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [9]:
# Now, we do cross-model for MDCK
preds_train_cross_mdck = clf_egg.predict(X_train_mdck)
preds_test_cross_mdck = clf_egg.predict(X_test_mdck)
mdck_cross = getResults(X_train_mdck, Y_train_mdck, X_test_mdck, Y_test_mdck, preds_train_cross_mdck, preds_test_cross_mdck)

In [10]:
# This creates a data frame which has results for both self-model as well as cross-model for MDCK
see2 = mdck_cross[['Pred']]
see2.rename({'Pred': 'Pred_cross'}, axis=1, inplace=True)
mdck_final = mdck_self.join(see2)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [11]:
# Now label "Lost", "Gained", and Conserved for egg

egg = egg[egg.GlycType != 'Lac']
egg['Status'] = 'Others'

egg.loc[((egg_final.Correct_2 == 'True binder') & (egg_final.Pred_cross == 1.0)), 'Status'] = 'Conserved'
egg.loc[((egg_final.Correct_2 == 'True non-binder') & (egg_final.Pred_cross == 1.0)), 'Status'] = 'Lost in egg'
egg.loc[((egg_final.Correct_2 == 'True binder') & (egg_final.Pred_cross == 0.0)), 'Status'] = 'Gained in egg'

eggcopy = egg.copy()
eggcopy[['Label', 'Pred', 'Pred_cross']] = egg_final[['Label', 'Pred', 'Pred_cross']]

In [12]:
# Now label "Lost", "Gained", and "Conserved for MDCK"

mdck = mdck[mdck.GlycType != 'Lac']
mdck['Status'] = 'Others'

mdck.loc[((mdck_final.Correct_2 == 'True binder') & (mdck_final.Pred_cross == 1.0), 'Status')] = 'Conserved'
mdck.loc[((mdck_final.Correct_2 == 'True non-binder') & (mdck_final.Pred_cross == 1.0), 'Status')] = 'Lost in MDCK'
mdck.loc[((mdck_final.Correct_2 == 'True binder') & (mdck_final.Pred_cross == 0.0), 'Status')] = 'Gained in MDCK'

mdckcopy = mdck.copy()
mdckcopy[['Label', 'Pred', 'Pred_cross']] = mdck_final[['Label', 'Pred', 'Pred_cross']]

In [13]:
# Now, create a new data frame for H1N1_EGG which has 3 labeled columns, which represent, respectively,
#      experimental results (binder/non-binder), self-model prediction, and cross-model predictions

egg_Mar21 = eggcopy.drop(['SubArr', 'ExpNum', 'FileName'], axis=1)
egg_Mar21['Experiment'] = 'Non-binder'
egg_Mar21['Self-model'] = 'Non-binder'
egg_Mar21['Cross-model'] = 'Non-binder'
egg_Mar21.loc[(egg_Mar21.Label == 1), 'Experiment'] = 'Binder'
egg_Mar21.loc[(egg_Mar21.Pred == 1), 'Self-model'] = 'Binder'
egg_Mar21.loc[(egg_Mar21.Pred_cross == 1), 'Cross-model'] = 'Binder'
egg_Mar21_final = egg_Mar21.drop(['Label', 'Pred', 'Pred_cross'], axis=1)

In [14]:
# Now, create a new data frame for H1N1_MDCK which has 3 labeled columns, which represent, respectively,
#      experimental results (binder/non-binder), self-model prediction, and cross-model predictions

mdck_Mar21 = mdckcopy.drop(['SubArr', 'ExpNum', 'FileName'], axis=1)
mdck_Mar21['Experiment'] = 'Non-binder'
mdck_Mar21['Self-model'] = 'Non-binder'
mdck_Mar21['Cross-model'] = 'Non-binder'
mdck_Mar21.loc[(mdck_Mar21.Label == 1), 'Experiment'] = 'Binder'
mdck_Mar21.loc[(mdck_Mar21.Pred == 1), 'Self-model'] = 'Binder'
mdck_Mar21.loc[(mdck_Mar21.Pred_cross == 1), 'Cross-model'] = 'Binder'
mdck_Mar21_final = mdck_Mar21.drop(['Label', 'Pred', 'Pred_cross'], axis=1)

In [15]:
# Save as CSV
egg_Mar21_final.to_csv("Results/egg_SVM_data.csv", index=False)
mdck_Mar21_final.to_csv("Results/mdck_SVM_data.csv", index=False)

In [16]:
eggcopy2 = egg.copy()
eggcopy2 = eggcopy2[eggcopy2.GlycType != 'Lac']

eggcopy2['Status'] = egg_final['Correct_2']
eggcopy3 = eggcopy2.drop(['FileName', 'SubArr', 'ExpNum'], axis=1)
eggcopy3['MVF'] = eggcopy3['MVF'].apply(lambda x: round(x, 6))
eggcopy3.to_csv("Results/egg_selfModel.csv", index=False)

In [17]:
mdckcopy2 = mdck.copy()
mdckcopy2 = mdckcopy2[mdckcopy2.GlycType != 'Lac']

mdckcopy2['Status'] = mdck_final['Correct_2']
mdckcopy3 = mdckcopy2.drop(['FileName', 'SubArr', 'ExpNum'], axis=1)
mdckcopy3['MVF'] = mdckcopy3['MVF'].apply(lambda x: round(x,6))
mdckcopy3.to_csv("Results/mdck_selfModel.csv", index=False)