# Notebook for testing gene pairs in Van Rooij, MAYO and GSE datasets

In [75]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import umap
from skrebate import ReliefF
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from itertools import combinations
from matplotlib_venn import venn3

# Global figure styling for every plot in this notebook.
# The seaborn theme is applied first; the matplotlib overrides below are set
# afterwards, so they win for any key the theme also touches.
sns.set_theme(style='white', font_scale=1.8)

plt.rcParams.update({
    'figure.figsize': (5, 5),
    'figure.dpi': 300,
    'font.family': 'Times New Roman',
    'font.size': 12,
    'figure.facecolor': 'white',
    'figure.autolayout': True,
})

## Testing performance of genes in other datasets

In [76]:
# Size tag of the ReliefF gene selection. It is embedded in the names of the
# ReliefF input files read below (df_relief_<N>_*.csv, genes_<N>_*.csv) and of
# the score files written at the end (scores_VR_to_*_<N>.csv).
N_reliefF = 500

In [77]:
# Load the VR inputs from ../Output.
# The gene and label files are read without a header row and only their first
# column is kept, as a plain array.
genes_VR = pd.read_csv('../Output/genes_VR.csv', header=None)[0].values
df_normalized_VR = pd.read_csv('../Output/df_normalized_VR.csv', index_col=0)
labels_VR = pd.read_csv('../Output/labels_VR.csv', header=None)[0].values

# ReliefF-reduced table and its gene list; file names carry the N_reliefF tag.
df_relief_VR = pd.read_csv(
    f'../Output/df_relief_{N_reliefF}_VR.csv',
    index_col=0,
)
genes_relief_VR = pd.read_csv(
    f'../Output/genes_{N_reliefF}_VR.csv',
    header=None,
)[0].values

In [78]:
# Load the MAYO inputs from ../Output.
# The gene and label files are read without a header row and only their first
# column is kept, as a plain array.
genes_MAYO = pd.read_csv('../Output/genes_MAYO.csv', header=None)[0].values
df_normalized_MAYO = pd.read_csv('../Output/df_normalized_MAYO.csv', index_col=0)
labels_MAYO = pd.read_csv('../Output/labels_MAYO.csv', header=None)[0].values

# ReliefF-reduced table and its gene list; file names carry the N_reliefF tag.
df_relief_MAYO = pd.read_csv(
    f'../Output/df_relief_{N_reliefF}_MAYO.csv',
    index_col=0,
)
genes_relief_MAYO = pd.read_csv(
    f'../Output/genes_{N_reliefF}_MAYO.csv',
    header=None,
)[0].values

In [79]:
# Load the GSE inputs from ../Output.
# The gene and label files are read without a header row and only their first
# column is kept, as a plain array.
genes_GSE = pd.read_csv('../Output/genes_GSE.csv', header=None)[0].values
df_normalized_GSE = pd.read_csv('../Output/df_normalized_GSE.csv', index_col=0)
labels_GSE = pd.read_csv('../Output/labels_GSE.csv', header=None)[0].values

# ReliefF-reduced table and its gene list; file names carry the N_reliefF tag.
df_relief_GSE = pd.read_csv(
    f'../Output/df_relief_{N_reliefF}_GSE.csv',
    index_col=0,
)
genes_relief_GSE = pd.read_csv(
    f'../Output/genes_{N_reliefF}_GSE.csv',
    header=None,
)[0].values

In [80]:
# Every unordered pair of ReliefF-selected VR genes, one pair per row.
pair_columns = ['Gene 1', 'Gene 2']
genepairs_relief_VR = pd.DataFrame(
    list(combinations(genes_relief_VR, 2)),
    columns=pair_columns,
)

In [81]:
# Display the table of candidate gene pairs (cell output only, no assignment).
genepairs_relief_VR

Unnamed: 0,Gene 1,Gene 2
0,RBP1,TRPM1
1,RBP1,BTBD17
2,RBP1,DDOST
3,RBP1,C1orf61
4,RBP1,WNT7B
...,...,...
124745,SPRYD3,MRPL24
124746,SPRYD3,NME1
124747,ME1,MRPL24
124748,ME1,NME1


In [90]:
# Display the same pairs as a two-column object array, one row per pair
# (cell output only, no assignment).
genepairs_relief_VR.values

array([['RBP1', 'TRPM1'],
       ['RBP1', 'BTBD17'],
       ['RBP1', 'DDOST'],
       ...,
       ['ME1', 'MRPL24'],
       ['ME1', 'NME1'],
       ['MRPL24', 'NME1']], dtype=object)

In [82]:
# Performance of VR gene pairs in VR dataset
scores_cv_VR_to_VR = []
scores_training_VR_to_VR = []
for pair in tqdm(genepairs_relief_VR.values.tolist()):
    if set(pair).issubset(genes_VR):
        df_to_train = df_normalized_VR[pair]

        clf = SVC(kernel='linear', C=100)
        clf.fit(df_to_train.values, labels_VR.ravel())

        cv_score = cross_val_score(clf, df_to_train.values, labels_VR.ravel(), cv=4)

        scores_cv_VR_to_VR.append(cv_score.mean())
        scores_training_VR_to_VR.append(clf.score(df_to_train.values, labels_VR.ravel()))
    else:
        scores_cv_VR_to_VR.append(np.NAN)
        scores_training_VR_to_VR.append(np.NAN)

  0%|          | 0/124750 [00:00<?, ?it/s]

100%|██████████| 124750/124750 [21:05<00:00, 98.59it/s] 


In [83]:
# Assemble one row per gene pair: both gene names followed by the
# cross-validation and training scores computed for the VR dataset.
score_rows_VR = zip(
    genepairs_relief_VR['Gene 1'].values,
    genepairs_relief_VR['Gene 2'].values,
    scores_cv_VR_to_VR,
    scores_training_VR_to_VR,
)
scores_VR_to_VR = pd.DataFrame(
    list(score_rows_VR),
    columns=['Gene 1', 'Gene 2', 'cv_VR', 'train_VR'],
)

# Save scores to a csv file (row index omitted)
scores_VR_to_VR.to_csv(f'../Output/scores_VR_to_VR_{N_reliefF}.csv', index=False)

In [84]:
# Performance of VR gene pairs in MAYO dataset
#
# For each ReliefF gene pair (selected on VR), train a linear SVM on the two
# corresponding columns of df_normalized_MAYO and record:
#   * scores_cv_VR_to_MAYO       - mean of the 5-fold cross-validation scores
#   * scores_training_VR_to_MAYO - score of the fitted model on the data it
#                                  was trained on
# A pair with a gene missing from genes_MAYO gets NaN in both lists, so the
# lists stay aligned row-for-row with genepairs_relief_VR.

# Built once: membership tests against a set are O(1), whereas testing against
# the raw gene array would re-scan it for every one of the pairs.
available_genes_MAYO = set(genes_MAYO)
y_MAYO = labels_MAYO.ravel()

scores_cv_VR_to_MAYO = []
scores_training_VR_to_MAYO = []
for pair in tqdm(genepairs_relief_VR.values.tolist()):
    if not available_genes_MAYO.issuperset(pair):
        # np.nan (lowercase) is the supported spelling; the np.NAN alias was
        # removed in NumPy 2.0.
        scores_cv_VR_to_MAYO.append(np.nan)
        scores_training_VR_to_MAYO.append(np.nan)
        continue

    df_to_train = df_normalized_MAYO[pair]

    clf = SVC(kernel='linear', C=100)
    clf.fit(df_to_train.values, y_MAYO)

    cv_score = cross_val_score(clf, df_to_train.values, y_MAYO, cv=5)

    scores_cv_VR_to_MAYO.append(cv_score.mean())
    scores_training_VR_to_MAYO.append(clf.score(df_to_train.values, y_MAYO))

100%|██████████| 124750/124750 [30:45<00:00, 67.59it/s] 


In [85]:
# Assemble one row per gene pair: both gene names followed by the
# cross-validation and training scores computed for the MAYO dataset.
score_rows_MAYO = zip(
    genepairs_relief_VR['Gene 1'].values,
    genepairs_relief_VR['Gene 2'].values,
    scores_cv_VR_to_MAYO,
    scores_training_VR_to_MAYO,
)
scores_VR_to_MAYO = pd.DataFrame(
    list(score_rows_MAYO),
    columns=['Gene 1', 'Gene 2', 'cv_MAYO', 'train_MAYO'],
)

# Save scores to a csv file (row index omitted)
scores_VR_to_MAYO.to_csv(f'../Output/scores_VR_to_MAYO_{N_reliefF}.csv', index=False)

In [86]:
# Performance of VR gene pairs in GSE dataset
#
# For each ReliefF gene pair (selected on VR), train a linear SVM on the two
# corresponding columns of df_normalized_GSE and record:
#   * scores_cv_VR_to_GSE       - mean of the 5-fold cross-validation scores
#   * scores_training_VR_to_GSE - score of the fitted model on the data it
#                                 was trained on
# A pair with a gene missing from genes_GSE gets NaN in both lists, so the
# lists stay aligned row-for-row with genepairs_relief_VR.

# Built once: membership tests against a set are O(1), whereas testing against
# the raw gene array would re-scan it for every one of the pairs.
available_genes_GSE = set(genes_GSE)
y_GSE = labels_GSE.ravel()

scores_cv_VR_to_GSE = []
scores_training_VR_to_GSE = []
for pair in tqdm(genepairs_relief_VR.values.tolist()):
    if not available_genes_GSE.issuperset(pair):
        # np.nan (lowercase) is the supported spelling; the np.NAN alias was
        # removed in NumPy 2.0.
        scores_cv_VR_to_GSE.append(np.nan)
        scores_training_VR_to_GSE.append(np.nan)
        continue

    df_to_train = df_normalized_GSE[pair]

    clf = SVC(kernel='linear', C=100)
    clf.fit(df_to_train.values, y_GSE)

    cv_score = cross_val_score(clf, df_to_train.values, y_GSE, cv=5)

    scores_cv_VR_to_GSE.append(cv_score.mean())
    scores_training_VR_to_GSE.append(clf.score(df_to_train.values, y_GSE))

100%|██████████| 124750/124750 [29:17<00:00, 70.98it/s] 


In [87]:
# Assemble one row per gene pair: both gene names followed by the
# cross-validation and training scores computed for the GSE dataset.
score_rows_GSE = zip(
    genepairs_relief_VR['Gene 1'].values,
    genepairs_relief_VR['Gene 2'].values,
    scores_cv_VR_to_GSE,
    scores_training_VR_to_GSE,
)
scores_VR_to_GSE = pd.DataFrame(
    list(score_rows_GSE),
    columns=['Gene 1', 'Gene 2', 'cv_GSE', 'train_GSE'],
)

# Save scores to a csv file (row index omitted)
scores_VR_to_GSE.to_csv(f'../Output/scores_VR_to_GSE_{N_reliefF}.csv', index=False)

In [88]:
# sumVR = []
# sumMAYO = []
# sumGSE = []
# # find out where each gene from genes_relief shows up in genes_combination and average the scores
# for gene in genes_relief:
#     # find where it shows up in genes_combination
#     gene_truth = np.logical_or( genes_combinations['Gene 1'].apply(lambda x: True if gene in x else False), genes_combinations['Gene 2'].apply(lambda x: True if gene in x else False) )
#     sumVR.append(scores[gene_truth].loc[:,'VR'].mean())
#     sumMAYO.append(scores[gene_truth].loc[:,'MAYO'].mean())
#     sumGSE.append(scores[gene_truth].loc[:,'GSE'].mean())


In [89]:
# d = {'sum of VR': sumVR, 'sum of MAYO': sumMAYO, 'sum of GSE': sumGSE}
# scores_by_gene = pd.DataFrame(d, index=genes_relief)
# scores_by_gene