In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns
from scipy.spatial.distance import cosine
from collections import defaultdict
from random import randint
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, average_precision_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV


In this notebook, we predict new kinase-disease links using the RF classifier. To do that, we use datasets of kinase-cancer links of 2015 and kinase-cancer links of 2020. The links existing in kinase-cancer 2015 will be the positive training set and the links in 2020 that don't exist in 2015 will be positive test links. Negative trainig edges are randomly chosen non-links in 2015 that are not in positive test links. Negative test edges are randomly chose non-links in 2020.

## Note:
#### The latests embedding.npy and word.txt are in the folders "dim50" and "dim100" in https://drive.google.com/drive/u/0/folders/1rkfOQ5EgV0_qbJvXaPQd06RqHbQXY_WA.
#### Send your gmail address to vidarmehr@gmail.com to have access the files. 

## Load embeddings

In [50]:
embedding = np.load("data/before2018/embedding_dim200_skipgram_2018.npy", mmap_mode=None, allow_pickle=False, fix_imports=True, encoding='ASCII')

## Load pubmed words

In [51]:
words = []
word_count = 0
with open("data/before2018/words_2018.txt","r") as f:
    for line in f:
        word = line[2:-3]
        words.append(word)
        word_count += 1
print(word_count)        
        

229274


## Create a dataframe of words and embeddings 

In [4]:
df = pd.DataFrame(data=embedding,index = words)
df.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
cell,-3.429195,3.430483,-4.907708,4.495084,2.223799,-3.300699,2.145732,-2.7544,1.000114,-2.422389,...,1.969542,-3.123235,-3.230351,-2.623941,2.730077,-2.821894,2.709452,-2.529269,2.803039,-3.782199
patient,-2.823424,3.542139,-3.555432,2.935642,3.571199,-3.978612,1.796856,-2.507029,1.354829,-2.038317,...,2.564375,-2.676333,-3.207936,-3.232185,3.204765,-4.226576,2.095013,-2.879783,3.840945,-4.669153
meshd009369,-2.60797,3.37443,-3.843605,3.018076,3.055531,-3.771113,1.75331,-2.834063,0.891717,-2.056341,...,2.119789,-3.382027,-3.372543,-3.332763,2.804443,-3.961338,1.707528,-2.762952,3.296864,-4.144413
studi,-2.967239,4.113805,-4.213249,2.891041,2.534088,-2.909812,2.658576,-3.587436,0.851083,-2.149255,...,2.208216,-2.647985,-3.918214,-3.326091,2.663495,-3.441043,2.491069,-2.598676,3.606502,-4.447232
express,-3.037426,2.555344,-3.201977,2.945295,2.50457,-3.649581,2.684163,-3.164925,1.061155,-2.23399,...,2.121447,-2.99291,-2.789394,-2.702133,2.949669,-3.424434,1.767411,-2.73675,3.828856,-3.606953
0,-3.433465,3.155048,-4.151999,2.361702,4.215919,-3.036508,3.155546,-2.527822,1.147282,-1.535147,...,2.430419,-4.175333,-3.238241,-3.578023,2.909658,-3.585797,1.576796,-2.237536,3.975976,-3.760139
use,-3.021726,3.092827,-4.225416,2.717899,2.234242,-2.977322,2.018353,-3.990392,0.517792,-2.057612,...,2.1561,-2.818921,-3.793691,-2.594279,2.928304,-3.824185,2.440206,-2.342977,4.112683,-3.769578
activ,-3.081957,3.394082,-3.764299,3.095361,2.717619,-2.870403,3.053381,-3.402558,0.964453,-2.508792,...,2.019854,-2.824445,-3.400595,-1.951133,2.928699,-2.525934,2.155754,-3.005216,3.619453,-4.145101
result,-2.854495,3.528433,-3.578857,2.480143,3.353607,-3.399108,2.618502,-3.407653,0.802987,-2.027899,...,2.508424,-3.015148,-3.014844,-2.925622,2.415649,-3.261499,2.451554,-3.168533,2.481572,-3.959124
1,-3.487416,3.9249,-3.467685,3.655912,4.170876,-3.042357,2.794878,-2.779564,0.846294,-2.139251,...,2.246127,-2.816232,-3.096816,-2.737211,3.319406,-2.673875,2.04229,-2.219684,3.778319,-3.873589


# Positive training data:
## Load the positive training (kinase_cancer links before 2018)


In [5]:
pos_train = pd.read_csv("../prediction/output/pos_train.tsv",  sep= "\t")

In [6]:
pos_train.head()

Unnamed: 0,kinase,cancer
0,ncbigene1019,meshd001943
1,ncbigene1021,meshd001943
2,ncbigene695,meshd007938
3,ncbigene695,meshd015448
4,ncbigene695,meshd015451


In [7]:
pos_train.shape

(340, 2)

## Calculate the difference between the kinases and mesh id of their corresponding diseases

In [8]:
diff_kinase_mesh_list_pos_train = []
diff_index_pos_train = []
for i in pos_train.index:
    ncbigene_id = pos_train.iloc[i][0]
    mesh_id = pos_train.iloc[i][1]
    if ncbigene_id in df.index:
        ncbigene_id_embedding = df.loc[ncbigene_id]
    else:
        print("The gene {} does not exist in Pubmed". format(ncbigene_id))
    if mesh_id in df.index:
        mesh_id_embedding = df.loc[mesh_id]
    else:
        print("The gene {} does not exist in Pubmed". format(ncbigene_id))   
    if ncbigene_id in df.index and mesh_id in df.index:     
        diff_kinase_mesh = np.subtract(ncbigene_id_embedding, mesh_id_embedding)
        diff_kinase_mesh_list_pos_train.append(diff_kinase_mesh)
        diff_index_pos_train.append(ncbigene_id + "," + mesh_id)
        #print(diff_gene_mesh)

The gene ncbigene3791 does not exist in Pubmed
The gene ncbigene3815 does not exist in Pubmed
The gene ncbigene5159 does not exist in Pubmed
The gene ncbigene5604 does not exist in Pubmed
The gene ncbigene5604 does not exist in Pubmed
The gene ncbigene5605 does not exist in Pubmed
The gene ncbigene5605 does not exist in Pubmed
The gene ncbigene673 does not exist in Pubmed
The gene ncbigene2475 does not exist in Pubmed
The gene ncbigene2322 does not exist in Pubmed


## Create a new dataframe, each row index is the kinase(ncbigene_id) and disease(mesh_id) and the columns represent a diff vector between the vector of kinase(ncbigene_id) and the vector of disease (mesh_id)

In [9]:
df_diff_kinase_mesh_pos_train = pd.DataFrame(diff_kinase_mesh_list_pos_train, index = diff_index_pos_train) 


In [10]:
df_diff_kinase_mesh_pos_train.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
"ncbigene1019,meshd001943",0.645031,-1.229598,0.449641,-0.951447,-0.139956,-1.422457,-0.128031,-0.256534,0.004652,-0.582405,...,0.435474,0.108583,2.182738,1.007717,0.186229,0.564442,0.752749,1.222828,-1.73625,0.988223
"ncbigene1021,meshd001943",0.718419,-1.39397,0.666648,-1.179012,-0.089089,-0.995026,-0.221107,-0.539914,0.330991,-0.315522,...,0.316588,0.174341,1.937052,0.833753,-0.344218,0.382732,0.285402,1.341599,-1.841794,0.702362
"ncbigene695,meshd007938",1.615346,0.096558,1.470859,-0.751554,0.681082,-0.280155,-0.336288,-0.059722,0.827442,0.486274,...,0.187167,-1.008859,1.099493,0.351412,-0.201364,-0.245184,-0.129842,-0.178331,-0.388366,0.605667
"ncbigene695,meshd015448",-0.48981,-0.371626,-0.144233,0.434399,0.692161,-0.055758,-0.569702,0.25207,0.364112,0.137557,...,0.460557,0.005423,-1.093264,-0.193959,0.455186,-0.824109,1.237214,-0.478984,-0.797055,-0.248794
"ncbigene695,meshd015451",0.778309,-0.35195,1.262915,-0.86522,0.765387,0.33966,0.055455,0.018941,1.204813,0.47561,...,0.056768,-1.344335,0.0583,0.048279,-0.383961,0.294819,0.220823,0.041617,-0.513644,0.421426


In [11]:
df_diff_kinase_mesh_pos_train.shape


(330, 200)

# Positive test data 
## Load the positive test (kinase_cancer links that are in 2020 but not in 2018)

In [12]:
pos_test = pd.read_csv("../prediction/output/pos_test.tsv",  sep= "\t")

In [13]:
pos_test.head()

Unnamed: 0,kinase,cancer
0,ncbigene2064,meshd055756
1,ncbigene3791,meshd016411
2,ncbigene3815,meshd016411
3,ncbigene5159,meshd016411
4,ncbigene3716,meshd015473


In [14]:
pos_test.shape

(11, 2)

## Calculate the difference between the kinases and mesh id of their corresponding diseases

In [15]:
diff_kinase_mesh_list_pos_test = []
diff_index_pos_test = []
for i in pos_test.index:
    ncbigene_id = pos_test.iloc[i][0]
    mesh_id = pos_test.iloc[i][1]
    if ncbigene_id in df.index:
        ncbigene_id_embedding = df.loc[ncbigene_id]
    else:
        print("The gene {} does not exist in Pubmed". format(ncbigene_id))
    if mesh_id in df.index:
        mesh_id_embedding = df.loc[mesh_id]
    else:
        print("The gene {} does not exist in Pubmed". format(ncbigene_id)) 
    if ncbigene_id in df.index and mesh_id in df.index:    
        diff_kinase_mesh = np.subtract(ncbigene_id_embedding, mesh_id_embedding)
        diff_kinase_mesh_list_pos_test.append(diff_kinase_mesh)
        diff_index_pos_test.append(ncbigene_id + "," + mesh_id)
        #print(diff_gene_mesh)

The gene ncbigene673 does not exist in Pubmed


## Create a new dataframe, each row index is the kinase(ncbigene_id) and disease(mesh_id) and the columns represent a diff vector between the vector of kinase(ncbigene_id) and the vector of disease (mesh_id)

In [16]:
df_diff_kinase_mesh_pos_test = pd.DataFrame(diff_kinase_mesh_list_pos_test, index = diff_index_pos_test) 

In [17]:
df_diff_kinase_mesh_pos_test.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
"ncbigene2064,meshd055756",0.473348,-0.031405,-0.198754,-0.236676,1.07346,-0.210041,-0.005317,-1.335178,-0.122004,-0.958364,...,-0.716877,-0.243717,0.25442,-0.339188,-0.159105,0.301588,0.245764,-0.902606,-0.344071,-0.879507
"ncbigene3791,meshd016411",0.466953,-0.839341,-1.513611,0.324279,1.174675,0.840499,-0.090144,-0.320323,0.71638,-0.015174,...,0.012606,-1.743181,-1.350277,-0.28355,0.42865,-0.289973,-0.922578,-0.13297,-0.546721,0.719477
"ncbigene3815,meshd016411",0.148374,-1.327092,-1.563375,0.478446,0.373743,0.622484,-0.491219,0.244073,0.841282,0.074475,...,-0.384615,-0.689007,-0.374499,-0.006718,0.164616,0.540523,-0.547015,-0.47836,-0.549634,0.529842
"ncbigene5159,meshd016411",0.005088,-1.029035,-1.66278,-0.214097,1.637076,1.388383,-0.448921,0.273737,0.687404,-0.253208,...,0.12512,-1.419705,-0.700991,-0.255343,-0.45254,0.352808,-0.579554,-0.195258,-0.853608,0.752421
"ncbigene3716,meshd015473",0.927395,0.493332,0.724157,-0.153922,0.328249,-0.56566,-0.514364,-0.749902,-0.029036,1.10481,...,-0.38611,-1.978491,0.163492,-0.536626,0.164676,0.037303,0.035557,-0.42093,-1.047781,1.011296


In [18]:
df_diff_kinase_mesh_pos_test.shape

(10, 200)

## Negative train data
## Read the negative training (kinase-cancer links that are not in 2018 and also are not in 2020)

In [19]:
neg_train = pd.read_csv("../prediction/output/neg_train.tsv",  sep= "\t")

In [20]:
neg_train.head()

Unnamed: 0,kinase,cancer
0,ncbigene25,meshd008545
1,ncbigene3815,meshd015464
2,ncbigene5159,meshd000070779
3,ncbigene2324,meshd015464
4,ncbigene3815,meshd018281


In [21]:
neg_train.shape

(340, 2)

## Calculate the difference between the kinases and mesh id of their corresponding diseases

In [22]:
diff_kinase_mesh_list_neg_train = []
diff_index_neg_train = []
for i in neg_train.index:
    ncbigene_id = neg_train.iloc[i][0]
    mesh_id = neg_train.iloc[i][1]
    if ncbigene_id in df.index:
        ncbigene_id_embedding = df.loc[ncbigene_id]
    else:
        print("The gene {} does not exist in Pubmed". format(ncbigene_id))
    if mesh_id in df.index:
        mesh_id_embedding = df.loc[mesh_id]
    else:
        print("The gene {} does not exist in Pubmed". format(ncbigene_id)) 
    if ncbigene_id in df.index and mesh_id in df.index:    
        diff_kinase_mesh = np.subtract(ncbigene_id_embedding, mesh_id_embedding)
        diff_kinase_mesh_list_neg_train.append(diff_kinase_mesh)
        diff_index_neg_train.append(ncbigene_id + "," + mesh_id)
        #print(diff_gene_mesh)

The gene ncbigene5159 does not exist in Pubmed
The gene ncbigene1956 does not exist in Pubmed
The gene ncbigene25 does not exist in Pubmed
The gene ncbigene2261 does not exist in Pubmed
The gene ncbigene25 does not exist in Pubmed
The gene ncbigene25 does not exist in Pubmed
The gene ncbigene673 does not exist in Pubmed
The gene ncbigene25 does not exist in Pubmed
The gene ncbigene2260 does not exist in Pubmed
The gene ncbigene6098 does not exist in Pubmed
The gene ncbigene5159 does not exist in Pubmed
The gene ncbigene5979 does not exist in Pubmed
The gene ncbigene25 does not exist in Pubmed
The gene ncbigene1956 does not exist in Pubmed
The gene ncbigene1956 does not exist in Pubmed
The gene ncbigene1432 does not exist in Pubmed


## Create a new dataframe, each row index is the kinase(ncbigene_id) and disease(mesh_id) and the columns represent a diff vector between the vector of kinase(ncbigene_id) and the vector of disease (mesh_id)

In [23]:
df_diff_kinase_mesh_neg_train = pd.DataFrame(diff_kinase_mesh_list_neg_train, index = diff_index_neg_train) 

In [24]:
df_diff_kinase_mesh_neg_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
"ncbigene25,meshd008545",-0.275223,-1.629256,-0.3913,-0.296422,0.389136,1.541877,-0.588302,-0.883064,0.816166,0.085749,...,0.530372,1.728075,-0.565419,1.019037,0.661705,2.16904,0.936319,0.836255,-1.319795,0.18802
"ncbigene3815,meshd015464",0.556873,-0.056583,1.124562,0.673023,-0.781627,-0.484396,-0.042291,0.078979,-0.319377,0.172815,...,-0.12868,-1.004372,1.114176,0.025132,-0.352083,-1.401584,-0.293069,-0.287004,0.120343,1.767555
"ncbigene5159,meshd000070779",-0.675464,-1.785326,-1.757368,0.446683,2.28565,0.403408,0.256836,-0.265196,0.718134,-0.993482,...,0.219028,-0.722577,-1.077595,-0.878194,0.673351,-0.435439,0.884169,-0.840237,0.08856,-0.496099
"ncbigene2324,meshd015464",0.008182,0.649565,1.923661,0.777387,-0.198684,-0.012191,0.028292,-0.496388,-0.411921,0.234437,...,0.236838,-2.843143,0.743395,0.03013,-0.919928,-1.41701,-1.337908,-0.232784,-0.012505,2.635339
"ncbigene3815,meshd018281",-0.735744,-1.439935,-1.056114,0.359917,0.23108,1.660611,0.610272,-0.639936,1.220372,0.032012,...,-1.04651,0.992446,-0.467213,0.648147,-0.415334,0.525473,1.26503,-0.428805,-0.397529,0.333515


In [25]:
df_diff_kinase_mesh_neg_train.shape

(324, 200)

## Negative test data
## Load negative test data (kinase-cancer links that are not in 2020)

In [26]:
neg_test = pd.read_csv("../prediction/output/neg_test.tsv",  sep= "\t")

In [27]:
neg_test.head()

Unnamed: 0,kinase,cancer
0,ncbigene2260,meshd000077195
1,ncbigene5894,meshd007889
2,ncbigene695,meshd001650
3,ncbigene5159,meshd017253
4,ncbigene3791,meshd007938


In [28]:
neg_test.shape

(11, 2)

## Calculate the difference between the kinases and mesh id of their corresponding diseases

In [29]:
diff_kinase_mesh_list_neg_test = []
diff_index_neg_test = []
for i in neg_test.index:
    ncbigene_id = neg_test.iloc[i][0]
    mesh_id = neg_test.iloc[i][1]
    if ncbigene_id in df.index:
        ncbigene_id_embedding = df.loc[ncbigene_id]
    else:
        print("The gene {} does not exist in Pubmed". format(ncbigene_id))
    if mesh_id in df.index:
        mesh_id_embedding = df.loc[mesh_id]
    else:
        print("The gene {} does not exist in Pubmed". format(ncbigene_id)) 
    if ncbigene_id in df.index and mesh_id in df.index:    
        diff_kinase_mesh = np.subtract(ncbigene_id_embedding, mesh_id_embedding)
        diff_kinase_mesh_list_neg_test.append(diff_kinase_mesh)
        diff_index_neg_test.append(ncbigene_id + "," + mesh_id)
        #print(diff_gene_mesh)

The gene ncbigene5159 does not exist in Pubmed


## Create a new dataframe, each row index is the kinase(ncbigene_id) and disease(mesh_id) and the columns represent a diff vector between the vector of kinase(ncbigene_id) and the vector of disease (mesh_id)

In [30]:
df_diff_kinase_mesh_neg_test = pd.DataFrame(diff_kinase_mesh_list_neg_test, index = diff_index_neg_test) 

In [31]:
df_diff_kinase_mesh_neg_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
"ncbigene2260,meshd000077195",0.413416,-1.838235,-0.034835,-1.188352,0.163444,0.241793,0.255841,0.020517,1.278495,0.869979,...,-0.287875,-0.061632,0.814795,0.683264,-0.098533,1.038488,0.924179,0.959332,-0.862425,1.125554
"ncbigene5894,meshd007889",-0.500058,-1.089374,-0.262553,-1.079296,0.142536,-0.099344,0.770053,-0.690886,0.621456,-0.093109,...,-0.167283,-0.716703,0.885172,-0.029516,-0.040384,0.944371,0.552219,-0.353673,-0.924916,-0.279773
"ncbigene695,meshd001650",-0.234268,-0.948104,-0.571462,-1.841508,0.599614,1.211595,1.199872,-0.181953,1.541596,0.193697,...,-0.828726,0.058841,0.340955,-0.5696,0.097112,0.8889,1.768819,-0.115492,-1.390665,-1.360657
"ncbigene3791,meshd007938",1.396197,-0.285446,0.804154,0.884791,0.903085,-0.531901,-0.531521,-0.948196,0.253742,-0.251284,...,0.439445,-1.696617,0.092841,0.441401,0.285139,-1.598563,-0.558137,0.093295,0.448481,1.883524
"ncbigene4233,meshd055752",0.740749,-1.650128,0.192176,-0.325912,0.706194,-0.734141,-0.046035,-1.177096,0.657244,-0.360282,...,0.180198,-0.290008,-0.145783,-0.314676,0.276578,0.500918,-0.334542,-0.547862,1.114566,0.722291


In [32]:
df_diff_kinase_mesh_neg_test.shape

(10, 200)

## Ceate trianing data by concatinating positive and negative test data

In [33]:
df_train = [df_diff_kinase_mesh_pos_train,df_diff_kinase_mesh_neg_train]
X_train = pd.concat(df_train)

## Create labels for training data (label 1 for positive, 0 for negative data)

In [34]:
label_1 = np.ones(df_diff_kinase_mesh_pos_train.shape[0])
label_0 = np.zeros(df_diff_kinase_mesh_neg_train.shape[0])
label_train = np.concatenate((label_1,label_0))
y_train = label_train

## Ceate test data by concatinating positive and negative test data

In [35]:
df_test = [df_diff_kinase_mesh_pos_test,df_diff_kinase_mesh_neg_test]
X_test = pd.concat(df_test)

## Create labels for test data (label 1 for positive, 0 for negative data)

In [36]:
label_1 = np.ones(df_diff_kinase_mesh_pos_test.shape[0])
label_0 = np.zeros(df_diff_kinase_mesh_neg_test.shape[0])
label_test = np.concatenate((label_1,label_0))
y_test = label_test

## Random Forest classifeir

In [37]:
clf=RandomForestClassifier(n_estimators=100)
clf.fit(X_train,y_train)

RandomForestClassifier()

In [38]:
y_pred=clf.predict(X_test)

In [39]:
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision (Positive Predictive Value):",metrics.precision_score(y_test, y_pred))
print("Recall (Sensitivity):",metrics.recall_score(y_test, y_pred))
print("f1_score:",metrics.f1_score(y_test, y_pred))


Accuracy: 0.75
Precision (Positive Predictive Value): 0.7777777777777778
Recall (Sensitivity): 0.7
f1_score: 0.7368421052631577


In [40]:
yproba = clf.predict_proba(X_test)[::,1]

In [41]:
fpr, tpr, _ = roc_curve(y_test,  yproba)
auc_test = roc_auc_score(y_test, yproba)

In [42]:
auc_test

0.775

In [43]:
#y_test

In [44]:
#y_pred

In [45]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
"ncbigene2064,meshd055756",0.473348,-0.031405,-0.198754,-0.236676,1.07346,-0.210041,-0.005317,-1.335178,-0.122004,-0.958364,...,-0.716877,-0.243717,0.25442,-0.339188,-0.159105,0.301588,0.245764,-0.902606,-0.344071,-0.879507
"ncbigene3791,meshd016411",0.466953,-0.839341,-1.513611,0.324279,1.174675,0.840499,-0.090144,-0.320323,0.71638,-0.015174,...,0.012606,-1.743181,-1.350277,-0.28355,0.42865,-0.289973,-0.922578,-0.13297,-0.546721,0.719477
"ncbigene3815,meshd016411",0.148374,-1.327092,-1.563375,0.478446,0.373743,0.622484,-0.491219,0.244073,0.841282,0.074475,...,-0.384615,-0.689007,-0.374499,-0.006718,0.164616,0.540523,-0.547015,-0.47836,-0.549634,0.529842
"ncbigene5159,meshd016411",0.005088,-1.029035,-1.66278,-0.214097,1.637076,1.388383,-0.448921,0.273737,0.687404,-0.253208,...,0.12512,-1.419705,-0.700991,-0.255343,-0.45254,0.352808,-0.579554,-0.195258,-0.853608,0.752421
"ncbigene3716,meshd015473",0.927395,0.493332,0.724157,-0.153922,0.328249,-0.56566,-0.514364,-0.749902,-0.029036,1.10481,...,-0.38611,-1.978491,0.163492,-0.536626,0.164676,0.037303,0.035557,-0.42093,-1.047781,1.011296
"ncbigene3717,meshd015473",0.253756,0.095241,0.223217,0.069438,0.253161,0.08405,-0.566948,-1.670377,-0.044205,0.167995,...,0.062858,-1.640578,-0.31954,-0.905041,0.758557,0.15969,0.072651,-1.176236,-0.974579,0.539104
"ncbigene2475,meshd007889",-0.736581,0.776767,0.78933,-0.08074,-0.331397,-1.362346,0.851257,-2.173742,-0.727902,-0.677367,...,0.203523,-0.893481,0.515852,0.193489,-0.140006,0.89781,0.405455,-0.153095,-0.549004,-1.42197
"ncbigene2475,meshd018231",-0.700751,0.54417,0.208452,0.414064,0.431868,-0.744727,0.829871,-2.809302,-0.98521,-1.152322,...,0.492772,-0.4345,0.115818,0.004773,0.669765,1.03194,0.346311,-0.379027,-0.024906,-1.361534
"ncbigene2475,meshd047708",-0.809665,1.241298,-0.605224,1.416535,1.382623,-1.656786,1.339292,-1.931746,-0.634407,-1.061892,...,1.355092,-0.639846,-0.244064,-0.038651,0.714315,0.390026,0.516655,-1.56415,0.808537,-1.834245
"ncbigene2475,meshd018329",-1.296096,2.064562,-0.43417,1.841447,1.408997,-2.607832,1.524788,-2.486873,-0.306497,-1.323663,...,1.434099,-0.841582,-0.237399,-0.296321,1.861626,0.309186,0.760351,-0.538127,1.937584,-2.6724


In [46]:
y_test

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [47]:
y_pred

array([0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 1.])

In [48]:
yproba

array([0.46, 0.46, 0.58, 0.6 , 0.31, 0.55, 0.58, 0.55, 0.56, 0.61, 0.48,
       0.47, 0.36, 0.47, 0.19, 0.6 , 0.41, 0.03, 0.16, 0.51])