# Random forest predictions
This notebook ingests the positive and negative training vectors as well as the prediction set as generated by the notebook ExtractDifferenceVectors.ipynb. It then perfoms Random forest learning and ranks the prediction set.

In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
data_directory = 'data'
if not os.path.isdir(data_directory):
    raise FileNotFoundError("Could not find data directory")
prediction_pickle_path = os.path.join(data_directory, "predictions.pkl")
positive_diff_pickle_path = os.path.join(data_directory, "positive-vectors.pkl")
negative_diff_pickle_path = os.path.join(data_directory, "negative-vectors.pkl")
diff_vectors_prediction = pd.read_pickle(prediction_pickle_path)
diff_vectors_pos = pd.read_pickle(positive_diff_pickle_path)
diff_vectors_neg = pd.read_pickle(negative_diff_pickle_path)

In [3]:
diff_vectors_prediction.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
ncbigene23552-meshd000008,1.22572,2.219805,-1.159192,-0.913386,1.242244,0.004557,-0.557847,-0.904537,-0.13532,0.972034,...,-1.242775,0.026413,-0.094516,-0.482394,0.212582,0.301373,-0.230201,0.222967,-0.262096,-0.199647
ncbigene23552-meshd000069293,0.728423,0.710524,-0.54786,-0.670773,0.928276,-0.44225,-1.206818,-1.240632,0.583069,0.921462,...,-1.901131,0.327931,0.20708,-1.047767,-0.375792,1.227519,-0.334099,0.439315,0.878214,-0.887344
ncbigene23552-meshd000069584,0.704195,2.368731,-0.721249,-0.975288,1.571413,0.523479,-1.412002,-1.116571,-0.460344,0.193849,...,-0.84025,0.282295,-0.871796,-1.058958,0.402296,0.42884,-0.754313,0.306938,0.330441,0.46229
ncbigene23552-meshd000070779,0.912242,2.347372,-0.902853,-0.862418,-0.174117,-0.02516,-0.054016,-0.790153,-0.085269,1.285809,...,-0.897832,-0.262245,-0.09231,-1.144869,-0.269114,1.008527,-0.859037,0.939902,1.008549,-0.443027
ncbigene23552-meshd000071380,2.146233,1.374384,-0.688913,0.034746,1.313125,0.79784,-1.370878,-0.273002,1.072562,1.761501,...,-1.722041,-0.544126,-0.546446,-0.807326,0.847525,1.17818,-0.027069,0.244723,0.51521,0.377675


In [4]:
diff_vectors_pos.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
ncbigene1956-meshd002289,-0.658227,1.228249,-0.556157,-0.571083,-0.107215,0.426374,0.671859,0.527287,-0.252452,1.403538,...,-1.924055,0.002242,0.494866,-0.138703,0.016181,-1.415368,0.792387,-0.136395,-0.910239,1.851647
ncbigene2064-meshd002289,-1.328087,-0.204334,-1.170295,-1.143805,-0.218962,0.010136,1.089562,1.061568,-0.133885,1.831318,...,-0.472625,-0.264935,0.50144,-0.21732,0.255144,-1.773365,1.888879,-0.58648,-1.102478,0.539014
ncbigene1956-meshd002294,0.791421,2.460823,-0.351139,-0.875576,1.956512,0.327444,-0.054564,0.171105,-0.975969,2.903467,...,-0.02711,-0.01276,1.70709,-0.364082,-0.049707,-1.025946,1.70025,0.968731,0.29231,0.467492
ncbigene2064-meshd002294,0.121561,1.02824,-0.965277,-1.448299,1.844765,-0.088794,0.363139,0.705387,-0.857402,3.331247,...,1.42432,-0.279938,1.713664,-0.442699,0.189256,-1.383944,2.796743,0.518645,0.100071,-0.845141
ncbigene1956-meshd008175,0.275792,2.281943,-0.222486,-0.896599,1.665686,0.00706,0.389148,-0.372689,-0.387075,2.311301,...,-2.37192,0.063723,1.340364,-0.423427,0.160679,-0.418997,1.445924,-0.183771,-2.022009,1.958475


In [5]:
diff_vectors_neg.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
ncbigene10641-meshd007972,-1.444506,1.66028,-0.139696,-1.878792,-0.227101,-0.59747,1.017172,-1.016495,-1.798462,-0.48296,...,1.309722,1.559093,1.585517,-0.352169,0.036647,0.335446,-0.870959,0.243426,-0.22555,-0.600007
ncbigene283455-meshd004416,-3.002347,1.953215,-0.589901,-2.35814,-0.619326,-0.572878,1.876017,-1.135545,-2.720654,0.354074,...,2.674644,2.077172,0.583725,0.431914,0.421701,-0.54228,-1.994261,0.446271,-1.536367,-1.087888
ncbigene389840-meshd018275,-3.5361,2.658201,0.268346,-2.93181,-0.619421,-0.405464,3.005201,-2.053312,-4.07479,-1.107726,...,2.268621,2.11608,1.541182,-0.269708,-0.443933,-0.614615,-2.555185,1.175516,-1.883217,0.276424
ncbigene79072-meshd002295,-4.308005,1.777995,-0.564299,-4.428386,-1.181217,-1.116484,2.817219,-1.97482,-3.896451,-1.552033,...,1.443576,1.322417,1.70003,-0.274958,-1.222435,-1.946449,-2.163091,0.827324,-1.040396,-1.293284
ncbigene2065-meshd018293,1.385457,0.649354,-0.94848,-0.751665,-0.093749,-0.658072,1.907968,0.348456,-0.213274,1.846046,...,-1.086175,-0.541409,0.719002,-0.061235,1.545214,-0.276902,1.756059,0.291862,-0.286253,-0.300837


# Random Forest
## Create the training set by concatinaning diff_vectors_pos and diff_vectors_neg

In [6]:
train_data = [diff_vectors_pos,diff_vectors_neg]
X_train = pd.concat(train_data)

In [7]:
label_1 = np.ones(diff_vectors_pos.shape[0])
label_0 = np.zeros(diff_vectors_neg.shape[0])
label_train = np.concatenate((label_1,label_0))
y_train = label_train

## Create the test set. 
### Test set is the prediction set with one label (either 0 or 1)

In [8]:
X_test = diff_vectors_prediction
label_test = np.ones(diff_vectors_prediction.shape[0])
y_test = label_test

## Parameters of random forest that are used to find the best model.
#### Here, I used only n_estimators (number of trees) but we may add more parameters like min_samples_leaf or max_depth, too.

In [9]:
param_grid = {
                 'n_estimators': [10, 20, 50, 100, 150, 200],
                # 'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
                # 'max_depth' : np.linspace(1, 10, 5, endpoint=True),
             }

## Search over the parameters to choose the best model

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

clf = RandomForestClassifier()
grid_clf = GridSearchCV(clf, param_grid, cv=10)
grid_clf.fit(X_train,y_train)

best_model = grid_clf.best_estimator_

# Use the best model of the random forest to predict the links between kinases and cancers in test set

In [11]:
y_pred = best_model.predict(X_test)
yproba = best_model.predict_proba(X_test)[::,1]

In [12]:
## Create a ncbigene to gene symbol map

In [13]:
from collections import defaultdict

disease_mesh = pd.read_csv("../input/neoplasms_labels.tsv",  sep= "\t", header = None)
meshid2disease_map = defaultdict()
for i in disease_mesh.index:
        mesh = disease_mesh.iloc[i][0]
        mesh_first_letter = mesh[0].lower()
        mesh_id = "mesh" + mesh_first_letter + mesh[1:]
        disease = disease_mesh.iloc[i][1]
        meshid2disease_map[mesh_id] = disease

## Create a mesh_id to cancer map

In [14]:
kinase_gene_id = pd.read_csv("../input/prot_kinase.tsv",  sep= "\t", header = None)
ncbigene2symbol_map = defaultdict()
for i in kinase_gene_id.index:
    gene_symbol = kinase_gene_id.iloc[i][0]
    ncbigene = kinase_gene_id.iloc[i][2]
    ncbigene_id = "ncbigene" + str(ncbigene)
    ncbigene2symbol_map[ncbigene_id] = gene_symbol

## Get the gene symbols and cancer names of the test (prediction) set

In [15]:
gene_symbol_list = []
cancer_list = []
for vec in X_test.index:
    fields = vec.split("-")
    ncbi_gene = fields[0]
    mesh_cancer = fields[1]
    #print(mesh_cancer)
    gene_symbol = ncbigene2symbol_map[ncbi_gene]
    gene_symbol_list.append(gene_symbol)
    cancer = meshid2disease_map[mesh_cancer]
    cancer_list.append(cancer)

## Add three columns to the test (prediction) set: gene_symbol, cancer and probability

In [16]:
X_test.insert(0,"gene_symbol", gene_symbol_list, True)
X_test.insert(1,"cancer", cancer_list, True)
X_test.insert(2,"probability",yproba, True)

In [17]:
X_test.head()

Unnamed: 0,gene_symbol,cancer,probability,0,1,2,3,4,5,6,...,90,91,92,93,94,95,96,97,98,99
ncbigene23552-meshd000008,CDK20,Abdominal Neoplasms,0.066667,1.22572,2.219805,-1.159192,-0.913386,1.242244,0.004557,-0.557847,...,-1.242775,0.026413,-0.094516,-0.482394,0.212582,0.301373,-0.230201,0.222967,-0.262096,-0.199647
ncbigene23552-meshd000069293,CDK20,Plasmablastic Lymphoma,0.053333,0.728423,0.710524,-0.54786,-0.670773,0.928276,-0.44225,-1.206818,...,-1.901131,0.327931,0.20708,-1.047767,-0.375792,1.227519,-0.334099,0.439315,0.878214,-0.887344
ncbigene23552-meshd000069584,CDK20,Unilateral Breast Neoplasms,0.166667,0.704195,2.368731,-0.721249,-0.975288,1.571413,0.523479,-1.412002,...,-0.84025,0.282295,-0.871796,-1.058958,0.402296,0.42884,-0.754313,0.306938,0.330441,0.46229
ncbigene23552-meshd000070779,CDK20,Giant Cell Tumor of Tendon Sheath,0.026667,0.912242,2.347372,-0.902853,-0.862418,-0.174117,-0.02516,-0.054016,...,-0.897832,-0.262245,-0.09231,-1.144869,-0.269114,1.008527,-0.859037,0.939902,1.008549,-0.443027
ncbigene23552-meshd000071380,CDK20,"Fibromatosis, Plantar",0.02,2.146233,1.374384,-0.688913,0.034746,1.313125,0.79784,-1.370878,...,-1.722041,-0.544126,-0.546446,-0.807326,0.847525,1.17818,-0.027069,0.244723,0.51521,0.377675


In [18]:
sorted_X_test = X_test.sort_values(by=['probability'],ascending=False)
sorted_X_test.head()

Unnamed: 0,gene_symbol,cancer,probability,0,1,2,3,4,5,6,...,90,91,92,93,94,95,96,97,98,99
ncbigene3815-meshd008175,KIT,Lung Neoplasms,0.906667,0.288533,2.507522,0.448143,-1.294022,2.052601,0.675892,1.517713,...,-1.042771,0.222485,2.924645,-0.231092,-0.622109,-0.862323,1.772026,-1.527972,-3.161463,2.481583
ncbigene3815-meshd015451,KIT,"Leukemia, Lymphocytic, Chronic, B-Cell",0.866667,0.047748,-1.133677,-0.127944,-1.607289,2.63993,0.514353,0.339744,...,-0.7223,0.166669,1.097318,-0.322751,0.03446,0.074204,0.819606,-0.808408,0.126436,0.780186
ncbigene3815-meshd001943,KIT,Breast Neoplasms,0.86,0.058277,4.557331,1.115827,-0.500111,1.830377,1.010869,-0.254133,...,-1.298499,0.330733,0.897668,-0.069067,-0.638238,-0.184169,-0.554205,-0.859058,-1.834632,2.88399
ncbigene4233-meshd008175,MET,Lung Neoplasms,0.86,-0.60935,1.494694,-0.36656,-2.052619,1.76229,0.635862,1.609924,...,-2.083422,-0.070844,1.282095,-0.294274,0.115311,-0.548996,0.986837,-1.008263,-2.448243,1.988456
ncbigene5156-meshd010039,PDGFRA,Otorhinolaryngologic Neoplasms,0.84,4.629136,-0.012162,-0.134491,2.670988,2.390139,1.377803,-2.766031,...,-1.812804,-1.761475,-0.814851,-0.167754,0.193421,1.638709,2.697381,-1.144037,0.662207,1.897674


In [19]:
predictions = sorted_X_test[["gene_symbol","cancer","probability"]]
predictions.head(n=20)

Unnamed: 0,gene_symbol,cancer,probability
ncbigene3815-meshd008175,KIT,Lung Neoplasms,0.906667
ncbigene3815-meshd015451,KIT,"Leukemia, Lymphocytic, Chronic, B-Cell",0.866667
ncbigene3815-meshd001943,KIT,Breast Neoplasms,0.86
ncbigene4233-meshd008175,MET,Lung Neoplasms,0.86
ncbigene5156-meshd010039,PDGFRA,Otorhinolaryngologic Neoplasms,0.84
ncbigene5156-meshd018228,PDGFRA,"Sarcoma, Small Cell",0.826667
ncbigene3815-meshd007938,KIT,Leukemia,0.826667
ncbigene5156-meshd008339,PDGFRA,Mandibular Neoplasms,0.82
ncbigene3815-meshd006689,KIT,Hodgkin Disease,0.82
ncbigene5156-meshd002578,PDGFRA,Uterine Cervical Dysplasia,0.82


In [20]:
predictions.to_csv("predictions_2021.tsv",index=False,sep="\t")