# 2010 Predictions
Here we show how we did the analysis of the predictions stemming from the 2010 data as described in the results section of the manuscript.
To obtain the data sources needed to run this notebook, we need to download files from the project zenodo repository. See the ``novelPredictions`` notebook for details.

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report, precision_recall_curve,precision_score, recall_score
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline  
import pickle5 as pickle
sys.path.insert(0, os.path.abspath('..'))
from kcet import KcetDatasetGenerator, KcetRandomForest

In [2]:
download_dir = input()

 /home/peter/data/pubmed2vec


In [3]:
ctfile = os.path.join(download_dir, "clinical_trials_by_phase.tsv")
embeddings = os.path.join(download_dir, "embedding_SG_dim100_upto2010.npy")
words =  os.path.join(download_dir, "words_SG_upto2010.txt")
if not os.path.isfile(ctfile):
    raise FileNotFoundError("Could not find clinical trials file at %s" % ctfile)
if not os.path.isfile(embeddings):
    raise FileNotFoundError("Could not find 2010 embeddings file at %s" % embeddings)
if not os.path.isfile(words):
    raise FileNotFoundError("Could not find 2010 words file at %s" % words)  

In [4]:
target_year = 2021 # predict up to the current date from 2010 training data
dsGen = KcetDatasetGenerator(clinical_trials=ctfile, embeddings=embeddings, words=words)

In [6]:
positive_training_df, negative_training_df, prediction_df = dsGen.get_data_for_novel_prediction(current_year=target_year)

Links to be extracted: 364356
364000/364356 links extracted (99.90%)

In [7]:
prediction_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
ncbigene23552-meshd000008,-0.217223,0.490787,-2.042665,-1.289988,-0.198051,0.737649,-0.384552,0.833863,-0.450686,0.26054,...,-0.039075,-0.085217,0.498679,-0.553233,0.199341,-1.953522,-0.677114,-1.073103,-0.751753,0.823563
ncbigene23552-meshd000069293,0.290043,0.654584,-2.288178,-1.489262,-0.227077,-0.910877,-0.53734,1.470565,-0.556857,0.665697,...,0.030262,-0.283573,0.549824,-0.394967,-0.043532,-3.293932,-1.72904,-1.077649,-0.557444,2.081578
ncbigene23552-meshd000069584,0.424299,0.384561,-1.326206,0.073784,0.326066,0.827036,-0.496367,0.464412,-0.134337,1.297405,...,-0.125096,0.475281,0.204715,-0.084721,0.588929,-1.249988,-1.250404,-0.913004,0.193563,0.454355
ncbigene23552-meshd000070779,0.090112,-0.040345,-2.070226,-1.873836,-0.48652,-0.346408,-0.477409,0.836036,-0.600439,0.62732,...,-0.527004,-0.199858,0.503023,-0.272177,-0.16627,-2.409542,-1.257589,-1.238372,0.099822,0.787023
ncbigene23552-meshd000071380,-0.178663,0.489075,-2.540536,-2.032222,0.162103,-0.53837,-0.685261,1.475151,-1.077893,0.010556,...,-0.867509,-0.685637,0.735764,-0.759815,-0.200893,-2.313584,-1.863076,-1.145918,-0.961654,1.536148


In [2]:
positive_pickle_path = "pos_train_vectors2010.pkl"
negative_pickle_path = "neg_train_vectors2010.pkl"
pred_pickle_path = "pred_vectors2010.pkl"
if os.path.isfile(positive_pickle_path) and os.path.isfile(negative_pickle_path) and os.path.isfile(pred_pickle_path):
    print("loading vectors from file")
    with open(positive_pickle_path, "rb") as f:
         positive_training_df = pickle.load(f)
    with open(negative_pickle_path, "rb") as f:
        negative_training_df = pickle.load(f)
    with open(pred_pickle_path, "rb") as f:
        prediction_df = pickle.load(f)
else:
    print("storing vectors to file")
    positive_training_df.to_pickle(positive_pickle_path)
    negative_training_df.to_pickle(negative_pickle_path)
    prediction_df.to_pickle(pred_pickle_path)

loading vectors from file


# Random forest classification
Use all available information to get a classifier from the 2010 data

In [3]:
X_train = positive_training_df.append(negative_training_df)
print("[INFO] positive (n=%d) and negative (n=%d) examples" %  (len(positive_training_df), len(negative_training_df)))
label_1 = np.ones(positive_training_df.shape[0])
label_0 = np.zeros(negative_training_df.shape[0])
y_train = np.concatenate((label_1,label_0))
print("Total training labels: %d" % len(y_train))

[INFO] positive (n=538) and negative (n=5380) examples
Total training labels: 5918


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

n_estimators = [100, 200, 300, 400, 500]
max_features = ['auto', 'sqrt']
max_depth = [10, 20, 30, 40, 50, None]
min_samples_split = [2, 3, 5, 7, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 1, cv = 10, random_state=42)

rf_random.fit(X_train,y_train)

best_model = rf_random.best_estimator_

In [5]:
y_pred_train = best_model.predict(X_train)
y_prob_train = best_model.predict_proba(X_train)[::,1]
fpr, tpr, thresholds = roc_curve(y_train, y_prob_train)
from numpy import sqrt,argmax
gmeans = sqrt(tpr * (1-fpr))
ix = argmax(gmeans)
opt_thres = thresholds[ix]
print('Best threshold=%f' % opt_thres)

Best threshold=0.372797


In [6]:
y_pred = best_model.predict(prediction_df)
y_prob = best_model.predict_proba(prediction_df)[::,1]
from kcet import KcetParser
kcetParser = KcetParser()
predictions = kcetParser.decode_predictions(vectors=prediction_df, probabilities=y_prob, deleteEmbeddings=True)
predictions.head()

Unnamed: 0,gene_symbol,cancer,probability
ncbigene2260-meshd008175,FGFR1,Lung Neoplasms,0.97
ncbigene2065-meshd008175,ERBB3,Lung Neoplasms,0.965
ncbigene8805-meshd002289,TRIM24,"Carcinoma, Non-Small-Cell Lung",0.96
ncbigene5156-meshd008639,PDGFRA,Mesenteric Cyst,0.95
ncbigene3791-meshd008639,KDR,Mesenteric Cyst,0.945


# Neurotrophic Tyrosine Receptor Kinase gene family
Manual inspection of the predictions from the 2010 data reveal multiple interactions between members of the Neurotrophic Tyrosine Receptor Kinase gene family (NTRK1, NTRK2, NTRK3) and several forms of cancer (See manuscript for details).
renal cell carcinoma (NTRK1, NTRK2, NTRK3), hepatocellular carcinoma (NTRK1, NTRK3), breast neoplasms (NTRK1, NTRK3), Lung Neoplasms (NTRK1), and NTRK1 - Gastrointestinal Neoplasms (NTRK1) in addition to predictions for Leukemia (NTRK1, NTRK3), whereby the prediction probability is shown after the gene.

In [10]:
pred100 = predictions.head(100)
ntrk = pred100.gene_symbol.str.contains('^NTRK')

In [11]:
ntrk_preds = pred100[ntrk]
print(len(ntrk_preds))
pred100[pred100['probability'] > opt_thres]

0


Unnamed: 0,gene_symbol,cancer,probability
ncbigene2260-meshd008175,FGFR1,Lung Neoplasms,0.970
ncbigene2065-meshd008175,ERBB3,Lung Neoplasms,0.965
ncbigene8805-meshd002289,TRIM24,"Carcinoma, Non-Small-Cell Lung",0.960
ncbigene5156-meshd008639,PDGFRA,Mesenteric Cyst,0.950
ncbigene3791-meshd008639,KDR,Mesenteric Cyst,0.945
...,...,...,...
ncbigene56924-meshd002289,PAK6,"Carcinoma, Non-Small-Cell Lung",0.870
ncbigene7046-meshd008175,TGFBR1,Lung Neoplasms,0.870
ncbigene10188-meshd002289,TNK2,"Carcinoma, Non-Small-Cell Lung",0.870
ncbigene6259-meshd002289,RYK,"Carcinoma, Non-Small-Cell Lung",0.870


In [12]:
percentage = len(ntrk_preds)*100 / len(pred100)
print(percentage)

0.0


In [36]:
negative_training_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
ncbigene5681-meshd000237,0.678124,-0.049444,0.445158,0.254593,-0.363792,0.823899,1.196874,-0.328953,0.845313,0.311532,...,0.016563,1.000482,-0.125399,1.209987,0.929905,1.041014,0.917641,-0.930744,1.10056,-0.527321
ncbigene10155-meshd007952,0.473492,-0.246558,-0.272521,-0.709817,-0.304952,-0.3792,-0.379449,0.393178,-0.237118,0.220109,...,0.373258,-0.082442,-0.157558,-0.204445,-0.362843,-0.449895,-0.02228,-1.001432,0.198821,0.815863
ncbigene5587-meshd049309,0.86743,-0.089011,-1.687493,-1.509478,-0.053796,-0.245332,-0.494862,0.33299,-0.923994,-0.044364,...,-0.773408,-0.429434,-0.012806,-1.194564,-0.991906,-0.477748,-0.709604,-1.046854,-0.282202,0.398167
ncbigene558-meshd020863,2.087421,-0.388455,0.517079,-1.466516,-0.988681,0.658571,-0.556,0.192118,-0.207558,-1.147877,...,-0.510682,1.270222,0.030799,0.882645,0.162313,1.62813,-0.663849,-0.981421,1.291975,-0.88339
ncbigene51755-meshd005706,-0.053386,-1.437351,0.842594,1.241333,-1.27329,3.748821,0.497103,-0.721069,3.7224,0.665423,...,1.370118,2.193417,-1.810255,2.780808,2.017361,1.479193,0.963892,-0.830117,0.332109,-0.214187


In [37]:
prediction_df.head()

Unnamed: 0,gene_symbol,cancer,probability,gene_symbol.1,cancer.1,probability.1,gene_symbol.2,cancer.2,probability.2,gene_symbol.3,...,90,91,92,93,94,95,96,97,98,99
ncbigene23552-meshd000008,CDK20,Abdominal Neoplasms,0.005875,CDK20,Abdominal Neoplasms,0.005875,CDK20,Abdominal Neoplasms,0.005875,CDK20,...,-0.039075,-0.085217,0.498679,-0.553233,0.199341,-1.953522,-0.677114,-1.073103,-0.751753,0.823563
ncbigene23552-meshd000069293,CDK20,Plasmablastic Lymphoma,0.002543,CDK20,Plasmablastic Lymphoma,0.002543,CDK20,Plasmablastic Lymphoma,0.002543,CDK20,...,0.030262,-0.283573,0.549824,-0.394967,-0.043532,-3.293932,-1.72904,-1.077649,-0.557444,2.081578
ncbigene23552-meshd000069584,CDK20,Unilateral Breast Neoplasms,0.03167,CDK20,Unilateral Breast Neoplasms,0.03167,CDK20,Unilateral Breast Neoplasms,0.03167,CDK20,...,-0.125096,0.475281,0.204715,-0.084721,0.588929,-1.249988,-1.250404,-0.913004,0.193563,0.454355
ncbigene23552-meshd000070779,CDK20,Giant Cell Tumor of Tendon Sheath,0.007254,CDK20,Giant Cell Tumor of Tendon Sheath,0.007254,CDK20,Giant Cell Tumor of Tendon Sheath,0.007254,CDK20,...,-0.527004,-0.199858,0.503023,-0.272177,-0.16627,-2.409542,-1.257589,-1.238372,0.099822,0.787023
ncbigene23552-meshd000071380,CDK20,"Fibromatosis, Plantar",0.00086,CDK20,"Fibromatosis, Plantar",0.00086,CDK20,"Fibromatosis, Plantar",0.00086,CDK20,...,-0.867509,-0.685637,0.735764,-0.759815,-0.200893,-2.313584,-1.863076,-1.145918,-0.961654,1.536148
