# Random forest predictions
This notebook ingests the positive and negative training vectors as well as the prediction set as generated by the notebook ``ExtractDifferenceVectors.ipynb``. It then performs Random forest learning and ranks the prediction set.

In [None]:
import pandas as pd
import os
import sys
import numpy as np
sys.path.insert(0, os.path.abspath('../..'))
from kcet import KcetParser
import pickle5 as pickle


In [None]:

# Locations of the pickled difference vectors, relative to this notebook.
prediction_pickle_path = "predictions.pkl"
positive_diff_pickle_path = "positive-vectors.pkl"
negative_diff_pickle_path = "negative-vectors.pkl"


def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, "rb") as handle:
        return pickle.load(handle)


diff_vectors_prediction = _unpickle(prediction_pickle_path)
diff_vectors_pos = _unpickle(positive_diff_pickle_path)
diff_vectors_neg = _unpickle(negative_diff_pickle_path)

In [None]:
# Preview the first five rows of the prediction set
diff_vectors_prediction.head(n=5)

In [None]:
# Preview the first five rows of the positive training vectors
diff_vectors_pos.head(n=5)

In [None]:
# Preview the first five rows of the negative training vectors
diff_vectors_neg.head(n=5)

# Random Forest
### 1. Training set
Create the training set by concatenating ``diff_vectors_pos`` and ``diff_vectors_neg``.

In [None]:
# Stack the positives first, then the negatives; the label array is built in the same order.
training_frames = [diff_vectors_pos, diff_vectors_neg]
X_train = pd.concat(training_frames)
print("Total training vectors: %d" % X_train.shape[0])

In [None]:
# One label per training row: 1 for every positive vector, 0 for every negative vector,
# positives first and negatives second.
label_1 = np.ones(len(diff_vectors_pos))
label_0 = np.zeros(len(diff_vectors_neg))
y_train = np.concatenate([label_1, label_0])
print("Total training labels: %d" % y_train.shape[0])

### 2. Test set. 
The test set is the prediction set with one label (either 0 or 1)

In [None]:
# The prediction set is used as the test set (same object, not a copy).
X_test = diff_vectors_prediction
# Every test row gets the label 1; y_test and label_test refer to the same array.
y_test = label_test = np.ones(len(X_test))

In [None]:
# Preview the first five rows of the test set
X_test.head(n=5)

## Hyperparameter tuning the random forest.

In [None]:
# Candidate hyperparameter values for the randomized search.

# Number of trees in the random forest: 100, 200, ..., 500
n_estimators = [int(x) for x in np.linspace(start=100, stop=500, num=5)]
# Number of features to consider at every split.
# 'auto' is not offered: for classifiers it was only an alias of 'sqrt' (so the
# grid listed the same setting twice) and recent scikit-learn releases reject it.
max_features = ['sqrt', 'log2']
# Maximum number of levels in a tree: 10, 20, ..., 50, plus None (no depth limit)
max_depth = [int(x) for x in np.linspace(10, 50, num=5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5, 7, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid (keys are RandomForestClassifier parameter names)
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

## Search over the parameters to choose the best model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Seed the forest itself as well: the random_state given to RandomizedSearchCV
# only governs which parameter combinations are sampled from the grid, not how
# each forest is grown, so an unseeded classifier gives different results per run.
rf = RandomForestClassifier(random_state=42)
# Try 5 sampled parameter combinations, each evaluated with 10-fold cross-validation
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=5, cv=10, verbose=2, random_state=42)

rf_random.fit(X_train, y_train)

# Estimator with the best-scoring sampled parameters
best_model = rf_random.best_estimator_

In [None]:
# Show the hyperparameter combination chosen by the randomized search
rf_random.best_params_


# Link prediction

In [None]:
# Hard class labels for the prediction set
y_pred = best_model.predict(X_test)
# Keep only the second column of the class-probability matrix (label 1, the positive class)
class_probabilities = best_model.predict_proba(X_test)
yproba = class_probabilities[:, 1]

## Decoding results
The ``KcetParser`` class has methods that take the raw X_test vectors and create an annotated dataframe by
decoding strings like ``ncbigene5599-meshd000074723`` to show the corresponding gene symbols and MeSH labels (neoplasms),
and also placing the probabilities of the predictions in the corresponding rows. The resulting dataframe
is sorted according to probability. The ``deleteEmbeddings`` argument determines whether we only return the
three columns ``gene_symbol1``, ``cancer``, and ``probability``.

In [None]:
from kcet import KcetParser
# Decode the raw test vectors into an annotated table that carries the predicted probabilities
kcetParser = KcetParser()
predictions = kcetParser.decode_predictions(
    vectors=X_test,
    probabilities=yproba,
    deleteEmbeddings=True,
)
# Show the first 20 rows
predictions.head(20)

In [None]:
# Number of rows in the decoded prediction table
print("Total number of predictions:", predictions.shape[0])

In [None]:
# Write the ranked predictions as a tab-separated file, without the index column
predictions.to_csv("predictions_novel.tsv", sep="\t", index=False)

## Probability distributions of positive and negative examples
Here, we plot the distributions of the probabilities of the positive and negative examples as calculated by the random forest.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline  
import seaborn as sns

We concatenated the vectors as follows. ``X_train = pd.concat([diff_vectors_pos,diff_vectors_neg])``
Therefore, we can extract the individual predictions as follows

In [None]:
# Accumulators for the probabilities of the two groups
pos_probs, neg_probs = [], []
# Row counts of the positive training vectors and of the decoded prediction table
n_pos = diff_vectors_pos.shape[0]
n_predictions = predictions.shape[0]
print("[INFO] Extracting %d positive predictions from a total of %d" % (n_pos, n_predictions))

In [None]:
# Score the training vectors themselves. `predictions` holds the decoded prediction
# set sorted by probability, so cutting it at n_pos would merely separate the
# highest-scoring test rows from the rest. X_train, by contrast, is the positive
# vectors followed by the negative vectors, so position n_pos is the true boundary.
train_probs = best_model.predict_proba(X_train)[:, 1]
pos_probs = [float(pr) for pr in train_probs[:n_pos]]
neg_probs = [float(pr) for pr in train_probs[n_pos:]]
# sanity check
print("[INFO] Got %d positive and %d negative predictions" % (len(pos_probs), len(neg_probs)))

In [None]:

posnp, negnp = np.array(pos_probs), np.array(neg_probs)
# Draw both densities on the same axes: positives in red, negatives in blue
for group_probs, group_color in ((posnp, "r"), (negnp, "b")):
    p1 = sns.kdeplot(data=group_probs, shade=True, color=group_color)



The positive examples are shown in red and the negative examples in blue. The separation is of course
an expected result. 
We can calculate some values to estimate some threshold probabilities for predictions.

In [None]:
# Lower tail of the positive group: how low do scores of known positives go?
print("Minimum prob, positive group:", np.min(posnp))
print("Maximum prob, positive group:", np.max(posnp))
# Label and percentile come from the same pair, so they cannot drift apart
# (the "10th percentile" line previously reported the 20th percentile).
for pct, ordinal in ((1, "1st"), (5, "5th"), (10, "10th"), (20, "20th")):
    print("prob at %s percentile, positive group:" % ordinal, np.percentile(posnp, pct))
print()
# Upper tail of the negative group: how high do scores of known negatives go?
print("Minimum prob, negative group:", np.min(negnp))
print("Maximum prob, negative group:", np.max(negnp))
for pct in (99, 95, 90, 80):
    print("prob at %dth percentile, negative group:" % pct, np.percentile(negnp, pct))

## Probability distribution of prediction scores:

In [None]:
# Collect the score of each of the first n_predictions rows as a plain Python float
probility = [float(pr) for pr in predictions["probability"].iloc[:n_predictions]]

In [None]:
# Density of all prediction scores, wrapped in a named Series for plotting.
# pandas is already imported at the top of the notebook, so it is not re-imported here.
scores = np.array(probility)
x = pd.Series(scores, name="Prediction score")
ax = sns.kdeplot(x, shade=True, color="b")