# Random forest predictions
This notebook ingests the positive and negative training vectors as well as the prediction set as generated by the notebook ``ExtractDifferenceVectors.ipynb``. It then performs random forest learning and ranks the prediction set.

In [None]:
import pandas as pd
import os
import sys
import numpy as np
# Put the parent of the current working directory on the import path so that
# the project-local ``kcet`` package can be imported from this notebook.
sys.path.insert(0, os.path.abspath('..'))
from kcet import KcetParser
from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report
from sklearn.metrics import precision_recall_curve
# ``plot_precision_recall_curve`` was deprecated in scikit-learn 1.0 and removed
# in 1.2. Importing it unconditionally makes this first cell (and therefore the
# whole notebook) fail on current releases, so the import is guarded.
try:
    from sklearn.metrics import plot_precision_recall_curve
except ImportError:
    plot_precision_recall_curve = None
from sklearn import metrics

In [None]:
# Directory that holds the pickled difference vectors; abort early when it is missing.
data_directory = 'data'
if not os.path.isdir(data_directory):
    raise FileNotFoundError("Could not find data directory")

# NOTE: pd.read_pickle unpickles arbitrary objects -- only load files you generated yourself.

# Validation vectors (positive, then negative).
positive_validation_pickle_path = os.path.join(data_directory, "positive-valid-vectors.pkl")
diff_vectors_pos_validation = pd.read_pickle(positive_validation_pickle_path)
negative_validation_pickle_path = os.path.join(data_directory, "negative-valid-vectors.pkl")
diff_vectors_neg_validation = pd.read_pickle(negative_validation_pickle_path)

# Training vectors (positive, then negative).
positive_train_pickle_path = os.path.join(data_directory, "positive-train-vectors.pkl")
diff_vectors_pos_training = pd.read_pickle(positive_train_pickle_path)
negative_train_pickle_path = os.path.join(data_directory, "negative-train-vectors.pkl")
diff_vectors_neg_training = pd.read_pickle(negative_train_pickle_path)

In [None]:
# Preview the first rows of the positive validation vectors.
diff_vectors_pos_validation.head()

In [None]:
# Dimensions of the positive validation vectors.
diff_vectors_pos_validation.shape

In [None]:
# Preview the first rows of the negative validation vectors.
diff_vectors_neg_validation.head()

In [None]:
# Preview the first rows of the positive training vectors.
diff_vectors_pos_training.head()

In [None]:
# Preview the first rows of the negative training vectors.
diff_vectors_neg_training.head()

# Random Forest
### 1. Training set
Create the training set by concatenating ``diff_vectors_pos_training`` and ``diff_vectors_neg_training``.

In [None]:
# Training matrix: positive rows first, then negative rows.
# The label vector for the training set is built in the same order.
X_train = pd.concat([diff_vectors_pos_training, diff_vectors_neg_training], axis=0)
print("Total training vectors: %d" % X_train.shape[0])

In [None]:
# Training labels: 1 for every positive vector, 0 for every negative vector,
# concatenated in the same order as the rows of X_train.
label_1 = np.ones(len(diff_vectors_pos_training))
label_0 = np.zeros(len(diff_vectors_neg_training))
y_train = np.concatenate([label_1, label_0])
print("Total training labels: %d" % y_train.shape[0])

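### 2. Test set
Create the test set by concatenating ``diff_vectors_pos_validation`` and ``diff_vectors_neg_validation``.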

In [None]:
# Test matrix: positive validation rows first, then negative validation rows.
X_test = pd.concat([diff_vectors_pos_validation, diff_vectors_neg_validation], axis=0)
print("Total test vectors: %d" % X_test.shape[0])

In [None]:
# Test labels: 1 for every positive validation vector, 0 for every negative one,
# concatenated in the same order as the rows of X_test.
# Note that this rebinds label_1 / label_0, which previously held the training labels.
label_1 = np.ones(len(diff_vectors_pos_validation))
label_0 = np.zeros(len(diff_vectors_neg_validation))
y_test = np.concatenate([label_1, label_0])
print("Total test labels: %d" % y_test.shape[0])

In [None]:
# Preview the first rows of the combined test set.
X_test.head()

## Hyperparameter tuning the random forest.

In [None]:
# Candidate values for each random-forest hyperparameter; the randomized search
# samples combinations from these lists.

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 5)]
# Number of features to consider at every split.
# 'auto' is intentionally not listed: for RandomForestClassifier it was an alias
# of 'sqrt' (so it only duplicated that candidate), it was deprecated in
# scikit-learn 1.1 and is rejected from 1.3 onwards. 'log2' is a genuinely
# different alternative.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until the leaves are pure or
# hold fewer than min_samples_split samples)
max_depth = [int(x) for x in np.linspace(10, 50, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5, 7, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

## Search over the parameters to choose the best model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Seed the forest itself. ``random_state`` on RandomizedSearchCV only fixes which
# parameter combinations are sampled; without a seed on the estimator the
# bootstrap/feature sampling inside every forest differs between runs, so the
# selected parameters and ``best_model`` would not be reproducible.
rf = RandomForestClassifier(random_state=42)

# Randomized search: 20 sampled parameter combinations, each evaluated with
# 10-fold cross-validation, using all available cores.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=20,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

rf_random.fit(X_train, y_train)

# Estimator refitted on the full training set with the best parameters found.
best_model = rf_random.best_estimator_

In [None]:
# Parameter combination selected by the randomized search.
rf_random.best_params_


# Link prediction

In [None]:
# Score the test set with the tuned forest.
# y_pred holds the hard class labels; yproba holds the second column of
# predict_proba, i.e. the probability assigned to the positive class (label 1).
y_pred = best_model.predict(X_test)
class_probabilities = best_model.predict_proba(X_test)
yproba = class_probabilities[:, 1]
del class_probabilities

In [None]:
# Per-class precision / recall / F1 of the hard predictions on the test set.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of true versus predicted labels on the test set.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# ROC operating points (thresholds are discarded) and area under the ROC curve,
# both computed from the positive-class probabilities on the test set.
fpr, tpr, _ = metrics.roc_curve(y_test, yproba)
auc_test = metrics.roc_auc_score(y_test, yproba)

In [None]:
# Display the ROC AUC obtained on the test set.
auc_test

In [None]:
# ROC curve of the tuned model on the test set.
# ``metrics.plot_roc_curve`` was deprecated in scikit-learn 1.0 and removed in 1.2;
# ``RocCurveDisplay.from_estimator`` is its documented replacement
# (requires scikit-learn >= 1.0). The trailing ``;`` hides the object repr.
metrics.RocCurveDisplay.from_estimator(best_model, X_test, y_test);

In [None]:
# Precision-recall curve of the tuned model on the test set.
# ``metrics.plot_precision_recall_curve`` was deprecated in scikit-learn 1.0 and
# removed in 1.2; ``PrecisionRecallDisplay.from_estimator`` is its documented
# replacement (requires scikit-learn >= 1.0). The trailing ``;`` hides the object repr.
metrics.PrecisionRecallDisplay.from_estimator(best_model, X_test, y_test);

## Decoding results
The ``KcetParser`` class has methods that take the raw X_test vectors and create an annotated dataframe by
decoding strings like ``ncbigene5599-meshd000074723`` to show the corresponding gene symbols and MeSH labels (neoplasms),
and also placing the probabilities of the predictions in the corresponding rows. The resulting dataframe
is sorted according to probability. The ``deleteEmbeddings`` argument determines whether we only return the
three columns ``gene_symbol1``, ``cancer``, and ``probability``.

In [None]:
from kcet import KcetParser

# Decode the test vectors into an annotated dataframe that carries the
# predicted positive-class probability for each row.
kcetParser = KcetParser()
predictions = kcetParser.decode_predictions(
    vectors=X_test,
    probabilities=yproba,
    deleteEmbeddings=True,
)
predictions.head()

In [None]:
# Number of rows in the decoded prediction table.
print("Total number of predictions:", predictions.shape[0])

In [None]:
# Persist the decoded predictions as a tab-separated file without the index column.
predictions.to_csv(path_or_buf="predictions_2015.tsv", sep="\t", index=False)