# Random Forest for link prediction

This notebook is built on top of the *ndl_dataprep_dataloader.ipynb*.

The aim of the notebook to use Machine Learning approach based on node and graph features to predict edges in a graph.

In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from datetime import datetime
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

## Datapreparation

In [2]:
train_edge_df = pd.read_csv("../data/train_edge_df.csv")
val_edge_df = pd.read_csv("../data/val_edge_df.csv")
test_edge_df = pd.read_csv("../data/test_edge_df.csv")

In [3]:
def changeToDatetime(df, col):
    df[col] = pd.to_datetime(df[col], format='%Y-%m-%d %H:%M:%S.%f').map(datetime.timestamp)
    
changeToDatetime(train_edge_df, "UserCreatedAt")
changeToDatetime(train_edge_df, "ArticlePublishingDate")
changeToDatetime(val_edge_df, "UserCreatedAt")
changeToDatetime(val_edge_df, "ArticlePublishingDate")
changeToDatetime(test_edge_df, "UserCreatedAt")
changeToDatetime(test_edge_df, "ArticlePublishingDate")

In [4]:
train_edge_df["TrainTestVal"] = "train"
val_edge_df["TrainTestVal"] = "val"
test_edge_df["TrainTestVal"] = "test"
total_df = pd.concat([train_edge_df, val_edge_df, test_edge_df])
total_df.head()

Unnamed: 0,user,article,Existing_edge,UserCreatedAt,centrality_user,degree_user,ArticlePublishingDate,ArticleChannel,ArticleRessortName,centrality_article,degree_article,TrainTestVal
0,u-680585,a-2000104039148,True,1537714000.0,0.00226,86,1559133000.0,International,Großbritannien,0.015058,573,train
1,u-531534,a-2000102849817,True,1298671000.0,0.000447,17,1557465000.0,Wirtschaft,Unternehmen,0.00862,328,train
2,u-180735,a-2000104009137,True,978874300.0,0.001077,41,1559131000.0,Meinung,Kommentare Pro und Kontra,0.057998,2207,train
3,u-519767,a-2000103849476,True,1373657000.0,0.003574,136,1558889000.0,International,Deutschland,0.022653,862,train
4,u-676496,a-2000104084140,True,1531130000.0,0.000972,37,1559212000.0,Web,Netzpolitik,0.047907,1823,train


In [5]:
# one hot encode the categorical variables
total_df = pd.get_dummies(total_df, columns=["ArticleChannel", "ArticleRessortName"])

In [6]:
train_edge_df = total_df[total_df["TrainTestVal"] == "train"]
val_edge_df = total_df[total_df["TrainTestVal"] == "val"]
test_edge_df = total_df[total_df["TrainTestVal"] == "test"]

In [7]:
X_train = train_edge_df.drop(["Existing_edge", "user", "article", "TrainTestVal"], axis=1)
y_train = train_edge_df["Existing_edge"]

X_val = val_edge_df.drop(["Existing_edge", "user", "article", "TrainTestVal"], axis=1)
y_val = val_edge_df["Existing_edge"]

X_test = test_edge_df.drop(["Existing_edge", "user", "article", "TrainTestVal"], axis=1)
y_test = test_edge_df["Existing_edge"]

## RandomForestClassifier using all the available features

This model is suited to predict future links within the current graph. It is not very well suited for newly added data, such as new articles or users.

In [18]:
# model definition

rf = RandomForestClassifier(n_estimators=100, max_depth=7)

rf.fit(X_train, y_train)

In [19]:
# The error was after training 
y_pred_train = rf.predict(X_train)
y_pred_val = rf.predict(X_val)
y_pred_test = rf.predict(X_test)


In [20]:
results_all = pd.DataFrame({
    "Train": [recall_score(y_train, y_pred_train), precision_score(y_train, y_pred_train), accuracy_score(y_train, y_pred_train), f1_score(y_train, y_pred_train), roc_auc_score(y_train, y_pred_train)],
    "Validation": [recall_score(y_val, y_pred_val), precision_score(y_val, y_pred_val), accuracy_score(y_val, y_pred_val), f1_score(y_val, y_pred_val), roc_auc_score(y_val, y_pred_val)],
    "Test": [recall_score(y_test, y_pred_test), precision_score(y_test, y_pred_test), accuracy_score(y_test, y_pred_test), f1_score(y_test, y_pred_test), roc_auc_score(y_test, y_pred_test)]
}, index = ["recall", "precision", "accuracy", "f1", "roc_auc"])
results_all

Unnamed: 0,Train,Validation,Test
recall,0.818047,0.817717,0.817013
precision,0.760904,0.723991,0.726342
accuracy,0.780498,0.752988,0.754597
f1,0.788441,0.768005,0.769014
roc_auc,0.780498,0.752988,0.754597


In [27]:
results_all.to_csv("../data/results_all.csv")

### Fine tuning the parameters using GridSearch

In [22]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 7, 9],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='f1', cv=cv)
grid_search.fit(X_train, y_train)

In [26]:
best_n_estimators = grid_search.best_params_["n_estimators"]
best_max_depth = grid_search.best_params_["max_depth"]
print(f"The best n_estimators is {best_n_estimators} and the best max_depth is {best_max_depth}")

The best n_estimators is 100 and the best max_depth is 9


The GridSearch took over 135 minutes and the tuned parameters are the following: the n_estimator stayed the default 100 and the max_depth is is9.

In [21]:
rf_tuned = RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth)

In [22]:
rf_tuned.fit(X_train, y_train)

In [23]:
# The error was after training 
y_pred_train_tuned = rf_tuned.predict(X_train)
y_pred_val_tuned = rf_tuned.predict(X_val)
y_pred_test_tuned = rf_tuned.predict(X_test)

In [25]:
results_tuned = pd.DataFrame({
    "Train": [recall_score(y_train, y_pred_train_tuned), precision_score(y_train, y_pred_train_tuned), accuracy_score(y_train, y_pred_train_tuned), f1_score(y_train, y_pred_train_tuned), roc_auc_score(y_train, y_pred_train_tuned)],
    "Validation": [recall_score(y_val, y_pred_val_tuned), precision_score(y_val, y_pred_val_tuned), accuracy_score(y_val, y_pred_val_tuned), f1_score(y_val, y_pred_val_tuned), roc_auc_score(y_val, y_pred_val_tuned)],
    "Test": [recall_score(y_test, y_pred_test_tuned), precision_score(y_test, y_pred_test_tuned), accuracy_score(y_test, y_pred_test_tuned), f1_score(y_test, y_pred_test_tuned), roc_auc_score(y_test, y_pred_test_tuned)]
}, index = ["recall", "precision", "accuracy", "f1", "roc_auc"])
results_tuned

Unnamed: 0,Train,Validation,Test
recall,0.818773,0.818599,0.817817
precision,0.760578,0.723599,0.725976
accuracy,0.780516,0.752955,0.754563
f1,0.788603,0.768173,0.769164
roc_auc,0.780516,0.752955,0.754563


In [26]:
results_tuned.to_csv("../data/results_tuned.csv")

## RandomForestClassifier restricted features

In the previous predictions the degree and the centraility of the nodes were used. However, these graph metrics were calculated on a graph, that was built using not just the train, but also the validation and test datasets. Therefore, some sort of data leakage might effects the previous results.

Therefore, in the next step we would like to consider the option of ignoring the information about degree and centraility of the nodes.

By restricting the features, we also aim to create a model, that is also able to handle new added datapoint, vertices in the data. This corresponds to a task, when a new article is written or a new user registers.


In [8]:
X_train_restricted = train_edge_df.drop(["Existing_edge", "user", "article", "TrainTestVal", "degree_user", "centrality_user", "degree_article", "centrality_article"], axis=1)
y_train = train_edge_df["Existing_edge"]

X_val_restricted = val_edge_df.drop(["Existing_edge", "user", "article", "TrainTestVal", "degree_user", "centrality_user", "degree_article", "centrality_article"], axis=1)
y_val = val_edge_df["Existing_edge"]

X_test_restricted = test_edge_df.drop(["Existing_edge", "user", "article", "TrainTestVal", "degree_user", "centrality_user", "degree_article", "centrality_article"], axis=1)
y_test = test_edge_df["Existing_edge"]

In [9]:
# model definition

rf_restricted = RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth)

rf_restricted.fit(X_train_restricted, y_train)

In [10]:
# The error was after training 
y_pred_train_restricted = rf_restricted.predict(X_train_restricted)
y_pred_val_restricted = rf_restricted.predict(X_val_restricted)
y_pred_test_restricted = rf_restricted.predict(X_test_restricted)

In [12]:
results_restricted = pd.DataFrame({
    "Train": [recall_score(y_train, y_pred_train_restricted), precision_score(y_train, y_pred_train_restricted), accuracy_score(y_train, y_pred_train_restricted), f1_score(y_train, y_pred_train_restricted), roc_auc_score(y_train, y_pred_train_restricted)],
    "Validation": [recall_score(y_val, y_pred_val_restricted), precision_score(y_val, y_pred_val_restricted), accuracy_score(y_val, y_pred_val_restricted), f1_score(y_val, y_pred_val_restricted), roc_auc_score(y_val, y_pred_val_restricted)],
    "Test": [recall_score(y_test, y_pred_test_restricted), precision_score(y_test, y_pred_test_restricted), accuracy_score(y_test, y_pred_test_restricted), f1_score(y_test, y_pred_test_restricted), roc_auc_score(y_test, y_pred_test_restricted)]
}, index = ["recall", "precision", "accuracy", "f1", "roc_auc"])
results_restricted

Unnamed: 0,Train,Validation,Test
recall,0.496675,0.498023,0.497833
precision,0.70942,0.679774,0.683968
accuracy,0.646618,0.631708,0.633903
f1,0.584285,0.574875,0.576242
roc_auc,0.646618,0.631708,0.633903


In [13]:
results_restricted.to_csv("../data/results_restricted.csv")

## Other approaches

Finally, we aimed to realize the prediction using alternative machine learning approaches: SVM, KNN and a neural network model.
This experiments didn't conclude in a reasonable time, hence without metrics we decided the exclude these approaches.


