In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datetime import datetime
import numpy as np
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

# Modeling for link prediction

This notebook is built on top of the *ndl_dataprep_dataloader.ipynb*.

The aim of this notebook is to use a machine-learning approach, based on node and graph features, to predict edges in a graph.

## Data preparation

In [2]:
# Read the train / validation / test edge tables, in that order.
edge_csv_paths = (
    "../data/train_edge_df.csv",
    "../data/val_edge_df.csv",
    "../data/test_edge_df.csv",
)
train_edge_df, val_edge_df, test_edge_df = map(pd.read_csv, edge_csv_paths)

In [3]:
def changeToDatetime(df, col):
    """Replace a timestamp-string column of ``df`` with POSIX seconds, in place.

    The strings must follow the format ``%Y-%m-%d %H:%M:%S.%f``. They are
    parsed as naive timestamps and converted to float seconds since
    1970-01-01 00:00:00, treating the naive values as UTC. The conversion is
    vectorised and does not depend on the timezone of the machine running the
    notebook (mapping ``datetime.timestamp`` over naive values would interpret
    them as local time).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place.
    col : str
        Name of the column holding the timestamp strings.

    Returns
    -------
    None
        The result is written back to ``df[col]``.

    Raises
    ------
    ValueError
        Raised by ``pd.to_datetime`` when a value does not match the format
        (e.g. when the column was already converted by an earlier call).
    """
    parsed = pd.to_datetime(df[col], format='%Y-%m-%d %H:%M:%S.%f')
    # Timedelta division yields float seconds; missing timestamps become NaN.
    df[col] = (parsed - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)
    
# Convert both timestamp columns of every split to numeric values (in place).
for edge_df in (train_edge_df, val_edge_df, test_edge_df):
    for time_col in ("UserCreatedAt", "ArticlePublishingDate"):
        changeToDatetime(edge_df, time_col)

In [4]:
# Tag every row with the split it came from, then stack the three splits so
# they can be transformed together and separated again afterwards.
for split_name, split_df in (
    ("train", train_edge_df),
    ("val", val_edge_df),
    ("test", test_edge_df),
):
    split_df["TrainTestVal"] = split_name

total_df = pd.concat([train_edge_df, val_edge_df, test_edge_df])
total_df.head()

Unnamed: 0,user,article,Existing_edge,UserCreatedAt,centrality_user,degree_user,ArticlePublishingDate,ArticleChannel,ArticleRessortName,centrality_article,degree_article,TrainTestVal
0,u-680585,a-2000104039148,True,1537714000.0,0.00226,86,1559133000.0,International,Großbritannien,0.015058,573,train
1,u-531534,a-2000102849817,True,1298671000.0,0.000447,17,1557465000.0,Wirtschaft,Unternehmen,0.00862,328,train
2,u-180735,a-2000104009137,True,978874300.0,0.001077,41,1559131000.0,Meinung,Kommentare Pro und Kontra,0.057998,2207,train
3,u-519767,a-2000103849476,True,1373657000.0,0.003574,136,1558889000.0,International,Deutschland,0.022653,862,train
4,u-676496,a-2000104084140,True,1531130000.0,0.000972,37,1559212000.0,Web,Netzpolitik,0.047907,1823,train


In [5]:
# One-hot encode the categorical article columns on the combined frame, so
# that all three splits end up with the same set of indicator columns.
categorical_columns = ["ArticleChannel", "ArticleRessortName"]
total_df = pd.get_dummies(total_df, columns=categorical_columns)

In [6]:
# Recover the three splits from the jointly encoded frame via the split marker.
split_marker = total_df["TrainTestVal"]
train_edge_df = total_df[split_marker == "train"]
val_edge_df = total_df[split_marker == "val"]
test_edge_df = total_df[split_marker == "test"]

In [8]:
# The label, the node identifiers and the split marker are not model inputs.
non_feature_columns = ["Existing_edge", "user", "article", "TrainTestVal"]

X_train, y_train = train_edge_df.drop(columns=non_feature_columns), train_edge_df["Existing_edge"]
X_val, y_val = val_edge_df.drop(columns=non_feature_columns), val_edge_df["Existing_edge"]
X_test, y_test = test_edge_df.drop(columns=non_feature_columns), test_edge_df["Existing_edge"]

## RandomForestClassifier using all the available features

In [10]:
# Model definition: baseline random forest on all available features.
# The seed is fixed so that bootstrap sampling and feature sub-sampling — and
# therefore the reported metrics — are reproducible across kernel restarts.
rf = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)

rf.fit(X_train, y_train)

In [11]:
# Predict the edge label on every split with the fitted baseline forest.
y_pred_train, y_pred_val, y_pred_test = (
    rf.predict(features) for features in (X_train, X_val, X_test)
)


In [13]:
# Metric table for the baseline forest: one row per metric, one column per split.
metric_fns = {
    "recall": recall_score,
    "precision": precision_score,
    "accuracy": accuracy_score,
    "f1": f1_score,
}
split_predictions = {
    "Train": (y_train, y_pred_train),
    "Validation": (y_val, y_pred_val),
    "Test": (y_test, y_pred_test),
}
results_all = pd.DataFrame(
    {
        split: [metric(y_true, y_pred) for metric in metric_fns.values()]
        for split, (y_true, y_pred) in split_predictions.items()
    },
    index=list(metric_fns),
)
results_all

Unnamed: 0,Train,Validation,Test
recall,0.818847,0.818588,0.817828
precision,0.760589,0.723567,0.725899
accuracy,0.780549,0.752927,0.754507
f1,0.788644,0.76815,0.769127


In [19]:
# Persist the baseline metric table next to the input data.
results_all.to_csv(path_or_buf="../data/results_all.csv")

### Fine tuning the parameters using GridSearch

In [22]:
# Exhaustive search over forest size and tree depth (3 x 3 = 9 candidates),
# scored by F1 with stratified 5-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 7, 9],
}
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Use a seeded estimator: with an unseeded forest the per-fold scores are
# noisy, so the selected parameters could change from one run to the next.
# n_estimators and max_depth are overridden by the grid for every candidate.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
)
grid_search.fit(X_train, y_train)

In [26]:
# Pull the winning hyperparameters out of the finished grid search.
rf_best_params = grid_search.best_params_
best_n_estimators, best_max_depth = (
    rf_best_params[key] for key in ("n_estimators", "max_depth")
)
print(f"The best n_estimators is {best_n_estimators} and the best max_depth is {best_max_depth}")

The best n_estimators is 100 and the best max_depth is 9


The grid search took over 135 minutes. The tuned parameters are the following: `n_estimators` stayed at the default of 100 and `max_depth` is 9.

In [24]:
# Forest with the hyperparameters selected by the grid search; the seed is
# fixed so the reported metrics are reproducible.
rf_tuned = RandomForestClassifier(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    random_state=42,
)

In [27]:
# Train the tuned forest on the full training split.
rf_tuned.fit(X=X_train, y=y_train)

In [28]:
# Predict the edge label on every split with the tuned forest.
y_pred_train_tuned, y_pred_val_tuned, y_pred_test_tuned = (
    rf_tuned.predict(features) for features in (X_train, X_val, X_test)
)

In [29]:
# Metric table for the tuned forest: one row per metric, one column per split.
metric_fns = {
    "recall": recall_score,
    "precision": precision_score,
    "accuracy": accuracy_score,
    "f1": f1_score,
}
split_predictions_tuned = {
    "Train": (y_train, y_pred_train_tuned),
    "Validation": (y_val, y_pred_val_tuned),
    "Test": (y_test, y_pred_test_tuned),
}
results_tuned = pd.DataFrame(
    {
        split: [metric(y_true, y_pred) for metric in metric_fns.values()]
        for split, (y_true, y_pred) in split_predictions_tuned.items()
    },
    index=list(metric_fns),
)
results_tuned

Unnamed: 0,Train,Validation,Test
recall,0.819738,0.819482,0.818689
precision,0.760032,0.723093,0.725375
accuracy,0.780459,0.752832,0.754368
f1,0.788756,0.768276,0.769212


In [39]:
# Persist the tuned-forest metric table next to the input data.
results_tuned.to_csv(path_or_buf="../data/results_tuned.csv")

## RandomForestClassifier restricted features

In the previous predictions, the degree and the centrality of the nodes were used. However, these graph metrics were calculated on a graph that was built using not just the train, but also the validation and test datasets. Therefore, some data leakage might affect the previous results.

Therefore, in the next step we consider ignoring the degree and centrality information of the nodes.

In [30]:
# On top of the non-feature columns, drop the graph metrics of both node types.
graph_metric_columns = ["degree_user", "centrality_user", "degree_article", "centrality_article"]
restricted_drop_columns = ["Existing_edge", "user", "article", "TrainTestVal"] + graph_metric_columns

X_train_restricted, y_train = train_edge_df.drop(columns=restricted_drop_columns), train_edge_df["Existing_edge"]
X_val_restricted, y_val = val_edge_df.drop(columns=restricted_drop_columns), val_edge_df["Existing_edge"]
X_test_restricted, y_test = test_edge_df.drop(columns=restricted_drop_columns), test_edge_df["Existing_edge"]

In [31]:
# Model definition: same tuned hyperparameters as before, but trained on the
# restricted feature set (no degree / centrality columns).
# The seed is fixed so the reported metrics are reproducible.
rf_restricted = RandomForestClassifier(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    random_state=42,
)

rf_restricted.fit(X_train_restricted, y_train)

In [32]:
# Predict the edge label on every split with the restricted-feature forest.
y_pred_train_restricted, y_pred_val_restricted, y_pred_test_restricted = (
    rf_restricted.predict(features)
    for features in (X_train_restricted, X_val_restricted, X_test_restricted)
)

In [33]:
# Metric table for the restricted-feature forest: one row per metric, one column per split.
metric_fns = {
    "recall": recall_score,
    "precision": precision_score,
    "accuracy": accuracy_score,
    "f1": f1_score,
}
split_predictions_restricted = {
    "Train": (y_train, y_pred_train_restricted),
    "Validation": (y_val, y_pred_val_restricted),
    "Test": (y_test, y_pred_test_restricted),
}
results_restricted = pd.DataFrame(
    {
        split: [metric(y_true, y_pred) for metric in metric_fns.values()]
        for split, (y_true, y_pred) in split_predictions_restricted.items()
    },
    index=list(metric_fns),
)
results_restricted

Unnamed: 0,Train,Validation,Test
recall,0.503064,0.503697,0.504558
precision,0.712486,0.682525,0.68635
accuracy,0.650029,0.634702,0.636992
f1,0.589734,0.579632,0.581578


In [34]:
# Persist the restricted-feature metric table next to the input data.
results_restricted.to_csv(path_or_buf="../data/results_restricted.csv")

## Support Vector Machine
Finally, we would like to test the usage of an alternative model, namely the SVM.

In [38]:
# Candidate SVC hyperparameters (3 x 2 x 3 = 18 combinations).
# Note: gamma has no effect for the linear kernel.
param_grid_random = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.1, 0.01, 0.001],
}

# Create an SVC instance
svc = SVC()

# Create a StratifiedKFold cross-validator (you can adjust the number of folds as needed)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create a RandomizedSearchCV object. It evaluates only a random subset of the
# combinations (n_iter defaults to 10), so the seed is fixed to make the
# sampled candidates — and hence the selected parameters — reproducible.
random_search = RandomizedSearchCV(
    estimator=svc,
    param_distributions=param_grid_random,
    scoring='f1',
    cv=cv,
    random_state=42,
)

# Fit the randomized search to the data.
# NOTE(review): SVC fit time grows at least quadratically with the number of
# samples, so this may be impractical on the full training split — consider
# fitting on a subsample.
random_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Pull the winning SVC hyperparameters out of the finished randomized search.
svc_best_params = random_search.best_params_
best_C, best_kernel, best_gamma = (
    svc_best_params[key] for key in ("C", "kernel", "gamma")
)
print(f"The best C is {best_C}, the best kernel is {best_kernel} and the best gamma is {best_gamma}")

In [10]:
# Train an SVC on the training split. The variant using the searched
# hyperparameters is kept below but disabled; the active model uses
# scikit-learn's default settings instead.
# scv_tuned = SVC(C=best_C, kernel=best_kernel, gamma=best_gamma)
scv_tuned = SVC()
scv_tuned.fit(X=X_train, y=y_train)

MemoryError: Unable to allocate 7.65 GiB for an array with shape (717, 1432312) and data type float64

In [None]:
# Predict the edge label on every split with the fitted SVC.
y_pred_train_svc, y_pred_val_svc, y_pred_test_svc = (
    scv_tuned.predict(features) for features in (X_train, X_val, X_test)
)

In [None]:
# Metric table for the SVC: one row per metric, one column per split.
metric_fns = {
    "recall": recall_score,
    "precision": precision_score,
    "accuracy": accuracy_score,
    "f1": f1_score,
}
split_predictions_svc = {
    "Train": (y_train, y_pred_train_svc),
    "Validation": (y_val, y_pred_val_svc),
    "Test": (y_test, y_pred_test_svc),
}
results_svc = pd.DataFrame(
    {
        split: [metric(y_true, y_pred) for metric in metric_fns.values()]
        for split, (y_true, y_pred) in split_predictions_svc.items()
    },
    index=list(metric_fns),
)

In [None]:
# Persist the SVC metric table next to the input data, then display it.
results_svc.to_csv(path_or_buf="../data/results_svc.csv")
results_svc