# Imports

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
import plotly.graph_objects as go
import numpy as np

# Load the data files

In [None]:
# Read the train and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

# Evaluation output per feature set, keyed by a short feature-set label.
results = dict()

# Baseline - Pairwise features & node features

## Create the train test split

In [None]:
# Baseline feature set: every column except the label (`link`), the four
# columns listed explicitly below, and the 14 source/destination role columns.
role_columns = [f"{side}_role_{idx}" for side in ("source", "destination") for idx in range(7)]
excluded_columns = ["link", "timestamp", "source", "destination", "graph_key", *role_columns]

X_train = train.drop(columns=excluded_columns)
y_train = train["link"]
X_test = test.drop(columns=excluded_columns)
y_test = test["link"]

In [None]:
# Fit a random forest on the current X_train / y_train split.
# A fixed random_state makes the forest's bootstrap sampling and feature
# sub-sampling repeatable, so the curve and its area are the same after every
# Restart & Run All instead of changing from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Score the test rows with the second column of predict_proba, which is the
# probability of clf.classes_[1].
y_proba = clf.predict_proba(X_test)[:, 1]

# precision_recall_curve returns one more precision/recall value than it
# returns thresholds; the trailing recall value has no matching threshold.
precisions, recall, threshold = precision_recall_curve(y_test, y_proba)

# Drop that trailing recall so recall and threshold line up one-to-one.
results["PF & NF"] = {
    'threshold': threshold,
    'recall': recall[:-1]
}

# Pairwise & node features & roles

## Create the train test split

In [None]:
# Full feature set: every column except the label (`link`) and the other
# four columns listed below, so the role columns stay in.
non_feature_columns = ["link", "timestamp", "source", "destination", "graph_key"]

X_train = train.drop(columns=non_feature_columns)
y_train = train["link"]
X_test = test.drop(columns=non_feature_columns)
y_test = test["link"]

In [None]:
# Fit a random forest on the current X_train / y_train split.
# A fixed random_state makes the forest's bootstrap sampling and feature
# sub-sampling repeatable, so the curve and its area are the same after every
# Restart & Run All instead of changing from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Score the test rows with the second column of predict_proba, which is the
# probability of clf.classes_[1].
y_proba = clf.predict_proba(X_test)[:, 1]

# precision_recall_curve returns one more precision/recall value than it
# returns thresholds; the trailing recall value has no matching threshold.
precisions, recall, threshold = precision_recall_curve(y_test, y_proba)

# Drop that trailing recall so recall and threshold line up one-to-one.
results["PF & NF & R"] = {
    'threshold': threshold,
    'recall': recall[:-1]
}


# Roles

## Create the train test split

In [None]:
# Keep only the 14 source/destination role columns as features; the label is
# the `link` column.
role_columns = [f"{side}_role_{idx}" for side in ("source", "destination") for idx in range(7)]

# Each frame drops its OWN non-role columns. Filtering `test` with
# `train.columns` raises a KeyError when `test` lacks one of train's columns
# and silently keeps any column that exists only in `test`.
X_train = train.drop(columns=train.columns.difference(role_columns))
y_train = train["link"]
X_test = test.drop(columns=test.columns.difference(role_columns))
y_test = test["link"]

In [None]:
# Fit a random forest on the role-only split.
# A fixed random_state makes the forest's bootstrap sampling and feature
# sub-sampling repeatable, so the results are the same after every
# Restart & Run All instead of changing from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Hard class predictions are stored in the results entry as well, because the
# recall-over-time cell reads results["R"]["y_pred"].
y_pred = clf.predict(X_test)

# Probability of clf.classes_[1]; computed once (the original cell repeated
# the identical predict_proba call).
y_proba = clf.predict_proba(X_test)[:, 1]

# precision_recall_curve returns one more precision/recall value than it
# returns thresholds; the trailing recall value has no matching threshold.
precisions, recall, threshold = precision_recall_curve(y_test, y_proba)

# Drop that trailing recall so recall and threshold line up one-to-one.
results["R"] = {
    'threshold': threshold,
    'recall': recall[:-1],
    'y_pred': y_pred
}

#  Pair features

## Create the train test split

In [None]:
# Keep only the pair-feature columns (`jaccard`, `sorensen`) as features; the
# label is the `link` column.
pair_columns = ['jaccard', 'sorensen']

# Each frame drops its OWN remaining columns. Filtering `test` with
# `train.columns` raises a KeyError when `test` lacks one of train's columns
# and silently keeps any column that exists only in `test`.
X_train = train.drop(columns=train.columns.difference(pair_columns))
y_train = train["link"]
X_test = test.drop(columns=test.columns.difference(pair_columns))
y_test = test["link"]

In [None]:
# Fit a random forest on the pair-feature split.
# A fixed random_state makes the forest's bootstrap sampling and feature
# sub-sampling repeatable, so the curve and its area are the same after every
# Restart & Run All instead of changing from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Probability of clf.classes_[1]; computed once. The hard predictions from
# clf.predict were never stored or read for this feature set, so that call
# (and the repeated predict_proba call) is gone.
y_proba = clf.predict_proba(X_test)[:, 1]

# precision_recall_curve returns one more precision/recall value than it
# returns thresholds; the trailing recall value has no matching threshold.
precisions, recall, threshold = precision_recall_curve(y_test, y_proba)

# Drop that trailing recall so recall and threshold line up one-to-one.
results["PF"] = {
    'threshold': threshold,
    'recall': recall[:-1]
}

#  Node features 

## Create the train test split

In [None]:
# Keep only the ten node-feature columns (pagerank, indegree, outdegree,
# eigenvector, avg_path_length for both source and destination); the label is
# the `link` column.
node_feature_names = [
    f"{endpoint}_{metric}"
    for endpoint in ("source", "destination")
    for metric in ("pagerank", "indegree", "outdegree", "eigenvector", "avg_path_length")
]

# Every column of a frame that is not a node feature gets dropped from it.
train_extra = train.columns.difference(node_feature_names)
test_extra = test.columns.difference(node_feature_names)

X_train = train.drop(columns=train_extra)
y_train = train["link"]
X_test = test.drop(columns=test_extra)
y_test = test["link"]

In [None]:
# Fit a random forest on the node-feature split.
# A fixed random_state makes the forest's bootstrap sampling and feature
# sub-sampling repeatable, so the curve and its area are the same after every
# Restart & Run All instead of changing from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Probability of clf.classes_[1]; computed once. The hard predictions from
# clf.predict were never stored or read for this feature set, so that call
# (and the repeated predict_proba call) is gone.
y_proba = clf.predict_proba(X_test)[:, 1]

# precision_recall_curve returns one more precision/recall value than it
# returns thresholds; the trailing recall value has no matching threshold.
precisions, recall, threshold = precision_recall_curve(y_test, y_proba)

# Drop that trailing recall so recall and threshold line up one-to-one.
results["NF"] = {
    'threshold': threshold,
    'recall': recall[:-1]
}

#  Pair features & roles

## Create the train test split

In [None]:
# Keep the pair-feature columns (`jaccard`, `sorensen`) plus the 14
# source/destination role columns as features; the label is the `link` column.
role_columns = [f"{side}_role_{idx}" for side in ("source", "destination") for idx in range(7)]
kept_columns = ['jaccard', 'sorensen', *role_columns]

# Each frame drops its OWN remaining columns. Filtering `test` with
# `train.columns` raises a KeyError when `test` lacks one of train's columns
# and silently keeps any column that exists only in `test`.
X_train = train.drop(columns=train.columns.difference(kept_columns))
y_train = train["link"]
X_test = test.drop(columns=test.columns.difference(kept_columns))
y_test = test["link"]

In [None]:
# Fit a random forest on the pair-features-plus-roles split.
# A fixed random_state makes the forest's bootstrap sampling and feature
# sub-sampling repeatable, so the curve and its area are the same after every
# Restart & Run All instead of changing from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Probability of clf.classes_[1]; computed once. The hard predictions from
# clf.predict were never stored or read for this feature set, so that call
# (and the repeated predict_proba call) is gone.
y_proba = clf.predict_proba(X_test)[:, 1]

# precision_recall_curve returns one more precision/recall value than it
# returns thresholds; the trailing recall value has no matching threshold.
precisions, recall, threshold = precision_recall_curve(y_test, y_proba)

# Drop that trailing recall so recall and threshold line up one-to-one.
results["PF + R"] = {
    'threshold': threshold,
    'recall': recall[:-1]
}


#  Node features & roles

## Create the train test split

In [None]:
# Keep the ten node-feature columns plus the 14 source/destination role
# columns as features; the label is the `link` column.
endpoints = ("source", "destination")
node_feature_names = [
    f"{endpoint}_{metric}"
    for endpoint in endpoints
    for metric in ("pagerank", "indegree", "outdegree", "eigenvector", "avg_path_length")
]
role_names = [f"{endpoint}_role_{idx}" for endpoint in endpoints for idx in range(7)]
kept_names = node_feature_names + role_names

# Every column of a frame that is not in the kept set gets dropped from it.
train_extra = train.columns.difference(kept_names)
test_extra = test.columns.difference(kept_names)

X_train = train.drop(columns=train_extra)
y_train = train["link"]
X_test = test.drop(columns=test_extra)
y_test = test["link"]

In [None]:
# Fit a random forest on the node-features-plus-roles split.
# A fixed random_state makes the forest's bootstrap sampling and feature
# sub-sampling repeatable, so the curve and its area are the same after every
# Restart & Run All instead of changing from run to run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Probability of clf.classes_[1]; computed once. The hard predictions from
# clf.predict were never stored or read for this feature set, so that call
# (and the repeated predict_proba call) is gone.
y_proba = clf.predict_proba(X_test)[:, 1]

# precision_recall_curve returns one more precision/recall value than it
# returns thresholds; the trailing recall value has no matching threshold.
precisions, recall, threshold = precision_recall_curve(y_test, y_proba)

# Drop that trailing recall so recall and threshold line up one-to-one.
results["NF + R"] = {
    'threshold': threshold,
    'recall': recall[:-1]
}

# AUC (area under the recall-vs-threshold curve)

In [None]:
# Area under each recall-vs-threshold curve (trapezoidal rule), rounded to
# three decimals and stored next to the curve under the "area" key.
# np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid, so use the
# new name when it exists and fall back to np.trapz on older NumPy versions.
trapezoid = getattr(np, "trapezoid", None) or np.trapz
for key in results.keys():
    results[key]["area"] = round(trapezoid(results[key]["recall"], results[key]["threshold"]), 3)

# Visualise the results

In [None]:
# One (results key, legend label, line colour) entry per curve, in the order
# the traces are added to the figure. Some labels carry a trailing space so
# the legend text stays exactly as before.
curve_specs = [
    ("R", "Roles", "black"),
    ("NF", "Node features ", "green"),
    ("PF", "Pair features ", "blue"),
    ("PF + R", "Pair features & roles ", "orange"),
    ("NF + R", "Node features & Roles", "purple"),
    ("PF & NF", "Pair features & Node features", "red"),
    ("PF & NF & R", "Pair features & Node features & Roles", "pink"),
]

fig = go.Figure()

# Each trace plots recall against threshold and shows the stored area in its
# legend entry.
for result_key, label, colour in curve_specs:
    curve = results[result_key]
    fig.add_trace(
        go.Scatter(
            x=curve["threshold"],
            y=curve["recall"],
            mode='lines',
            name=f'{label} - AUC: {curve["area"]}',
            line=dict(color=colour),
        )
    )

layout_options = dict(
    title='Recall score over recall threshold and AUC',
    xaxis_title='Threshold',
    width=900,
    yaxis_title='Recall',
    template='ggplot2',
    font_family="Times New Roman",
    font_size=12,
    # Horizontal legend placed below the plot area.
    legend=dict(orientation="h", y=-0.3),
)
fig.update_layout(**layout_options)

fig.show()

In [None]:
# Recall of the role-only model accumulated over time.
#
# Work on a copy: the shared `test` frame keeps its original columns and row
# order. Writing `y_pred` into `test` itself would put the predictions into
# the feature matrices if one of the split cells above were re-run.
roles_eval = test.assign(y_pred=results["R"]["y_pred"]).sort_values(by='timestamp')

# Recall = true positives / actual positives.
# The previous version summed every predicted positive (true or false) and
# divided by the total number of rows, which is the cumulative share of rows
# predicted positive, not recall.
# NOTE(review): assumes the positive class is encoded as 1 in both `link` and
# the predictions -- confirm against the data.
actual_positive = roles_eval['link'] == 1
true_positive = actual_positive & (roles_eval['y_pred'] == 1)

# Running count of true positives in timestamp order.
cumulative_true_positives = true_positive.cumsum()
total_positives = int(actual_positive.sum())

if total_positives:
    cumulative_recall = cumulative_true_positives / total_positives
else:
    # No positive links in the test set: recall is undefined, so plot a flat
    # zero line instead of dividing by zero.
    cumulative_recall = cumulative_true_positives.astype(float)

# Creating the recall curve trace
trace_recall = go.Scatter(
    x=roles_eval['timestamp'],
    y=cumulative_recall,
    mode='lines+markers',
    name='Recall',
    text=cumulative_recall,
    textposition='top center'
)

fig = go.Figure()
fig.add_trace(trace_recall)
fig.update_layout(
    title='Recall curve - Roles',
    xaxis_title='Date',
    yaxis_title='Recall',
    height=500,
    width=800,
    font_family="Times New Roman",
    font_size=12,
    template="ggplot2",
    # Recall is a fraction, so pin the axis to [0, 1].
    yaxis=dict(range=[0, 1])
)

fig.show()