In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    ConfusionMatrixDisplay,
)
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [None]:
def get_graph(path="assets/slavko.net"):
    """Load the network from an edge-list file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the edge-list file. Defaults to ``assets/slavko.net``,
        so existing ``get_graph()`` calls behave exactly as before.

    Returns
    -------
    The graph built by ``nx.read_edgelist`` with node ids parsed as ``int``.
    """
    return nx.read_edgelist(path, nodetype=int)

In [None]:
# Read the train and test tables (in that order) from the assets folder.
fb_train, fb_test = (
    pd.read_csv(csv_path)
    for csv_path in ("assets/facebook_train.csv", "assets/facebook_test.csv")
)

In [None]:
# Build the graph once and derive every node-level score from it.
G = get_graph()

# Degree, closeness and betweenness centrality, each a {node: score} mapping.
dc, cc, bc = (
    centrality(G)
    for centrality in (
        nx.degree_centrality,
        nx.closeness_centrality,
        nx.betweenness_centrality,
    )
)

# PageRank with damping factor 0.9, then HITS hub/authority scores
# (HITS is allowed up to 200 iterations).
pr = nx.pagerank(G, alpha=0.9)
hub, authority = nx.hits(G, max_iter=200)

In [None]:
# Add the graph-derived scores as new feature columns on both tables.
# Each score is a {node: value} mapping looked up through the "node" column;
# nodes missing from a mapping end up as NaN.
centrality_features = {
    "deg_c": dc,
    "closeness_c": cc,
    "between_c": bc,
    "page_rank": pr,
    "hub": hub,
    "authority": authority,
}
for feature_name, node_scores in centrality_features.items():
    fb_train[feature_name] = fb_train["node"].map(node_scores)
    fb_test[feature_name] = fb_test["node"].map(node_scores)

# Split into features / target.
# Columns are selected BY NAME for both tables. The previous version sliced
# fb_test with a boolean mask built from fb_train.columns, which is positional
# and would silently pick the wrong columns (or raise) if the two CSVs do not
# share the exact same column order.
feature_columns = [column for column in fb_train.columns if column != "label"]

X_train = fb_train[feature_columns]
y_train = fb_train[["label"]]  # kept as a single-column DataFrame, as before

X_test = fb_test[feature_columns]
y_test = fb_test[["label"]]

In [None]:
# Search space: number of trees and maximum tree depth, drawn as random integers.
param_dist = {"n_estimators": randint(50, 500), "max_depth": randint(1, 20)}

# Base estimator. A fixed seed keeps the forest reproducible across
# Restart Kernel -> Run All.
rf = RandomForestClassifier(random_state=42)

# Randomized search: 5 sampled configurations, each scored with 5-fold CV.
# random_state also pins which configurations are drawn from param_dist.
rand_search = RandomizedSearchCV(
    rf, param_distributions=param_dist, n_iter=5, cv=5, random_state=42
)

# Flatten the labels to 1-D: passing a single-column DataFrame as y makes
# scikit-learn emit a column-vector DataConversionWarning on every fit.
rand_search.fit(X_train, np.ravel(y_train))

# Keep the refitted winner for later inspection.
best_rf = rand_search.best_estimator_

print("Best hyperparameters:", rand_search.best_params_)

In [None]:
# Display the training target (the "label" column split off from fb_train).
y_train

In [None]:
# Forest with default hyperparameters; this is the model the evaluation,
# prediction and tree-export cells below operate on.
# A fixed seed makes the reported metrics reproducible across re-runs.
rf = RandomForestClassifier(random_state=42)

# np.ravel gives scikit-learn the 1-D label array it expects
# (avoids the column-vector DataConversionWarning).
rf.fit(X_train, np.ravel(y_train))

In [None]:
# Predict on the held-out set with the fitted forest.
y_pred = rf.predict(X_test)

# Score the predictions with each metric in turn.
accuracy, precision, recall = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score)
)

for metric_name, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(metric_name, metric_value)

# Confusion matrix of true vs. predicted labels, rendered as a plot.
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
cm_display.plot()

In [None]:
# Pair each training column with its importance score, largest first.
# NOTE(review): the scores come from best_rf (the randomized-search winner),
# whereas the metrics, predictions and exported trees all use rf.
# Confirm which model this chart is meant to describe.
importance_values = best_rf.feature_importances_
feature_importances = pd.Series(importance_values, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)

# Bar chart of the ranked importances
feature_importances.plot.bar()

In [None]:
# Prediction result as a dictionary:
# <key> is a node in the test set, <value> is the label rf predicts for it.
predict_labels = {
    node: predicted_label
    for node, predicted_label in zip(X_test["node"], rf.predict(X_test))
}

In [None]:
# Display the fitted forest (its repr shows the estimator and its settings).
rf

In [None]:
# Render the first three decision trees of the forest.
# The export settings are the same for every tree, so they are gathered once:
# colour-filled nodes, only the top two levels, no impurity values, and
# sample counts shown as proportions.
export_options = dict(
    feature_names=X_train.columns,
    filled=True,
    max_depth=2,
    impurity=False,
    proportion=True,
)

for i in range(3):
    tree = rf.estimators_[i]
    dot_data = export_graphviz(tree, **export_options)
    graph = graphviz.Source(dot_data)
    display(graph)