In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Load Data
# Replace with the actual file paths if they are in a specific folder.
source_target_embeddings = pd.read_csv("output/emb__Source__Target.out", delim_whitespace=True, header=None, skiprows=1)
source_weight_embeddings = pd.read_csv("output/emb__Source__Weight.out", delim_whitespace=True, header=None, skiprows=1)
weight_target_embeddings = pd.read_csv("output/emb__Target__Weight.out", delim_whitespace=True, header=None, skiprows=1)
interaction_data = pd.read_csv("data/witcher_processed.tsv", sep="\t")
book_labels = pd.read_csv("data/witcher_target.tsv", sep="\t")

# Step 1: Merge Data
# Extract column names for embeddings (assuming embeddings start from column 2)
source_target_embeddings.columns = ["Node"] + [f"Embed_{i}" for i in range(1, source_target_embeddings.shape[1])]
source_weight_embeddings.columns = ["Node"] + [f"Embed_{i}" for i in range(1, source_weight_embeddings.shape[1])]
weight_target_embeddings.columns = ["Node"] + [f"Embed_{i}" for i in range(1, weight_target_embeddings.shape[1])]

# Merge embeddings with interaction data
interaction_data = interaction_data.merge(source_target_embeddings, left_on="Source", right_on="Node", suffixes=("", "_Source"))
interaction_data = interaction_data.merge(source_target_embeddings, left_on="Target", right_on="Node", suffixes=("", "_Target"))
interaction_data = interaction_data.merge(book_labels, left_index=True, right_index=True)


In [3]:
interaction_data.drop(columns=["Node", "Embed_1"])

Unnamed: 0,Source,Target,Weight,Embed_2,Embed_3,Embed_4,Embed_5,Embed_6,Embed_7,Embed_8,...,Embed_57_Target,Embed_58_Target,Embed_59_Target,Embed_60_Target,Embed_61_Target,Embed_62_Target,Embed_63_Target,Embed_64_Target,Embed_65_Target,book
0,Velerad,Geralt,1,-0.046306,-0.032459,-0.164166,0.267487,-0.150255,0.138698,0.174296,...,-0.243139,0.057464,-0.033725,-0.093554,0.249473,0.007179,0.017604,0.116918,-0.008722,1
1,Velerad,Geralt,1,-0.046306,-0.032459,-0.164166,0.267487,-0.150255,0.138698,0.174296,...,0.022619,0.033041,-0.052114,-0.278882,0.039098,-0.142371,0.108573,0.091120,-0.171687,1
2,Velerad,Geralt,1,0.062838,-0.026746,-0.017451,-0.109576,0.028481,-0.071201,-0.094078,...,-0.243139,0.057464,-0.033725,-0.093554,0.249473,0.007179,0.017604,0.116918,-0.008722,1
3,Velerad,Geralt,1,0.062838,-0.026746,-0.017451,-0.109576,0.028481,-0.071201,-0.094078,...,0.022619,0.033041,-0.052114,-0.278882,0.039098,-0.142371,0.108573,0.091120,-0.171687,1
4,Foltest,Geralt,4,0.055646,-0.038783,-0.032035,-0.111643,-0.007959,-0.077998,-0.068830,...,-0.243139,0.057464,-0.033725,-0.093554,0.249473,0.007179,0.017604,0.116918,-0.008722,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,Henselt,King,2,0.065173,-0.033965,-0.033946,-0.123002,-0.007751,-0.081750,-0.071421,...,-0.241330,0.059972,-0.038299,-0.091706,0.258406,-0.004897,0.010340,0.108766,-0.013695,7
2596,Henselt,King,2,-0.073236,-0.034930,-0.181299,0.264487,-0.138133,0.121084,0.158849,...,0.024614,0.027329,-0.066351,-0.267194,0.055483,-0.131152,0.109692,0.083849,-0.174639,7
2597,Henselt,King,2,-0.073236,-0.034930,-0.181299,0.264487,-0.138133,0.121084,0.158849,...,-0.241330,0.059972,-0.038299,-0.091706,0.258406,-0.004897,0.010340,0.108766,-0.013695,7
2598,Philippa,King,3,0.070165,-0.033380,-0.041951,-0.121565,-0.013871,-0.081241,-0.062850,...,0.024614,0.027329,-0.066351,-0.267194,0.055483,-0.131152,0.109692,0.083849,-0.174639,7


In [4]:


# Step 2: Feature Engineering
# Combine source and target embeddings (e.g., concatenation)
source_emb_cols = [col for col in interaction_data.columns if "Embed_" in col and "_Source" in col]
target_emb_cols = [col for col in interaction_data.columns if "Embed_" in col and "_Target" in col]

interaction_data["Weight_Normalized"] = interaction_data["Weight"] / interaction_data["Weight"].max()

# Example: Concatenate source and target embeddings
interaction_data["Features"] = interaction_data[source_emb_cols + target_emb_cols].values.tolist()

# Step 3: Train/Test Split
X = np.stack(interaction_data["Features"])
y = interaction_data["book"]  # Target label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [5]:
X.shape

(2600, 65)

In [8]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
# Step 5: Train Classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Step 6: Evaluate
y_pred = clf.predict(X_test)


In [12]:
f1 = f1_score(y_test, y_pred, average="weighted")
accuracy = accuracy_score(y_test, y_pred)

print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

F1 Score: 0.732935088938899
Accuracy: 0.7730769230769231


In [13]:
# print per book classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.00      0.00      0.00        53
           2       0.00      0.00      0.00        32
           3       0.38      1.00      0.55        60
           4       0.95      0.82      0.88        85
           5       1.00      0.94      0.97        90
           6       0.98      0.81      0.89        68
           7       0.91      1.00      0.95       132

    accuracy                           0.77       520
   macro avg       0.60      0.65      0.61       520
weighted avg       0.73      0.77      0.73       520



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
