In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [37]:
# Load dataset files
features_df = pd.read_csv("elliptic_txs_features.csv", header=None)
classes_df = pd.read_csv("elliptic_txs_classes.csv", header=None, names=["txId", "label"])
edges_df = pd.read_csv("elliptic_txs_edgelist.csv", header=None, names=["txId1", "txId2"])

In [40]:
# Rename columns for clarity
features_df.rename(columns={0: "txId"}, inplace=True)

# Convert 'txId' column in both DataFrames to the same data type (e.g., int)
features_df['txId'] = features_df['txId'].astype(int)

# Skip the first row (header) when converting 'txId' to int for classes_df
classes_df['txId'] = classes_df['txId'][1:].astype(int) # Skipping the first row

# Merge features with classes
df = features_df.merge(classes_df, on="txId")

In [41]:

# Filter out unknown transactions (label = 0)
df = df[df["label"] != "unknown"]

# Convert labels to binary (licit = 0, illicit = 1)
df["label"] = df["label"].map({"2": 0, "1": 1})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].map({"2": 0, "1": 1})


In [42]:

# Assign a sorted index to transaction IDs
df.sort_values(by="txId", inplace=True)
df.reset_index(drop=True, inplace=True)


In [43]:
# Remove edges where both transactions are unknown
edges_df = edges_df[edges_df["txId1"].isin(df["txId"]) & edges_df["txId2"].isin(df["txId"])]


In [44]:
# Drop txId column after sorting
X = df.drop(columns=["txId", "label"])
y = df["label"]


In [45]:
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [47]:
from tensorflow.keras.layers import Input # Import the Input layer explicitly

In [50]:
from tensorflow.keras.models import Model

In [51]:
inputs = Input(shape=(X_train.shape[1],))
x = Dense(128, activation='relu')(inputs)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(32, activation='relu', name='embedding_layer')(x)  # Extract embeddings from here
outputs = Dense(1, activation='sigmoid')(x)
mlp_model = Model(inputs=inputs, outputs=outputs)  # Now Model is defined

In [52]:
# Compile model
mlp_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [53]:
# Train model
mlp_model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), verbose=1)

Epoch 1/20
[1m1165/1165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 4ms/step - accuracy: 0.9157 - loss: 0.2217 - val_accuracy: 0.9728 - val_loss: 0.0964
Epoch 2/20
[1m1165/1165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9710 - loss: 0.1016 - val_accuracy: 0.9735 - val_loss: 0.0849
Epoch 3/20
[1m1165/1165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9723 - loss: 0.0955 - val_accuracy: 0.9773 - val_loss: 0.0756
Epoch 4/20
[1m1165/1165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.9763 - loss: 0.0794 - val_accuracy: 0.9790 - val_loss: 0.0724
Epoch 5/20
[1m1165/1165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9770 - loss: 0.0755 - val_accuracy: 0.9781 - val_loss: 0.0699
Epoch 6/20
[1m1165/1165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.9792 - loss: 0.0686 - val_accuracy: 0.9799 - val_loss: 0.0692
Epoch 7/20
[1m1

<keras.src.callbacks.history.History at 0x7d2f6c217bd0>

In [54]:
# Extract embeddings
embedding_model = Model(inputs=mlp_model.input, outputs=mlp_model.get_layer('embedding_layer').output)
X_train_embeddings = embedding_model.predict(X_train)
X_test_embeddings = embedding_model.predict(X_test)


[1m1165/1165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
[1m292/292[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [55]:
# Train Decision Tree on extracted embeddings
dt_model = DecisionTreeClassifier(max_depth=10, min_samples_split=5, random_state=42)
dt_model.fit(X_train_embeddings, y_train)


In [56]:
# Evaluate model
y_pred = dt_model.predict(X_test_embeddings)


In [58]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [59]:
# Compute and display evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [60]:
print("Model Performance Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Model Performance Metrics:
Accuracy: 0.9806
Precision: 0.9165
Recall: 0.8812
F1 Score: 0.8985


In [61]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      8404
           1       0.92      0.88      0.90       909

    accuracy                           0.98      9313
   macro avg       0.95      0.94      0.94      9313
weighted avg       0.98      0.98      0.98      9313

