In [None]:
!pip install xgboost



In [None]:
import xgboost as xgb

In [None]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import sklearn

In [None]:
def run_experiment(model, x_train, y_train):
    """Compile ``model`` and fit it on the given data with early stopping.

    The optimizer step size, epoch count and batch size are NOT parameters:
    they are read from the module-level names ``learning_rate``,
    ``num_epochs`` and ``batch_size``, which must exist before the call.

    Args:
        model: object exposing Keras-style ``compile`` and ``fit`` methods.
        x_train: training inputs, forwarded to ``fit`` as ``x``.
        y_train: training targets, forwarded to ``fit`` as ``y``.

    Returns:
        Whatever ``model.fit`` returns (the training history).
    """
    # Sparse categorical cross-entropy on raw logits; accuracy is logged as "acc".
    optimizer = keras.optimizers.Adam(learning_rate)
    loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    accuracy = keras.metrics.SparseCategoricalAccuracy(name="acc")
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy])

    # Stop once validation accuracy has not improved for 10 epochs and roll
    # the weights back to the best epoch seen.
    stop_early = keras.callbacks.EarlyStopping(
        monitor="val_acc", patience=10, restore_best_weights=True
    )

    # 15% of the training data is held out as the validation split.
    return model.fit(
        x=x_train,
        y=y_train,
        epochs=num_epochs,
        batch_size=batch_size,
        validation_split=0.15,
        callbacks=[stop_early],
    )

In [None]:
def display_learning_curves(history):
    """Plot loss and accuracy curves from a training history, side by side.

    Args:
        history: object whose ``history`` attribute maps the keys "loss",
            "val_loss", "acc" and "val_acc" to per-epoch value sequences.

    The left panel shows the loss, the right panel the accuracy. In both, the
    first curve is the training metric and the second the ``val_*`` metric
    (shown in the legend as "test").
    """
    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # (axis, history key, y-axis label) for each of the two panels.
    panels = ((ax1, "loss", "Loss"), (ax2, "acc", "Accuracy"))
    for axis, key, y_label in panels:
        axis.plot(history.history[key])
        axis.plot(history.history["val_" + key])
        axis.legend(["train", "test"], loc="upper right")
        axis.set_xlabel("Epochs")
        axis.set_ylabel(y_label)

    plt.show()


In [None]:
def create_ffn(hidden_units, dropout_rate, name=None):
    """Build a feed-forward stack as a ``keras.Sequential`` model.

    Args:
        hidden_units: iterable of layer widths; one block is added per entry.
        dropout_rate: rate given to every ``Dropout`` layer.
        name: optional name for the returned model.

    Returns:
        A ``keras.Sequential`` in which each width contributes, in order,
        BatchNormalization -> Dropout -> Dense(width, GELU activation).
    """
    stack = []
    for width in hidden_units:
        stack.extend(
            [
                layers.BatchNormalization(),
                layers.Dropout(dropout_rate),
                layers.Dense(width, activation=tf.nn.gelu),
            ]
        )
    return keras.Sequential(stack, name=name)

In [None]:

# Load the three dataset CSV files from the working directory.
edges = pd.read_csv("elliptic_txs_edgelist.csv")

# The features file has no header row. Rows whose field count does not match
# the expected width are skipped with a warning instead of aborting the load.
# (pandas older than 1.3 has no `on_bad_lines`; there the equivalent option
# is `error_bad_lines=False`.)
features = pd.read_csv(
    "elliptic_txs_features.csv",
    header=None,
    on_bad_lines="warn",
)

classes = pd.read_csv("elliptic_txs_classes.csv")

Skipping line 620: expected 167 fields, saw 301
Skipping line 1240: expected 167 fields, saw 236
Skipping line 1859: expected 167 fields, saw 303
Skipping line 2789: expected 167 fields, saw 173
Skipping line 3098: expected 167 fields, saw 223
Skipping line 3717: expected 167 fields, saw 305

  features = pd.read_csv("elliptic_txs_features.csv", header=None, on_bad_lines='warn')
Skipping line 4645: expected 167 fields, saw 171
Skipping line 5265: expected 167 fields, saw 231
Skipping line 6193: expected 167 fields, saw 173
Skipping line 6502: expected 167 fields, saw 254
Skipping line 6810: expected 167 fields, saw 212
Skipping line 7119: expected 167 fields, saw 176
Skipping line 8047: expected 167 fields, saw 194

  features = pd.read_csv("elliptic_txs_features.csv", header=None, on_bad_lines='warn')
Skipping line 9594: expected 167 fields, saw 285
Skipping line 10522: expected 167 fields, saw 273
Skipping line 12070: expected 167 fields, saw 239

  features = pd.read_csv("elliptic_t

In [None]:

# (rows, columns) of the edge list and of the feature table, one per line.
print(edges.shape, features.shape, sep="\n")

(234355, 2)
(73884, 167)


In [None]:

# Name the 167 header-less columns: transaction id, time step,
# 93 transaction features (tx_feat_2 .. tx_feat_94) and
# 72 aggregated features (agg_feat_1 .. agg_feat_72).
tx_features = [f"tx_feat_{i}" for i in range(2, 95)]
agg_features = [f"agg_feat_{i}" for i in range(1, 73)]
features.columns = ["txId", "time_step", *tx_features, *agg_features]

# Attach the class label of every transaction (left join keeps all feature rows),
# then recode the "unknown" label as "0"; every other value is left as is.
features = features.merge(classes, on="txId", how="left")
features["class"] = features["class"].mask(features["class"] == "unknown", "0")

In [None]:

# Keep only the labelled transactions (class "0" is the recoded "unknown" label).
# `.copy()` makes the result an independent frame: later cells assign columns on
# `features`, and doing that on a boolean-mask slice raises SettingWithCopyWarning.
features = features[features["class"] != "0"].copy()

In [None]:
def check(name):
    """Return ``name`` if it occurs in the module-level ``unique`` collection, else ``-1``.

    ``unique`` is looked up at call time, so it must be defined before the
    first call.
    """
    return name if name in unique else -1

In [None]:

# Mark every edge endpoint that is not a labelled transaction with the sentinel -1,
# so the edges touching unknown transactions can be dropped afterwards.
# `isin` + `where` does the membership test in one vectorised pass instead of a
# Python-level scan of `unique` for every single row.
unique = features["txId"].unique()
edges["txId1"] = edges["txId1"].where(edges["txId1"].isin(unique), -1)
edges["txId2"] = edges["txId2"].where(edges["txId2"].isin(unique), -1)

In [None]:

# Keep only the edges whose two endpoints are both labelled transactions
# (-1 is the sentinel written for unknown endpoints). `.copy()` detaches the
# result from the unfiltered frame, because the endpoint columns are reassigned
# later and assigning into a slice raises SettingWithCopyWarning.
edges = edges[(edges["txId1"] != -1) & (edges["txId2"] != -1)].copy()

In [None]:

# (rows, columns) of the filtered edge list and feature table, one per line.
print(edges.shape, features.shape, sep="\n")

(11064, 2)
(15172, 168)


In [None]:
# Distinct class labels, in sorted order.
class_values = sorted(features["class"].unique())

# Give every transaction a dense integer index: its rank among the sorted txIds.
features_idx = {
    tx_id: rank for rank, tx_id in enumerate(sorted(features["txId"].unique()))
}

# Rewrite the ids in both frames with those indices, so that an id can be used
# directly as a row/column position when building the adjacency structure.
# A txId missing from `features_idx` raises KeyError.
features["txId"] = features["txId"].map(features_idx.__getitem__)
edges["txId1"] = edges["txId1"].map(features_idx.__getitem__)
edges["txId2"] = edges["txId2"].map(features_idx.__getitem__)

In [None]:

# Edge list as an array of shape [2, num_edges]: row 0 holds txId1, row 1 holds txId2.
edges2 = edges[["txId1", "txId2"]].to_numpy().T
# One weight per edge, all equal to 1.
edge_weights = tf.ones(shape=edges2.shape[1])

# Only the tx_features columns are used as node features; agg_features are left out.
# Each column is forced to numeric: entries that cannot be parsed become NaN
# and are then replaced by 0.
for col in tx_features:
    features[col] = pd.to_numeric(features[col], errors='coerce').fillna(0)

# Node feature matrix of shape [num_nodes, num_features], rows ordered by txId.
node_features = tf.cast(
    features.sort_values("txId")[tx_features].to_numpy(),
    dtype=tf.dtypes.float32,
)

# Everything the GNN needs about the graph, bundled as one tuple.
graph_info = (node_features, edges2, edge_weights)

print(f"Edges shape: {edges2.shape}")
print(f"Nodes shape: {node_features.shape}")
print(f"edge weights shape: {edge_weights.shape}")

Edges shape: (2, 11064)
Nodes shape: (15172, 93)
edge weights shape: (11064,)


In [None]:
class GraphConvLayer(layers.Layer):
    """Graph convolution layer: prepare neighbour messages, aggregate, update.

    For every edge (node, neighbour) the neighbour's representation is turned
    into a message, the messages are aggregated per node, and the aggregate is
    combined with the node's own representation to give the new embedding.

    Args:
        hidden_units: sequence of layer widths passed to ``create_ffn``; its
            last entry is also the width of the GRU used by the GRU combination.
        dropout_rate: dropout rate of the feed-forward blocks and of the GRU.
        aggregation_type: "sum", "mean" or "max".
        combination_type: "concat", "add" or "gru" ("gated" is accepted as an
            alias of "gru").
        normalize: if true, L2-normalise the embeddings along the last axis.
        *args, **kwargs: forwarded to ``layers.Layer``.
    """

    def __init__(
        self,
        hidden_units,
        dropout_rate=0.2,
        aggregation_type="mean",
        combination_type="concat",
        normalize=False,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

        self.aggregation_type = aggregation_type
        self.combination_type = combination_type
        self.normalize = normalize

        self.ffn_prepare = create_ffn(hidden_units, dropout_rate)
        if self._uses_gru():
            # The GRU reads the two-step sequence [node, aggregated] built in
            # `update`. It needs a single integer width (the last hidden size)
            # and must return the whole sequence, because `update` takes the
            # last step with `tf.unstack(..., axis=1)[-1]`.
            self.update_fn = layers.GRU(
                units=hidden_units[-1],
                activation="tanh",
                recurrent_activation="sigmoid",
                dropout=dropout_rate,
                recurrent_dropout=dropout_rate,
                return_sequences=True,
            )
        else:
            self.update_fn = create_ffn(hidden_units, dropout_rate)

    def _uses_gru(self):
        """Tell whether the configured combination is the GRU-based one."""
        return self.combination_type in ("gru", "gated")

    def prepare(self, node_repesentations, weights=None):
        """Turn neighbour representations into messages, optionally edge-weighted.

        node_repesentations shape is [num_edges, embedding_dim]; ``weights``,
        when given, holds one scalar per edge.
        """
        messages = self.ffn_prepare(node_repesentations)
        if weights is not None:
            messages = messages * tf.expand_dims(weights, -1)
        return messages

    def aggregate(self, node_indices, neighbour_messages, num_nodes=None):
        """Aggregate the messages of each node's neighbours.

        Args:
            node_indices: shape [num_edges]; the receiving node of each message.
            neighbour_messages: shape [num_edges, representation_dim].
            num_nodes: number of rows of the result. When omitted it falls back
                to ``max(node_indices) + 1``, which is too small whenever the
                highest-indexed nodes receive no message.

        Returns:
            Tensor of shape [num_nodes, representation_dim].

        Raises:
            ValueError: if ``aggregation_type`` is not "sum", "mean" or "max".
        """
        if num_nodes is None:
            num_nodes = tf.math.reduce_max(node_indices) + 1

        if self.aggregation_type == "sum":
            segment_op = tf.math.unsorted_segment_sum
        elif self.aggregation_type == "mean":
            segment_op = tf.math.unsorted_segment_mean
        elif self.aggregation_type == "max":
            segment_op = tf.math.unsorted_segment_max
        else:
            raise ValueError(f"Invalid aggregation type: {self.aggregation_type}.")

        return segment_op(neighbour_messages, node_indices, num_segments=num_nodes)

    def update(self, node_repesentations, aggregated_messages):
        """Combine each node's representation with its aggregated messages.

        Both inputs have shape [num_nodes, representation_dim].

        Raises:
            ValueError: if ``combination_type`` is not a supported value.
        """
        if self._uses_gru():
            # Create a sequence of two elements for the GRU layer.
            h = tf.stack([node_repesentations, aggregated_messages], axis=1)
        elif self.combination_type == "concat":
            # Concatenate the node_repesentations and aggregated_messages.
            h = tf.concat([node_repesentations, aggregated_messages], axis=1)
        elif self.combination_type == "add":
            # Add node_repesentations and aggregated_messages.
            h = node_repesentations + aggregated_messages
        else:
            raise ValueError(f"Invalid combination type: {self.combination_type}.")

        # Apply the processing function.
        node_embeddings = self.update_fn(h)
        if self._uses_gru():
            # Keep the output of the last sequence step only.
            node_embeddings = tf.unstack(node_embeddings, axis=1)[-1]

        if self.normalize:
            node_embeddings = tf.nn.l2_normalize(node_embeddings, axis=-1)
        return node_embeddings

    def call(self, inputs):
        """Process the inputs to produce the node_embeddings.

        inputs: a tuple of three elements: node_repesentations, edges, edge_weights.
        Returns: node_embeddings of shape [num_nodes, representation_dim].
        """

        node_repesentations, edges, edge_weights = inputs
        # Get node_indices (source) and neighbour_indices (target) from edges.
        node_indices, neighbour_indices = edges[0], edges[1]
        # neighbour_repesentations shape is [num_edges, representation_dim].
        neighbour_repesentations = tf.gather(node_repesentations, neighbour_indices)

        # Prepare the messages of the neighbours.
        neighbour_messages = self.prepare(neighbour_repesentations, edge_weights)
        # Aggregate into exactly one row per node, so that nodes without any
        # message still get a row and the result lines up with
        # node_repesentations in `update`.
        num_nodes = tf.shape(node_repesentations)[0]
        aggregated_messages = self.aggregate(
            node_indices, neighbour_messages, num_nodes=num_nodes
        )
        # Update the node embedding with the neighbour messages.
        return self.update(node_repesentations, aggregated_messages)

In [None]:
class GNNNodeClassifier(tf.keras.Model):
    """Node classifier: preprocess -> two graph convolutions with skip connections -> postprocess -> logits.

    The whole graph (node features, edges, edge weights) is stored on the model;
    the model's input is only the indices of the nodes to classify.

    Args:
        graph_info: tuple ``(node_features, edges, edge_weights)``;
            ``edge_weights`` may be ``None``, in which case all-ones weights
            are used. The weights are rescaled to sum to 1.
        num_classes: number of output logits per node.
        hidden_units: layer widths for every feed-forward block.
        aggregation_type, combination_type, dropout_rate, normalize:
            forwarded to both ``GraphConvLayer`` instances.
        *args, **kwargs: forwarded to ``tf.keras.Model``.
    """

    def __init__(
        self,
        graph_info,
        num_classes,
        hidden_units,
        aggregation_type="sum",
        combination_type="concat",
        dropout_rate=0.2,
        normalize=True,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

        node_features, edges, edge_weights = graph_info
        # Default to a weight of one per edge when none are supplied.
        if edge_weights is None:
            edge_weights = tf.ones(shape=edges.shape[1])

        self.node_features = node_features
        self.edges = edges
        # Stored weights are scaled so that they sum to 1.
        self.edge_weights = edge_weights / tf.math.reduce_sum(edge_weights)

        def make_conv(layer_name):
            # Both graph convolution layers share the same configuration.
            return GraphConvLayer(
                hidden_units,
                dropout_rate,
                aggregation_type,
                combination_type,
                normalize,
                name=layer_name,
            )

        self.preprocess = create_ffn(hidden_units, dropout_rate, name="preprocess")
        self.conv1 = make_conv("graph_conv1")
        self.conv2 = make_conv("graph_conv2")
        self.postprocess = create_ffn(hidden_units, dropout_rate, name="postprocess")
        self.compute_logits = layers.Dense(units=num_classes, name="logits")

    def call(self, input_node_indices):
        """Return the class logits of the nodes selected by ``input_node_indices``."""
        all_embeddings = self.get_node_embeddings()
        # Keep only the embeddings of the requested nodes, then map them to logits.
        selected = tf.gather(all_embeddings, input_node_indices)
        return self.compute_logits(selected)

    def get_node_embeddings(self):
        """Returns the node embeddings learned by the GCN, one row per node of the stored graph."""
        graph = (self.edges, self.edge_weights)
        x = self.preprocess(self.node_features)
        # Two graph convolutions, each followed by a skip connection.
        x = self.conv1((x, *graph)) + x
        x = self.conv2((x, *graph)) + x
        return self.postprocess(x)

In [None]:
# Build a networkx graph with one edge per row of the (txId1, txId2) edge table.
G = nx.from_edgelist(edges[["txId1", "txId2"]].to_numpy())

In [None]:
# Degree of every node that appears in at least one edge.
node_degrees = dict(G.degree())

# Add the degree as a feature column. Transactions without any edge are not
# nodes of G, so the lookup gives NaN for them; their degree is 0.
features["node_degree"] = (
    features["txId"].map(node_degrees).fillna(0).astype(int)
)


In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import sklearn
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split ,GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
# 2. GCN architecture: two hidden widths (128, 64), mean aggregation, concat combination.
gnn_model = GNNNodeClassifier(
    graph_info,
    num_classes=len(class_values),
    hidden_units=[128, 64],  # Increased hidden units
    aggregation_type='mean',
    combination_type='concat',
    dropout_rate=0.2,
    normalize=True
)

# NOTE(review): no compile/fit call on gnn_model is visible before this point,
# so these embeddings appear to come from freshly initialised weights - confirm
# whether the model is meant to be trained first.
node_embeddings = gnn_model.get_node_embeddings().numpy()

# 3. Prepare Data for XGBoost
X = node_embeddings
# The node feature matrix was built from `features` sorted by "txId", so row i
# of the embeddings belongs to the i-th transaction in that order. The labels
# must be taken in the same order, otherwise X and y describe different rows.
# Classes "1"/"2" are shifted to 0/1.
y = features.sort_values("txId")["class"].astype(int) - 1


In [None]:
# Hold out 20% of the rows as the test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 5. XGBoost hyperparameters explored by the grid search (3 x 3 x 3 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.01, 0.001],
)

# Multi-class softmax classifier with one output class per distinct label.
xgb_model = xgb.XGBClassifier(objective='multi:softmax', num_class=len(class_values))

# 5-fold cross-validated search on the training rows, selecting by accuracy;
# the estimator refitted with the best combination is kept for evaluation.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)
best_xgb_model = grid_search.best_estimator_

In [None]:
# 6. Evaluate the best model: predict labels for the held-out test rows with
# the estimator selected by the grid search.
y_pred = best_xgb_model.predict(X_test)


In [None]:
# 7. Score the test-set predictions. Precision, recall and F1 are averaged over
# the classes, weighted by each class's number of true instances.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)
accuracy_percentage = accuracy * 100

print(
    f"Accuracy: {accuracy:.4f}",
    f"Accuracy Percentage: {accuracy_percentage:.2f}%",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

Accuracy: 0.9127
Accuracy Percentage: 91.27%
Precision: 0.8330
Recall: 0.9127
F1 Score: 0.8710


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
