Block 1: Mount Drive & Load Dataset

In [1]:
# Mount Google Drive so files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Load dataset using pandas
import pandas as pd

# Update the path if needed
csv_path = '/content/drive/MyDrive/TGAT Model/clean_combined_dataset.csv'
df = pd.read_csv(csv_path)

# Show first 5 rows
print(df.head())

# Show useful info about columns & data types.
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" to the output).
df.info()

Mounted at /content/drive
   Unnamed: 0.1                                        from  \
0      10159229  0x219c5355f7496c47e743f5a6d98527509ea42444   
1      10010990  0x21a1662d90d163f79f9e71fda42c60926e80699c   
2      10159517  0xcbe64fb9fdee1eb4172d2bc375c12ace497ac253   
3      13323550  0x007077061537f25eaf485a1e6fa4af64e883be98   
4      13323549  0x7a44dbe0d1823cd177a9b4c35899046190811fb3   

                                           to  amount     timestamp  \
0  0xaaaf91d9b90df800df4f55c205fd6989c977e73a     0.0  1.494145e+09   
1  0xaaaf91d9b90df800df4f55c205fd6989c977e73a     0.0  1.494196e+09   
2  0xaaaf91d9b90df800df4f55c205fd6989c977e73a     0.0  1.494145e+09   
3  0xf0f8b0b8dbb1124261fc8d778e2287e3fd2cf4f5     0.0  1.494185e+09   
4  0xf0f8b0b8dbb1124261fc8d778e2287e3fd2cf4f5     0.0  1.494184e+09   

   fromIsPhi  toIsPhi                 date  Unnamed: 0  
0          0        0  2017-05-07 00:00:00         NaN  
1          0        0  2017-05-07 00:00:00         NaN

Block 2: Clean Data & Split by Month

In [2]:
# Cleaning pipeline, expressed as one chain:
#   1. discard the leftover index columns (silently skipped when absent)
#   2. parse 'date' into datetimes; unparseable values become NaT
#   3. remove the rows whose date could not be parsed
#   4. order the transactions by 'timestamp' and renumber the index from 0
df = (
    df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
      .assign(date=lambda frame: pd.to_datetime(frame['date'], format='mixed', errors='coerce'))
      .dropna(subset=['date'])
      .sort_values(by='timestamp')
      .reset_index(drop=True)
)

# Calendar month of every transaction; this is the key used for splitting
df['month'] = df['date'].dt.to_period("M")

# One DataFrame per month, keyed by its monthly Period
graph_splits = {month: month_df for month, month_df in df.groupby('month')}

# Sanity output: how many months there are, the first few keys, a data sample
print("Number of months found:", len(graph_splits))
print("First few months:", list(graph_splits)[:5])
print("Example month data (first month):")
print(graph_splits[next(iter(graph_splits))].head())

Number of months found: 23
First few months: [Period('2016-11', 'M'), Period('2017-03', 'M'), Period('2017-05', 'M'), Period('2017-06', 'M'), Period('2017-07', 'M')]
Example month data (first month):
                                         from  \
0  0x70faa28a6b8d6829a4b1e649d26ec9a2a39ba413   
1  0x9af3bf0b0a117d3fbfb37dfc7fa67f9a645488fc   

                                           to      amount     timestamp  \
0  0x9af3bf0b0a117d3fbfb37dfc7fa67f9a645488fc  173.511739  1.480521e+09   
1  0x03e9a232700c14fbc15b1187a734a7de2824ea97  347.346320  1.480533e+09   

   fromIsPhi  toIsPhi       date    month  
0          0        1 2016-11-30  2016-11  
1          1        0 2016-11-30  2016-11  


Block 3.1: Build Temporal Graph with Time & Edge Features

In [3]:
import networkx as nx

def build_temporal_graph(month_df):
    """Build a directed transaction graph from one month of transfers.

    Args:
        month_df: DataFrame with the columns 'from', 'to', 'timestamp',
            'amount', 'fromIsPhi' and 'toIsPhi'.

    Returns:
        nx.DiGraph with one edge per (from, to) pair. Each edge carries
        'amount' and 'timestamp'; each node carries 'is_phishing', the
        maximum of every label seen for that address.

    A DiGraph stores a single edge per ordered pair, so calling add_edge again
    for a repeated transfer would overwrite the earlier amount. Repeated
    transfers are therefore merged instead: amounts are summed and the latest
    timestamp is kept, so totals derived from the edges count every transfer.
    """
    G = nx.DiGraph()

    # Iterating the columns in lockstep avoids building a Series per row,
    # which is what DataFrame.iterrows() does.
    columns = (
        month_df[name]
        for name in ('from', 'to', 'timestamp', 'amount', 'fromIsPhi', 'toIsPhi')
    )
    for src, dst, timestamp, amount, src_is_phi, dst_is_phi in zip(*columns):
        if G.has_edge(src, dst):
            edge = G[src][dst]
            edge['amount'] += amount
            edge['timestamp'] = max(edge['timestamp'], timestamp)
        else:
            G.add_edge(src, dst, timestamp=timestamp, amount=amount)

        # Node label (is_phishing): once an address is flagged it stays flagged
        G.nodes[src]['is_phishing'] = max(G.nodes[src].get('is_phishing', 0), src_is_phi)
        G.nodes[dst]['is_phishing'] = max(G.nodes[dst].get('is_phishing', 0), dst_is_phi)
    return G


Block 4: Add Node Features

In [4]:
def add_basic_node_features(G):
    """Attach a 4-element feature list to every node of G under the key 'x'.

    The list is [in-degree, out-degree, sum of 'amount' over outgoing edges,
    sum of 'amount' over incoming edges]. An edge without an 'amount'
    attribute contributes 0. G is modified in place; nothing is returned.
    """
    def _amount_total(edges):
        # Sum the 'amount' attribute over an iterable of (u, v) edge pairs.
        return sum(G[a][b].get('amount', 0) for a, b in edges)

    for node in G.nodes():
        features = [
            G.in_degree(node),
            G.out_degree(node),
            _amount_total(G.out_edges(node)),
            _amount_total(G.in_edges(node)),
        ]
        G.nodes[node]['x'] = features


Block 5: Convert to PyG + Add Timestamp & Labels

In [5]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [6]:
from torch_geometric.utils import from_networkx
import torch

def graph_to_pyg(G):
    """Convert a feature-annotated NetworkX graph into a PyG data object.

    The node attribute 'x' is grouped into the node feature matrix, and the
    edge attributes are requested in the order 'amount', 'timestamp'
    (presumably giving column 0 = amount and column 1 = timestamp of
    `edge_attr` - confirm against the installed torch_geometric version).

    Returns:
        The converted object with `y` set to a long tensor holding each
        node's 'is_phishing' label (0 when the attribute is missing), listed
        in G's node iteration order.
    """
    converted = from_networkx(
        G,
        group_node_attrs=['x'],
        group_edge_attrs=['amount', 'timestamp'],
    )
    labels = [attrs.get('is_phishing', 0) for _, attrs in G.nodes(data=True)]
    converted.y = torch.tensor(labels, dtype=torch.long)
    return converted


Block 6: Time Encoding (TGAT style)

In [7]:
import torch
import math

def time_encoding(t_diff, d=16, seed=0):
    """Cosine-based relative time encoding.

    Computes cos(omega * t + bias) for every entry t of `t_diff`, where
    `omega` holds `d` frequencies evenly spaced from 1 to 1000 and `bias`
    holds `d` phase offsets drawn uniformly from [0, 1).

    Args:
        t_diff: tensor of time differences, any shape.
        d: number of encoding dimensions.
        seed: seed of the private generator that draws `bias`.

    Returns:
        Tensor of shape `t_diff.shape + (d,)` with values in [-1, 1], on the
        same device as `t_diff`.

    The bias comes from a dedicated seeded generator rather than the global
    RNG: sampling a fresh bias on every call would give the same timestamp a
    different encoding on each forward pass (and between training and
    evaluation). The global RNG state is left untouched.
    """
    device = t_diff.device
    # Build the frequencies on the input's device so CUDA inputs work too.
    omega = torch.linspace(1, 1000, d, device=device)
    generator = torch.Generator().manual_seed(seed)
    bias = torch.rand(d, generator=generator).to(device)
    return torch.cos(omega * t_diff.unsqueeze(-1) + bias)


Block 7: TGAT Layer

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv

class SimpleTGAT(nn.Module):
    """Two GAT layers applied to node features augmented with a time encoding.

    Every edge timestamp is encoded relative to the mean timestamp of the
    graph; each node then receives the average encoding of its outgoing
    edges (zeros when it has none). That vector is concatenated to the node
    features before the attention layers.

    Args:
        in_dim: number of input node features.
        hidden_dim: output width per head of the first layer, and the width
            of the returned embeddings.
        time_dim: size of the time encoding.
    """

    def __init__(self, in_dim, hidden_dim, time_dim=16):
        super().__init__()
        self.time_dim = time_dim
        self.time_encoder = lambda t: time_encoding(t, d=self.time_dim)
        self.gat1 = GATConv(in_dim + time_dim, hidden_dim, heads=2)
        # gat1 concatenates its 2 heads, hence the doubled input width here
        self.gat2 = GATConv(hidden_dim * 2, hidden_dim, heads=1)

    def forward(self, data):
        """Return one `hidden_dim`-wide embedding per node of `data`.

        `data` must provide `x`, `edge_index` and `edge_attr`; column 1 of
        `edge_attr` is read as the edge timestamp.
        """
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        timestamps = edge_attr[:, 1]  # shape: [num_edges]

        # Encode time differences (relative to mean time)
        t_diff = timestamps - timestamps.mean()
        time_feat = self.time_encoder(t_diff)  # shape: [num_edges, time_dim]

        num_nodes = x.size(0)
        source_nodes = edge_index[0]

        # Per-node sum of the encodings of its outgoing edges, and the number
        # of such edges. index_add_ performs the scatter in a single call
        # instead of a Python loop over every edge.
        x_time = torch.zeros(num_nodes, self.time_dim, device=x.device)
        time_feat = time_feat.to(device=x.device, dtype=x_time.dtype)
        x_time.index_add_(0, source_nodes, time_feat)

        counts = torch.zeros(num_nodes, device=x.device)
        counts.index_add_(0, source_nodes, torch.ones_like(source_nodes, dtype=counts.dtype))

        # Average; nodes without outgoing edges keep zeros (avoid divide-by-zero)
        x_time = x_time / counts.clamp(min=1).unsqueeze(1)

        # Concat node features with temporal features
        x_combined = torch.cat([x, x_time], dim=1)

        # Apply GAT layers
        h = F.relu(self.gat1(x_combined, edge_index))
        h = self.gat2(h, edge_index)
        return h


Block 8: Classifier

In [9]:
class PhishingClassifier(nn.Module):
    """MLP head that maps an embedding to two class logits.

    Architecture: Linear(input_dim, 32) -> ReLU -> Dropout(0.2) -> Linear(32, 2).
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = [
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 2),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for `x`; the last dimension has size 2."""
        logits = self.fc(x)
        return logits


Block 9: Training Loop

In [10]:
# Seed the RNG before anything random happens (weight initialisation, dropout,
# random tensors drawn during the forward pass) so a Restart & Run All
# reproduces the same training curve.
SEED = 42
torch.manual_seed(SEED)

# Model instantiation
embedder = SimpleTGAT(in_dim=4, hidden_dim=16)  # 4 = length of the node feature list 'x'
classifier = PhishingClassifier(16)             # consumes the 16-dim node embeddings

# A single optimizer updates both modules
params = list(embedder.parameters()) + list(classifier.parameters())
optimizer = torch.optim.Adam(params, lr=0.01)
loss_fn = nn.CrossEntropyLoss()

# Build graph from first month
month_list = list(graph_splits.keys())
G = build_temporal_graph(graph_splits[month_list[0]])
add_basic_node_features(G)
data = graph_to_pyg(G)

# Nothing inside the loop leaves training mode, so it is set once up front
embedder.train()
classifier.train()

for epoch in range(100):
    optimizer.zero_grad()

    h = embedder(data)
    out = classifier(h)
    loss = loss_fn(out, data.y)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        # Accuracy on the nodes being trained on (there is no held-out split here)
        pred = out.argmax(dim=1)
        acc = (pred == data.y).float().mean().item()
        print(f"Epoch {epoch} | Loss: {loss.item():.4f} | Acc: {acc:.4f}")


Epoch 0 | Loss: 5.6325 | Acc: 0.3333
Epoch 10 | Loss: 0.6058 | Acc: 0.6667
Epoch 20 | Loss: 1.0568 | Acc: 0.6667
Epoch 30 | Loss: 0.4441 | Acc: 0.6667
Epoch 40 | Loss: 0.4763 | Acc: 0.6667
Epoch 50 | Loss: 0.4648 | Acc: 0.6667
Epoch 60 | Loss: 0.4632 | Acc: 0.6667
Epoch 70 | Loss: 0.4282 | Acc: 1.0000
Epoch 80 | Loss: 0.4832 | Acc: 0.6667
Epoch 90 | Loss: 0.4600 | Acc: 0.6667


Block 10: Evaluation

In [11]:
from sklearn.metrics import classification_report

# Inference mode: disables dropout, and no_grad() skips gradient tracking
embedder.eval()
classifier.eval()
with torch.no_grad():
    h = embedder(data)
    pred = classifier(h).argmax(dim=1)

# NOTE: `data` is the graph the model was trained on, so this report measures
# fit on the training nodes rather than generalisation.
# `labels` fixes the two classes explicitly: without it the report raises a
# ValueError whenever only one class occurs in y/pred, because the two
# target_names no longer match the classes found. zero_division=0 reports a
# score of 0 for a class that is never predicted or never present.
print(classification_report(
    data.y.cpu(),
    pred.cpu(),
    labels=[0, 1],
    target_names=["Normal", "Phishing"],
    zero_division=0,
))

# Assign X and y for later use (e.g., t-SNE, metrics, etc.)
X = h
y = data.y


              precision    recall  f1-score   support

      Normal       1.00      0.50      0.67         2
    Phishing       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



TGAT Model Pipeline (PDTGA)

In [12]:
# Graph with Temporal Edges ➝ Time-Aware TGAT Encoder ➝ Node Embeddings ➝ MLP Classifier ➝ Evaluation Report