In [3]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [4]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.loader import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"✅ Using device: {device}")

✅ Using device: cuda


In [6]:
df = pd.read_csv('/kaggle/input/li-small-trans-processed/LI-Small_Trans.csv')

In [7]:
df.columns

Index(['Timestamp', 'From_Bank', 'From_Account', 'To_Bank', 'To_Account',
       'Amount_Received', 'Receiving_Currency', 'Amount_Paid',
       'Payment_Currency', 'Payment_Format', 'Is_Laundering',
       'Currency_Mismatch'],
      dtype='object')

In [8]:
df['from_node'] = df['From_Bank'].astype(str) + '_' + df['From_Account'].astype(str)
df['to_node'] = df['To_Bank'].astype(str) + '_' + df['To_Account'].astype(str)

print(df[['from_node', 'to_node', 'Amount_Paid', 'Is_Laundering']].head())

        from_node         to_node  Amount_Paid  Is_Laundering
0    11_8000ECA90    11_8000ECA90   3195403.00              0
1  3402_80021DAD0  3402_80021DAD0      1858.96              0
2    11_8000ECA90  1120_8006AA910    592571.00              0
3  3814_8006AD080  3814_8006AD080        12.32              0
4    20_8006AD530    20_8006AD530      2941.56              0


In [9]:
G = nx.DiGraph()

In [10]:
for _, row in df.iterrows():
    G.add_edge(row['from_node'], row['to_node'], weight=row['Amount_Paid'])
# print(f"Graph info: {nx.info(G)}")

In [11]:
# Add dummy features (degree, in/out degree)
for node in G.nodes():
    G.nodes[node]['feature'] = [
        G.in_degree(node),
        G.out_degree(node),
        G.degree(node)
    ]

# Create label dictionary (optional)
labels = {}
for _, row in df.iterrows():
    labels[row['from_node']] = row['Is_Laundering']
    labels[row['to_node']] = row['Is_Laundering']

nx.set_node_attributes(G, labels, 'Is_Laundering')

In [18]:
from torch_geometric.utils import from_networkx

# Convert graph
data = from_networkx(G)

# Fix: Stack node features properly
data.x = torch.stack([
    f if isinstance(f, torch.Tensor) else torch.tensor(f, dtype=torch.float)
    for f in data.feature
])

# Fix: Ensure labels are all valid (replace None with 0)
data.y = torch.tensor(
    [l if l is not None else 0 for l in data.label],
    dtype=torch.long
)

# Move to GPU
data = data.to(device)


AttributeError: 'GlobalStorage' object has no attribute 'label'

In [None]:
# Cell 5: Define GCN Model
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.dropout = torch.nn.Dropout(p=0.3)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        return x


In [None]:
# Cell 6: Train/Test Split
from sklearn.model_selection import train_test_split

num_nodes = data.num_nodes
idx = np.arange(num_nodes)
train_idx, test_idx = train_test_split(idx, test_size=0.2, random_state=42)

train_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[train_idx] = True
test_mask[test_idx] = True

data.train_mask = train_mask.to(device)
data.test_mask = test_mask.to(device)


In [19]:
# Cell 7: Train the Model
model = GCN(in_channels=data.num_node_features, hidden_channels=16, out_channels=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

for epoch in range(1, 101):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        pred = out.argmax(dim=1)
        correct = pred[data.test_mask] == data.y[data.test_mask]
        acc = int(correct.sum()) / int(data.test_mask.sum())
        print(f"Epoch {epoch:03d} | Loss: {loss:.4f} | Test Acc: {acc:.4f}")


NameError: name 'GCN' is not defined

In [16]:
# Cell 8: Visualize Suspicious Nodes
model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)

df_pred = pd.DataFrame({
    'node': list(G.nodes),
    'predicted_label': pred.cpu().numpy(),
    'true_label': data.y.cpu().numpy()
})

suspicious = df_pred[df_pred['predicted_label'] == 1]
print(f"🚨 Suspicious nodes:\n{suspicious.head()}")

# Optional: Save to file
suspicious.to_csv("data/processed/suspicious_nodes.csv", index=False)


NameError: name 'model' is not defined