Reference: https://github.com/FighterLYL/GraphNeuralNetwork

In [None]:
!git clone https://github.com/Qin-sx/sybil_demo_pipeline.git

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
os.chdir('/content/sybil_demo_pipeline')

In [None]:
!mkdir data
!mkdir ./saved_model

In [None]:
!unzip /content/drive/MyDrive/contest/sybil_address_prediction.zip -d /content/sybil_demo_pipeline/data/
!mkdir /content/sybil_demo_pipeline/data/raw_data
!mv /content/sybil_demo_pipeline/data/sybil_address_prediction/*  /content/sybil_demo_pipeline/data/raw_data/

In [None]:
!mkdir data/features

In [None]:
import os
import pandas as pd

def get_all_candidates(file_path):
    """Collect every candidate address from the train and test datasets.

    Reads ``train_dataset.parquet`` and ``test_dataset.parquet`` from
    ``file_path`` and stacks their ``ADDRESS`` columns, train rows first.

    Args:
        file_path: Directory containing the two parquet files.

    Returns:
        A DataFrame with a single ``ADDRESS`` column and a fresh
        0..n-1 index (train addresses followed by test addresses).
    """
    address_columns = [
        pd.read_parquet(os.path.join(file_path, name))['ADDRESS']
        for name in ("train_dataset.parquet", "test_dataset.parquet")
    ]
    stacked = pd.concat(address_columns, ignore_index=True)
    return pd.DataFrame(stacked, columns=['ADDRESS'])

def get_transaction_partners(transactions_df, address_df, columns=('FROM_ADDRESS', 'TO_ADDRESS')):
    """Collect, for every known address, the counterparties it transacted with.

    A transaction contributes in both directions: the receiver is recorded as
    a partner of the sender and the sender as a partner of the receiver, but
    only for addresses that appear in ``address_df``. Partners themselves are
    not restricted to ``address_df``.

    Args:
        transactions_df: DataFrame with one row per transaction.
        address_df: DataFrame whose ``ADDRESS`` column lists the addresses to
            collect partners for.
        columns: Two-item sequence naming the sender and receiver columns of
            ``transactions_df`` (in that order).

    Returns:
        DataFrame with columns ``ADDRESS`` and ``PARTNERS``; ``PARTNERS`` holds
        a list of distinct counterparties (empty when there are none). The
        order inside each list is not defined because it comes from a set.
    """
    from_col, to_col = columns[0], columns[1]

    # One partner set per known address; dict membership doubles as the filter.
    address_partners = {address: set() for address in address_df['ADDRESS']}

    # Walk the two columns in lockstep instead of iterrows(), which builds a
    # Series per row and is far slower on large transaction tables.
    for from_address, to_address in zip(transactions_df[from_col], transactions_df[to_col]):
        if from_address in address_partners:
            address_partners[from_address].add(to_address)
        if to_address in address_partners:
            address_partners[to_address].add(from_address)

    result_df = pd.DataFrame({
        'ADDRESS': list(address_partners.keys()),
        'PARTNERS': [list(partners) for partners in address_partners.values()]
    })

    return result_df

def save_unique_partners(transaction_partners_df, address_df, output_path):
    """Write the partners that are not candidate addresses to ``partners.parquet``.

    Every entry of the ``PARTNERS`` column is pooled, the addresses already
    listed in ``address_df['ADDRESS']`` are removed, and missing values are
    dropped. The remaining addresses are left-joined with
    ``transaction_partners_df`` on ``ADDRESS``; an address with no matching
    row there ends up with a missing ``PARTNERS`` value.

    Args:
        transaction_partners_df: DataFrame with ``ADDRESS`` and ``PARTNERS``.
        address_df: DataFrame whose ``ADDRESS`` column lists known addresses.
        output_path: Directory in which ``partners.parquet`` is written.
    """
    known_addresses = set(address_df['ADDRESS'])

    # Pool every partner, skipping rows whose partner list is NULL.
    seen_partners = set()
    for partner_list in transaction_partners_df['PARTNERS']:
        if partner_list is None:
            continue
        seen_partners.update(partner_list)

    outside_partners = seen_partners - known_addresses

    outside_df = pd.DataFrame(list(outside_partners), columns=['ADDRESS'])
    outside_df = outside_df.dropna(subset=['ADDRESS'])

    lookup = transaction_partners_df[['ADDRESS', 'PARTNERS']]
    merged_df = outside_df.merge(lookup, on='ADDRESS', how='left')

    merged_df.to_parquet(os.path.join(output_path, "partners.parquet"), index=False)

def main(file_path, output_path):
    """Build and store the transaction-partner tables.

    Steps: load candidate addresses, load ``transactions.parquet``, compute
    each candidate's partners, write them to ``transaction_partners.parquet``
    and finally write the non-candidate partners via ``save_unique_partners``.

    Args:
        file_path: Directory holding the raw parquet inputs.
        output_path: Directory receiving the generated parquet files.
    """
    candidates = get_all_candidates(file_path)

    transactions = pd.read_parquet(os.path.join(file_path, "transactions.parquet"))

    partners = get_transaction_partners(
        transactions,
        candidates,
        columns=['FROM_ADDRESS', 'TO_ADDRESS'],
    )

    partners.to_parquet(
        os.path.join(output_path, "transaction_partners.parquet"),
        index=False,
    )

    save_unique_partners(partners, candidates, output_path)

# Entry point: build the partner tables from the raw data directory.
if __name__ == "__main__":
    file_path = "./data/raw_data/"  # directory read by main() for the raw parquet inputs
    output_path = "./data/features/"  # directory where main() writes its parquet outputs
    main(file_path, output_path)

In [None]:
!python feature_process_2.py

In [None]:
# Build the partner list for every address present in the feature table and
# store it as matched_partners.parquet.
# NOTE(review): transactions_feature_partner.parquet is presumably produced by
# feature_process_2.py in the previous cell — confirm.
if __name__ == "__main__":
    file_path = "./data/features/transactions_feature_partner.parquet"
    df = pd.read_parquet(file_path)
    # Only the ADDRESS column is needed; wrap it as a one-column DataFrame
    # because get_transaction_partners reads address_df['ADDRESS'].
    addresses = df['ADDRESS']
    addresses_df = pd.DataFrame(addresses, columns=['ADDRESS'])

    transactions_datafile = "./data/raw_data/transactions.parquet"
    transactions_df = pd.read_parquet(transactions_datafile)

    # Partners are collected for the feature-table addresses only; the partner
    # lists themselves may still contain addresses outside that table.
    transaction_partners_df = get_transaction_partners(transactions_df, addresses_df, columns=['FROM_ADDRESS', 'TO_ADDRESS'])

    output_path = "./data/features/matched_partners.parquet"
    transaction_partners_df.to_parquet(output_path, index=False)

In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import itertools

def build_adjacency(adj_dict, address_count):
    """Create a symmetric sparse adjacency matrix from an adjacency list.

    Each ``src -> v`` entry is inserted in both directions and duplicate
    edges are collapsed, so every stored entry has the value 1.0.

    Args:
        adj_dict: Mapping of node index to an iterable of neighbour indices.
        address_count: Number of nodes; the result has shape
            ``(address_count, address_count)``.

    Returns:
        A ``scipy.sparse.coo_matrix`` of dtype float32. When ``adj_dict``
        contains no edges an all-zero matrix of the requested shape is
        returned.
    """
    # A set removes duplicate edges as they are produced.
    edges = set()
    for src, dst in adj_dict.items():
        for v in dst:
            edges.add((src, v))
            edges.add((v, src))

    # reshape(-1, 2) keeps the array two-dimensional even with zero edges, so
    # the column slicing below cannot fail on an edgeless graph.
    edge_index = np.asarray(sorted(edges), dtype=np.int64).reshape(-1, 2)
    adjacency = sp.coo_matrix((np.ones(len(edge_index)),
                               (edge_index[:, 0], edge_index[:, 1])),
                              shape=(address_count, address_count), dtype="float32")
    return adjacency

def create_adjacency_matrix(file_path):
    """Build the node adjacency matrix from a partners parquet file.

    The file must provide an ``ADDRESS`` column and a ``PARTNERS`` column
    holding, per address, an iterable of counterparty addresses (or a missing
    value). Row position in the file becomes the node index. Partners that do
    not appear in the ``ADDRESS`` column are ignored.

    Args:
        file_path: Path of the parquet file to read.

    Returns:
        The sparse matrix returned by ``build_adjacency`` with one row/column
        per row of the file.
    """
    transaction_partners_df = pd.read_parquet(file_path)

    addresses = transaction_partners_df['ADDRESS']
    address_to_index = {address: idx for idx, address in enumerate(addresses)}
    address_count = len(addresses)

    adj_dict = {}
    # Iterate the two columns in lockstep; iterrows() would build a Series
    # per row just to read these two values.
    for address, partners in zip(addresses, transaction_partners_df['PARTNERS']):
        node = address_to_index[address]
        adj_dict[node] = []

        # A missing partner list may surface as None or as a float NaN;
        # neither is iterable, so both mean "no partners".
        if partners is None or (isinstance(partners, float) and np.isnan(partners)):
            continue

        # Keep only partners that are themselves nodes of the graph.
        adj_dict[node].extend(
            address_to_index[partner]
            for partner in partners
            if partner in address_to_index
        )

    return build_adjacency(adj_dict, address_count)

# Build the graph from the matched-partners table written by an earlier cell;
# adjacency_matrix is consumed later when the normalized adjacency is computed.
file_path = "./data/features/matched_partners.parquet"
adjacency_matrix = create_adjacency_matrix(file_path)

In [None]:
import itertools
import pickle
from collections import namedtuple

import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torch.optim as optim

def tensor_from_numpy(x, device):
    """Wrap a NumPy array as a torch tensor and move it to ``device``."""
    tensor = torch.from_numpy(x)
    return tensor.to(device)

def normalization(adjacency):
    """Return the symmetrically normalized adjacency D^-0.5 * (A+I) * D^-0.5.

    Self-connections are added first, D is the diagonal matrix of row sums of
    (A+I), and the result is returned in COO format.
    """
    adjacency += sp.eye(adjacency.shape[0])    # add self-connections
    row_sums = np.array(adjacency.sum(1))
    inv_sqrt_degree = np.power(row_sums, -0.5).flatten()
    scale = sp.diags(inv_sqrt_degree)
    left_scaled = scale.dot(adjacency)
    return left_scaled.dot(scale).tocoo()

In [None]:
# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Load node features, labels and the graph structure, and convert them into
# the torch tensors used by the model, train(), test() and inference().
from sklearn.model_selection import train_test_split

# Node features: every column of the feature table except ADDRESS.
feature_path = "./data/features/transactions_feature_partner.parquet"
feature_df = pd.read_parquet(feature_path)
feature = feature_df.drop(['ADDRESS'], axis=1)

# Labels are only read from the training dataset.
label_path = "./data/raw_data/train_dataset.parquet"
label_df = pd.read_parquet(label_path)

label = label_df['LABEL'].astype(int)

label_num = len(label_df)

test_path = "./data/raw_data/test_dataset.parquet"
test__df = pd.read_parquet(test_path)
test_num = len(test__df)

# Generate ID list
# Node indices 0..label_num-1 are split 80/20 into train/validation; the next
# test_num indices are treated as test nodes.
# NOTE(review): this assumes the feature table rows are ordered train-first,
# then test, row-for-row with the two datasets — confirm.
ids = list(range(label_num))
train_ids, valid_ids = train_test_split(ids, test_size=0.2, random_state=42)
test_ids = np.arange(label_num, label_num+test_num)

# Load data and convert to torch.Tensor
feature = feature.to_numpy()
# node_feature = feature / feature.sum(1, keepdims=True)  # Normalize data so that each row sums to 1
# Standardize each feature column; the scaler is fitted on all rows of the
# feature table (labelled and test nodes alike).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
node_feature = scaler.fit_transform(feature)

tensor_x = tensor_from_numpy(node_feature, DEVICE).float()
tensor_y = tensor_from_numpy(label.to_numpy(), DEVICE)
# The "mask" tensors hold integer node indices, not boolean masks.
tensor_train_mask = tensor_from_numpy(np.array(train_ids), DEVICE)
tensor_val_mask = tensor_from_numpy(np.array(valid_ids), DEVICE)
tensor_test_mask = tensor_from_numpy(np.array(test_ids), DEVICE)
normalize_adjacency = normalization(adjacency_matrix) 
# print("node_feature.shape",node_feature.shape)
num_nodes, input_dim = node_feature.shape
# Rebuild the normalized adjacency as a torch sparse COO tensor from its
# (row, col) coordinates and float32 values.
indices = torch.from_numpy(np.asarray([normalize_adjacency.row,
                                       normalize_adjacency.col]).astype('int64')).long()
values = torch.from_numpy(normalize_adjacency.data.astype(np.float32))
tensor_adjacency = torch.sparse_coo_tensor(indices, values,(num_nodes, num_nodes)).to(DEVICE)

In [None]:
class GraphConvolution(nn.Module):
    """Single graph-convolution layer: ``adjacency @ (input_feature @ weight)``
    plus an optional bias."""

    def __init__(self, input_dim, output_dim, use_bias=True):
        r"""Graph Convolution: L*X*\theta

        Args:
        ----------
            input_dim: int
                Dimension of input features for nodes
            output_dim: int
                Dimension of output features
            use_bias : bool, optional
                Whether to use bias
        """
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.use_bias = use_bias
        # Allocate uninitialized storage; reset_parameters() fills it in.
        self.weight = nn.Parameter(torch.empty(input_dim, output_dim))
        if self.use_bias:
            self.bias = nn.Parameter(torch.empty(output_dim))
        else:
            # Register the name so ``self.bias`` exists and is None.
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        """Kaiming-uniform initialize the weight and zero the bias (if any)."""
        init.kaiming_uniform_(self.weight)
        if self.use_bias:
            init.zeros_(self.bias)

    def forward(self, adjacency, input_feature):
        """The adjacency matrix is a sparse matrix, so sparse matrix multiplication is used in computation

        Args:
        -------
            adjacency: torch.sparse.FloatTensor
                Adjacency matrix
            input_feature: torch.Tensor
                Input features

        Returns:
        -------
            torch.Tensor with one row per node and ``output_dim`` columns
        """
        # Project the features first, then aggregate over the sparse graph.
        support = torch.mm(input_feature, self.weight)
        output = torch.sparse.mm(adjacency, support)
        if self.use_bias:
            output += self.bias
        return output

    def __repr__(self):
        return f"{self.__class__.__name__} ({self.input_dim} -> {self.output_dim})"



In [None]:
import torch.nn as nn
import torch.nn.functional as F

class GcnNet(nn.Module):
    """
    Four stacked GraphConvolution layers (input_dim -> 64 -> 32 -> 16 -> 2);
    the first three are each followed by ReLU and dropout, the last one
    returns the raw logits.
    """
    def __init__(self, input_dim, dropout_rate=0.1):
        super().__init__()
        self.gcn1 = GraphConvolution(input_dim, 64)
        self.gcn2 = GraphConvolution(64, 32)
        self.gcn3 = GraphConvolution(32, 16)
        self.gcn4 = GraphConvolution(16, 2)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, adjacency, feature):
        hidden = feature
        # Hidden layers share the same pattern: convolve, ReLU, dropout.
        for layer in (self.gcn1, self.gcn2, self.gcn3):
            hidden = self.dropout(F.relu(layer(adjacency, hidden)))
        return self.gcn4(adjacency, hidden)

In [None]:
# Training hyper-parameters.
LEARNING_RATE = 0.001  # step size passed to Adam below
# LEARNING_RATE = 0.05
WEIGHT_DECAY = 5e-4  # weight_decay passed to Adam below
EPOCHS = 2000  # number of full-graph training iterations in train()

# Model definition: model, loss, optimizer
model = GcnNet(input_dim).to(DEVICE)
criterion = nn.CrossEntropyLoss().to(DEVICE)

optimizer = optim.Adam(model.parameters(),
                       lr=LEARNING_RATE,
                       weight_decay=WEIGHT_DECAY)
# optimizer = optim.SGD(model.parameters(),
#             lr=LEARNING_RATE,
#             momentum=0.9,
#             weight_decay=WEIGHT_DECAY)

In [None]:
def test(mask):
    """Evaluate the global model on the nodes selected by ``mask``.

    Switches the model to eval mode and runs a gradient-free forward pass
    over the whole graph.

    Returns:
        A tuple ``(accuracy, logits, labels)``: the mean accuracy as a tensor,
        the selected logits as a NumPy array and the selected entries of
        ``tensor_y`` as a NumPy array.
    """
    model.eval()
    with torch.no_grad():
        all_logits = model(tensor_adjacency, tensor_x)
        masked_logits = all_logits[mask]
        _, predicted = masked_logits.max(1)
        hits = (predicted == tensor_y[mask]).float()
        accuracy = hits.mean()
    masked_labels = tensor_y[mask]
    return accuracy, masked_logits.cpu().numpy(), masked_labels.cpu().numpy()

In [None]:
def train():
    """Train the global model for ``EPOCHS`` full-graph iterations.

    After every optimizer step the training and validation accuracy are
    measured with ``test``; whenever the validation accuracy improves, the
    model weights are saved to ``best_model.pth``.

    Returns:
        A tuple ``(loss_history, val_acc_history)`` with one float per epoch.
    """
    loss_history = []
    val_acc_history = []
    train_y = tensor_y[tensor_train_mask]

    val_best = 0
    best_model_path = "best_model.pth"

    for epoch in range(EPOCHS):
        # test() leaves the model in eval mode, so training mode (and with it
        # dropout) has to be re-enabled at the start of every epoch.
        model.train()
        logits = model(tensor_adjacency, tensor_x)  # Forward propagation
        train_mask_logits = logits[tensor_train_mask]   # Only select training nodes for supervision
        loss = criterion(train_mask_logits, train_y)    # Calculate loss
        optimizer.zero_grad()
        loss.backward()     # Backward propagation to compute gradients
        optimizer.step()    # Apply the gradients to update the parameters
        train_acc, _, _ = test(tensor_train_mask)     # Calculate training accuracy
        val_acc, _, _ = test(tensor_val_mask)     # Calculate validation accuracy
        if val_acc.item() > val_best:
            # Checkpoint the best model seen so far on the validation split.
            val_best = val_acc.item()
            print("new val", val_best)
            torch.save(model.state_dict(), best_model_path)
        # Record loss and accuracy during training for plotting
        loss_history.append(loss.item())
        val_acc_history.append(val_acc.item())
        print("Epoch {:03d}: Loss {:.4f}, TrainAcc {:.4}, ValAcc {:.4f}".format(
            epoch, loss.item(), train_acc.item(), val_acc.item()))

    print("val_best", val_best)
    return loss_history, val_acc_history


In [None]:
# Run training; keeps the per-epoch loss and validation-accuracy histories.
loss, val_acc = train()


In [None]:
import torch
import pandas as pd
import os

# Define inference function
def inference():
    """Predict the test nodes with the best checkpoint and write ``pred.csv``.

    Loads ``best_model.pth`` into the global model, runs a gradient-free
    forward pass over the whole graph, takes the arg-max class of the test
    nodes and stores it next to their addresses.
    """
    # Restore the best checkpoint and switch to eval mode.
    best_model_path = "best_model.pth"
    state = torch.load(best_model_path)
    model.load_state_dict(state)
    model.eval()

    # Forward pass over the full graph; keep only the test nodes.
    with torch.no_grad():
        all_logits = model(tensor_adjacency, tensor_x)
        _, predict_y = all_logits[tensor_test_mask].max(1)

    # Pair each test node's address with its predicted class and save as CSV.
    test_rows = tensor_test_mask.cpu().numpy()
    submission = pd.DataFrame({
        'ADDRESS': feature_df['ADDRESS'].iloc[test_rows],
        'PRED': predict_y.cpu().numpy()
    })
    submission.to_csv("pred.csv", index=False)

# Run inference with the best checkpoint; writes pred.csv to the working directory.
inference()

In [None]:
import os
os.chdir('/content')

In [None]:
!zip -r sybil_demo_pipeline.zip /content/sybil_demo_pipeline -x '/content/sybil_demo_pipeline/data/*' -x '/content/sybil_demo_pipeline/saved_model/*'