In [1]:
import os

ROOT = os.getcwd()

In [2]:
import warnings 
warnings.filterwarnings('ignore')

In [76]:
import os.path as osp

import torch
from sklearn.metrics import roc_auc_score

import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv
from torch_geometric.utils import negative_sampling

if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

transform = T.Compose([
    T.NormalizeFeatures(),
    T.ToDevice(device),
    T.RandomLinkSplit(num_val=0.05, num_test=0.1, is_undirected=True,
                      add_negative_train_samples=False),
])
path = osp.join('./', '..', 'data', 'Planetoid')
dataset = Planetoid(path, name='Cora', transform=transform)
# After applying the `RandomLinkSplit` transform, the data is transformed from
# a data object to a list of tuples (train_data, val_data, test_data), with
# each element representing the corresponding split.
train_data, val_data, test_data = dataset[0]

In [3]:
import polars as pl
from typing import Dict

class Graph:
    
    def __init__(
        self,
        nodes: pl.DataFrame,
        edges: pl.DataFrame,
        map_id: Dict[int, int]    
    ) -> None:
        
        self.nodes = nodes
        self.edges = edges
        self.map_id = map_id

In [4]:
import torch
from torch_geometric.data import Data
from typing import Dict, List
import networkx as nx
import networkit as nk
import numpy as np
import polars as pl
import pandas as pd




class DataNetWork:
    
    def __init__(
        self, 
        df_features: pl.DataFrame, 
        df_edges: pl.DataFrame, 
        df_classes: pl.DataFrame, 
        train_mask: np.array, 
        val_mask: np.array, 
        test_mask: np.array, 
        directed: bool = False
    ):
        
        self.df_features = df_features
        self.df_edges = df_edges
        self.df_classes = df_classes
        self.directed = directed
        
        self.graph: Graph = self._set_up_network_info()

        self.fraud_dict = dict(
            zip(
                pl.from_pandas(df_features["transid"].to_pandas().map(self.graph.map_id)),
                df_features["class"]
                )
            )
        
        self.train_mask = train_mask
        self.val_mask = val_mask
        self.test_mask = test_mask
        
        
        
    def _set_up_network_info(self) -> Graph:
        nodes = self.df_features.select(
            pl.col('transid')
        )
        
        map_id = {i:j for i,j in enumerate((nodes
                                            .to_series()
                                            .to_list()))} 
        
        edges = self.df_edges.select(
            pl.col('current_transid'),
            pl.col('next_transid')
        )

        if not self.directed:
            map_id = {j:i for i,j in enumerate((nodes
                                            .to_series()
                                            .to_list()))} 
            
            # nodes = nodes.with_columns(
            #     pl.col('transid').map_dict(map_id).cast(pl.Int64)
            # )
            
            nodes = nodes.to_pandas()
            nodes['transid'] = nodes['transid'].map(map_id).astype(np.int64)
            nodes = pl.from_pandas(nodes)
            
            # edges = edges.with_columns(
            #     pl.col('current_transid').map_dict(map_id).cast(pl.Int64),
            #     pl.col('next_transid').map_dict(map_id).cast(pl.Int64)
            # )
            
            edges = edges.to_pandas()
            
            edges_direct = edges[['current_transid', 'next_transid']]
            edges_reverse = edges_direct[['next_transid', 'current_transid']]
            edges_reverse.columns = ['current_transid', 'next_transid']
            
            edges = pd.concat([edges_direct, edges_reverse], axis=0)
            
            edges['current_transid'] = edges['current_transid'].map(map_id).astype(np.int64)
            edges['next_transid'] = edges['next_transid'].map(map_id).astype(np.int64)
            edges = pl.from_pandas(edges)
            

        
        return Graph(
            nodes=nodes,
            edges=edges,
            map_id=map_id
        )
        
        
        
    def get_network_nx(self) -> nx.DiGraph:
        edges_zipped = zip(self.graph.edges['current_transid'], self.graph.edges['next_transid'])
        
        if self.directed:
            G_nx = nx.DiGraph()
        else: 
            G_nx = nx.Graph()
        
        G_nx.add_nodes_from(self.graph.nodes)
        G_nx.add_edges_from(edges_zipped)
        
        return G_nx     
            
            
            
    def get_network_nk(self) -> nx.DiGraph:
        edges_zipped = zip(self.graph.edges['current_transid'], self.graph.edges['next_transid'])
        
        G_nk = nk.Graph(len(self.graph.nodes), directed = self.directed)
        
        for u,v in edges_zipped:
            G_nk.addEdge(u,v)
            
        return G_nk 
        
        
        
    def get_network_torch(self) -> Data:
        labels = self.df_features['class']
        features = self.df_features.to_pandas().drop(columns=['transid', 'class'])
        
        x = torch.tensor(np.array(features.to_numpy(), dtype=float), dtype=torch.float)
        if x.size()[1] == 0:
            x = torch.ones(x.size()[0], 1)
        
        x = x[:, 1:94]
        y = torch.tensor(np.array(labels.to_numpy(), dtype=np.int64), dtype=torch.int64)
        
        # Reformat and convert to tensor
        edge_index = np.array(self.graph.edges.to_numpy()).T 
        edge_index = torch.tensor(edge_index, dtype=torch.long)
        
        #create weights tensor with same shape of edge_index
        weights = torch.tensor([1]* edge_index.shape[1] , dtype=torch.float) 
        
        # Create pyG dataset
        data = Data(x=x, y=y, edge_index=edge_index)

        if self.train_mask is not None:
            data.train_mask = torch.tensor(self.train_mask, dtype=torch.bool)
        if self.val_mask is not None:
            data.val_mask = torch.tensor(self.val_mask, dtype=torch.bool)
        if self.test_mask is not None:
            data.test_mask = torch.tensor(self.test_mask, dtype=torch.bool)
        
        return data 
    
    
    
    def get_features(
            self, 
            full=False
        ) -> pl.DataFrame:
        
        if full:
            X = self.df_features[self.df_features.columns[2: 167]]
        else:
            X = self.df_features[self.df_features.columns[2: 95]]
            
        return X
    
    
    
    def get_features_torch(
        self, 
        full=False
    ) -> torch.tensor:
        
        X = self.get_features(full)
        X = torch.tensor(X.to_numpy(), dtype=torch.float32)
        
        return(X)



    def get_train_test_split_intrinsic(
        self, 
        train_mask: np.array, 
        test_mask: np.array, 
        device: str = 'cpu'
    ) -> List[torch.tensor]:
        
        X: pl.DataFrame = self.get_features()
        y: pl.Series = self.df_features['class']

        X_train = X.filter(
            pl.Series(train_mask.tolist())
        )
        y_train = y.filter(
            pl.Series(train_mask.tolist())
        )

        X_test = X.filter(
            pl.Series(test_mask.tolist())
        )
        y_test = y.filter(
            pl.Series(test_mask.tolist())
        )

        X_train = torch.tensor(X_train.to_numpy(), dtype=torch.float32).to(device)
        y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long).to(device)

        X_test = torch.tensor(X_test.to_numpy(), dtype=torch.float32).to(device)
        y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long).to(device)

        return X_train, y_train, X_test, y_test



    def get_fraud_dict(self) -> Dict[int, int]:
        return self.fraud_dict
    
    
    
    def get_masks(self) -> List[np.array]:
        return self.train_mask, self.val_mask, self.test_mask

In [5]:
FIRST_FEAT_NAME = {
        'column_1': 'transid',
        'column_2': 'time_steps',
}

CONFIG_FILE = 'conf/development.yml'

In [6]:
class EllipticLoader:
    
    def __init__(
        self,
        path_features: str,
        path_edgelist: str,
        path_classes: str
    ) -> None:
        
        self.path_features = path_features
        self.path_edgelist = path_edgelist
        self.path_classes = path_classes
    
    

    def load(self) -> DataNetWork:
        
        feat_df = pl.read_csv(
            self.path_features, 
            has_header=False
        )
    
        second_feat_name = {f'column_{i}': f'feature_{i-2}' for i in range(3, feat_df.shape[1] + 1)}
        converted_feature_names = {**FIRST_FEAT_NAME, **second_feat_name}
        feat_df = feat_df.rename(converted_feature_names)

        edge_df = pl.read_csv(
            self.path_edgelist, 
            new_columns=['current_transid', 'next_transid']
        
        )
        class_df = pl.read_csv(
            self.path_classes,
            new_columns=['transid', 'class']
        )

        mapping = {'unknown': 2, '1': 1, '2': 0}
        mapper = pl.DataFrame({
            "class": list(mapping.keys()),
            "new_class": list(mapping.values())
        })
        class_df = class_df.join(mapper, on='class', how='left').drop('class').rename({'new_class': 'class'})
        feat_df = feat_df.join(class_df, on='transid', how='left')
        y = torch.from_numpy(class_df['class'].to_numpy())

        # Timestamp based split:
        time_step = torch.from_numpy(feat_df['time_steps'].to_numpy())
        train_mask = (time_step < 30) & (y != 2)
        val_mask = (time_step >= 30) & (time_step < 40) & (y != 2) 
        test_mask = (time_step >= 40) & (y != 2)

        network = DataNetWork(
            feat_df, 
            edge_df, 
            class_df,
            train_mask=train_mask, 
            val_mask=val_mask, 
            test_mask=test_mask
        )

        return network

In [7]:
e = EllipticLoader(
    path_classes='/Users/phamminhlong/Desktop/paper/data/elliptic_bitcoin_dataset/elliptic_txs_classes.csv',
    path_edgelist='/Users/phamminhlong/Desktop/paper/data/elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv',
    path_features='/Users/phamminhlong/Desktop/paper/data/elliptic_bitcoin_dataset/elliptic_txs_features.csv'
)

In [8]:
network = e.load()

In [9]:
data = network.get_network_torch()

In [10]:
import os.path as osp

import torch
from sklearn.metrics import roc_auc_score

import torch_geometric.transforms as T
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv
from torch_geometric.utils import negative_sampling

In [11]:
if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [12]:
class Net(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()


model = Net(93, 128, 64).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

In [97]:
edges = data.edge_index

In [14]:
edges = network.df_edges
feats = network.df_features
label = network.df_classes

In [98]:
edges = pl.DataFrame(edges.detach().cpu().numpy(), schema=['current_transid', 'next_transid'])

In [99]:
edges = edges.rename({'current_transid': 'transid'})

In [102]:
edges.join(label, on='transid', how='left')

transid,next_transid,class
i64,i64,i64
0,1,
2,3,
4,5,
6,7,
8,9,
…,…,…
201430,203602,
203099,203603,
202042,201921,
201368,201480,


In [95]:
edges = edges.rename({'current_transid': 'transid'})
edges = edges.join(label, on='transid', how='left')
# edges = edges.rename({'transid': 'current_transid', 'class': 'current_class', 'next_transid': 'transid'})
# edges = edges.join(label, on='transid', how='left')
# edges = edges.rename({'transid': 'next_transid', 'class': 'next_class'})
edges

transid,next_transid,class
i64,i64,i64
0,1,
2,3,
4,5,
6,7,
8,9,
…,…,…
201430,203602,
203099,203603,
202042,201921,
201368,201480,


In [51]:
edge_label = edges.with_columns(
    pl.when(pl.col('current_class') == pl.col('next_class'))
    .then(1)
    .otherwise(0)
    .alias('edge_label')
).drop(
    ['current_class', 'next_class']
)

In [52]:
edge_index_label = torch.from_numpy(edge_label.select(['current_transid', 'next_transid']).to_numpy()).reshape(2, -1)
edge_label = torch.from_numpy(edge_label.select(['edge_label']).to_numpy()).t()

In [18]:
data.x.shape

torch.Size([203769, 93])

In [56]:
neg_edge_index = negative_sampling(
    edge_index=data.edge_index, num_nodes=203769,
    num_neg_samples=edge_index_label.size(1), method='sparse'
)

In [57]:
neg_edge_index

tensor([[192583, 107956,  85579,  ..., 140018,  26402,  95249],
        [ 26428,  12138, 177168,  ..., 175080,  71810, 172442]])

In [21]:
data.x.shape, data.edge_index.shape

(torch.Size([203769, 93]), torch.Size([2, 468710]))