In [1]:
import os

ROOT = os.getcwd()

In [2]:
import warnings 
warnings.filterwarnings('ignore')

In [3]:
import polars as pl
from typing import Dict

class Graph:
    
    def __init__(
        self,
        nodes: pl.DataFrame,
        edges: pl.DataFrame,
        map_id: Dict[int, int]    
    ) -> None:
        
        self.nodes = nodes
        self.edges = edges
        self.map_id = map_id

In [5]:
import torch
from torch_geometric.data import Data
from typing import Dict, List
import networkx as nx
import networkit as nk
import numpy as np
import polars as pl
import pandas as pd




class DataNetWork:
    
    def __init__(
        self, 
        df_features: pl.DataFrame, 
        df_edges: pl.DataFrame, 
        df_classes: pl.DataFrame, 
        train_mask: np.array, 
        val_mask: np.array, 
        test_mask: np.array, 
        directed: bool = False
    ):
        
        self.df_features = df_features
        self.df_edges = df_edges
        self.df_classes = df_classes
        self.directed = directed
        
        self.graph: Graph = self._set_up_network_info()

        self.fraud_dict = dict(
            zip(
                pl.from_pandas(df_features["transid"].to_pandas().map(self.graph.map_id)),
                df_features["class"]
                )
            )
        
        self.train_mask = train_mask
        self.val_mask = val_mask
        self.test_mask = test_mask
        
        
        
    def _set_up_network_info(self) -> Graph:
        nodes = self.df_features.select(
            pl.col('transid')
        )
        
        map_id = {i:j for i,j in enumerate((nodes
                                            .to_series()
                                            .to_list()))} 
        
        edges = self.df_edges.select(
            pl.col('current_transid'),
            pl.col('next_transid')
        )

        if not self.directed:
            map_id = {j:i for i,j in enumerate((nodes
                                            .to_series()
                                            .to_list()))} 
            
            # nodes = nodes.with_columns(
            #     pl.col('transid').map_dict(map_id).cast(pl.Int64)
            # )
            
            nodes = nodes.to_pandas()
            nodes['transid'] = nodes['transid'].map(map_id).astype(np.int64)
            nodes = pl.from_pandas(nodes)
            
            # edges = edges.with_columns(
            #     pl.col('current_transid').map_dict(map_id).cast(pl.Int64),
            #     pl.col('next_transid').map_dict(map_id).cast(pl.Int64)
            # )
            
            edges = edges.to_pandas()
            
            edges_direct = edges[['current_transid', 'next_transid']]
            edges_reverse = edges_direct[['next_transid', 'current_transid']]
            edges_reverse.columns = ['current_transid', 'next_transid']
            
            edges = pd.concat([edges_direct, edges_reverse], axis=0)
            
            edges['current_transid'] = edges['current_transid'].map(map_id).astype(np.int64)
            edges['next_transid'] = edges['next_transid'].map(map_id).astype(np.int64)
            edges = pl.from_pandas(edges)
            

        
        return Graph(
            nodes=nodes,
            edges=edges,
            map_id=map_id
        )
        
        
        
    def get_network_nx(self) -> nx.DiGraph:
        edges_zipped = zip(self.graph.edges['current_transid'], self.graph.edges['next_transid'])
        
        if self.directed:
            G_nx = nx.DiGraph()
        else: 
            G_nx = nx.Graph()
        
        G_nx.add_nodes_from(self.graph.nodes)
        G_nx.add_edges_from(edges_zipped)
        
        return G_nx     
            
            
            
    def get_network_nk(self) -> nx.DiGraph:
        edges_zipped = zip(self.graph.edges['current_transid'], self.graph.edges['next_transid'])
        
        G_nk = nk.Graph(len(self.graph.nodes), directed = self.directed)
        
        for u,v in edges_zipped:
            G_nk.addEdge(u,v)
            
        return G_nk 
        
        
        
    def get_network_torch(self) -> Data:
        labels = self.df_features['class']
        features = self.df_features.to_pandas().drop(columns=['transid', 'class'])
        
        x = torch.tensor(np.array(features.to_numpy(), dtype=float), dtype=torch.float)
        if x.size()[1] == 0:
            x = torch.ones(x.size()[0], 1)
        
        x = x[:, 1:94]
        y = torch.tensor(np.array(labels.to_numpy(), dtype=np.int64), dtype=torch.int64)
        
        # Reformat and convert to tensor
        edge_index = np.array(self.graph.edges.to_numpy()).T 
        edge_index = torch.tensor(edge_index, dtype=torch.long)
        
        #create weights tensor with same shape of edge_index
        weights = torch.tensor([1]* edge_index.shape[1] , dtype=torch.float) 
        
        # Create pyG dataset
        data = Data(x=x, y=y, edge_index=edge_index)

        if self.train_mask is not None:
            data.train_mask = torch.tensor(self.train_mask, dtype=torch.bool)
        if self.val_mask is not None:
            data.val_mask = torch.tensor(self.val_mask, dtype=torch.bool)
        if self.test_mask is not None:
            data.test_mask = torch.tensor(self.test_mask, dtype=torch.bool)
        
        return data 
    
    
    
    def get_features(
            self, 
            full=False
        ) -> pl.DataFrame:
        
        if full:
            X = self.df_features[self.df_features.columns[2: 167]]
        else:
            X = self.df_features[self.df_features.columns[2: 95]]
            
        return X
    
    
    
    def get_features_torch(
        self, 
        full=False
    ) -> torch.tensor:
        
        X = self.get_features(full)
        X = torch.tensor(X.to_numpy(), dtype=torch.float32)
        
        return(X)



    def get_train_test_split_intrinsic(
        self, 
        train_mask: np.array, 
        test_mask: np.array, 
        device: str = 'cpu'
    ) -> List[torch.tensor]:
        
        X: pl.DataFrame = self.get_features()
        y: pl.Series = self.df_features['class']

        X_train = X.filter(
            pl.Series(train_mask.tolist())
        )
        y_train = y.filter(
            pl.Series(train_mask.tolist())
        )

        X_test = X.filter(
            pl.Series(test_mask.tolist())
        )
        y_test = y.filter(
            pl.Series(test_mask.tolist())
        )

        X_train = torch.tensor(X_train.to_numpy(), dtype=torch.float32).to(device)
        y_train = torch.tensor(y_train.to_numpy(), dtype=torch.long).to(device)

        X_test = torch.tensor(X_test.to_numpy(), dtype=torch.float32).to(device)
        y_test = torch.tensor(y_test.to_numpy(), dtype=torch.long).to(device)

        return X_train, y_train, X_test, y_test



    def get_fraud_dict(self) -> Dict[int, int]:
        return self.fraud_dict
    
    
    
    def get_masks(self) -> List[np.array]:
        return self.train_mask, self.val_mask, self.test_mask

In [6]:
FIRST_FEAT_NAME = {
        'column_1': 'transid',
        'column_2': 'time_steps',
}

CONFIG_FILE = 'conf/development.yml'

In [7]:
class EllipticLoader:
    
    def __init__(
        self,
        path_features: str,
        path_edgelist: str,
        path_classes: str
    ) -> None:
        
        self.path_features = path_features
        self.path_edgelist = path_edgelist
        self.path_classes = path_classes
    
    

    def load(self) -> DataNetWork:
        
        feat_df = pl.read_csv(
            self.path_features, 
            has_header=False
        )
    
        second_feat_name = {f'column_{i}': f'feature_{i-2}' for i in range(3, feat_df.shape[1] + 1)}
        converted_feature_names = {**FIRST_FEAT_NAME, **second_feat_name}
        feat_df = feat_df.rename(converted_feature_names)

        edge_df = pl.read_csv(
            self.path_edgelist, 
            new_columns=['current_transid', 'next_transid']
        
        )
        class_df = pl.read_csv(
            self.path_classes,
            new_columns=['transid', 'class']
        )

        mapping = {'unknown': 2, '1': 1, '2': 0}
        mapper = pl.DataFrame({
            "class": list(mapping.keys()),
            "new_class": list(mapping.values())
        })
        class_df = class_df.join(mapper, on='class', how='left').drop('class').rename({'new_class': 'class'})
        feat_df = feat_df.join(class_df, on='transid', how='left')
        y = torch.from_numpy(class_df['class'].to_numpy())

        # Timestamp based split:
        time_step = torch.from_numpy(feat_df['time_steps'].to_numpy())
        train_mask = (time_step < 30) & (y != 2)
        val_mask = (time_step >= 30) & (time_step < 40) & (y != 2) 
        test_mask = (time_step >= 40) & (y != 2)

        network = DataNetWork(
            feat_df, 
            edge_df, 
            class_df,
            train_mask=train_mask, 
            val_mask=val_mask, 
            test_mask=test_mask
        )

        return network

In [8]:
e = EllipticLoader(
    path_classes='/Users/phamminhlong/Desktop/paper/data/elliptic_bitcoin_dataset/elliptic_txs_classes.csv',
    path_edgelist='/Users/phamminhlong/Desktop/paper/data/elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv',
    path_features='/Users/phamminhlong/Desktop/paper/data/elliptic_bitcoin_dataset/elliptic_txs_features.csv'
)

In [9]:
network = e.load()

In [10]:
data = network.get_network_torch()

In [13]:
if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [None]:
class Net(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)

    def decode(self, z, edge_label_index):
        return (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)

    def decode_all(self, z):
        prob_adj = z @ z.t()
        return (prob_adj > 0).nonzero(as_tuple=False).t()


model = Net(93, 128, 64).to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

In [None]:
import torch.nn as nn

class DecoderLinear(nn.Module):
    def __init__(
            self, 
            embedding_dim: int, 
            output_dim: int=2
            ):
        super().__init__()
        self.layer_norm = nn.LayerNorm(embedding_dim)
        self.layer1 = nn.Linear(embedding_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(
        self, 
        embedding, 
        normalize=False
    ):
        if normalize:
            embedding = self.layer_norm(embedding)
        h = self.layer1(embedding)
        h = self.softmax(h)
        return h

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv


class GCN(nn.Module):
    def __init__(
            self,
            num_features: int, 
            hidden_dim: int, 
            embedding_dim: int, 
            output_dim: int= 1, 
            n_layers: int = 3, 
            dropout_rate: float = 0
            ):
        super().__init__()
        self.num_features = num_features
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.output_dim = output_dim
        self.dropout_rate = dropout_rate
        self.dropout = nn.Dropout(dropout_rate)
        self.gcn_hidden = nn.ModuleList()
        self.n_layers = n_layers

        if n_layers == 1:
            self.gcn1 = GCNConv(num_features, embedding_dim)
        else:
            self.gcn1 = GCNConv(num_features, hidden_dim)
            for _ in range(n_layers - 2): 
                self.gcn_hidden.append(GCNConv(hidden_dim, hidden_dim))
            self.gcn2 = GCNConv(hidden_dim, embedding_dim)
            
        self.out = DecoderLinear(self.embedding_dim, self.output_dim)
        
    def encode(
        self, 
        x: torch.Tensor, 
        edge_index: torch.Tensor
    ) -> torch.Tensor:
        
        h = self.gcn1(x, edge_index)
        h = F.relu(h)
        h = self.dropout(h)
        if self.n_layers > 1:
            for layer in self.gcn_hidden:
                h = layer(h, edge_index)
                h = F.relu(h)
                h = self.dropout(h)
            h = self.gcn2(h, edge_index)
        
        return h
    
    def node_classification(
        self, 
        z: torch.Tensor
    ) -> torch.Tensor:
        
        output = self.out(z)
        
        return output
        
        
    def link_prediction(
        self,
        z: torch.Tensor,
        edge_label_index: torch.Tensor
        ) -> torch.Tensor:

        output = (z[edge_label_index[0]] * z[edge_label_index[1]]).sum(dim=-1)
        
        return output
    
    def forward(self):
        pass
        


In [None]:
import torch
import torch.nn as nn
from typing import List
import torch.nn.functional as F


class AdaDWLoss(nn.Module):
    
    def __init__(
        self, 
        T: float
    ) -> None:
        self.T = T
    
    def forward(
        self, 
        loss_trains: List[torch.Tensor], 
        loss_vals: List[torch.Tensor]
    ) -> torch.Tensor:
        
        weights = 1 - loss_trains / loss_vals
        lambda_coeff = torch.exp(weights / self.T) / torch.sum(torch.exp(weights / self.T))
        
        loss_tasks = torch.dot(lambda_coeff, loss_trains)
        
        return loss_tasks

In [48]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class FocalLoss(nn.Module):
    def __init__(
        self, 
        gamma=0, 
        alpha=None, 
        size_average=True
    ) -> None:
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha,(float,int)): self.alpha = torch.Tensor([alpha,1-alpha])
        if isinstance(alpha,list): self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(
        self, 
        input: torch.Tensor, 
        target: torch.Tensor
    ) -> torch.Tensor:
        if input.dim()>2:
            input = input.view(input.size(0),input.size(1),-1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1,2)    # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1,input.size(2))   # N,H*W,C => N*H*W,C
        target = target.view(-1,1)

        logpt = F.log_softmax(input)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = Variable(logpt.data.exp())

        if self.alpha is not None:
            if self.alpha.type()!=input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            at = self.alpha.gather(0,target.data.view(-1))
            logpt = logpt * Variable(at)

        loss = -1 * (1-pt)**self.gamma * logpt
        if self.size_average: return loss.mean()
        else: return loss.sum()

In [None]:
split = T.RandomLinkSplit(
    num_val=0.05,
    num_test=0.1,
    is_undirected=True,
    add_negative_train_samples=False,
    neg_sampling_ratio=1.0,
)

In [12]:
train, val, test = split(data)

NameError: name 'split' is not defined

In [None]:
neg_edge_index = negative_sampling(
            edge_index=train.edge_index, num_nodes=train.num_nodes,
            num_neg_samples=train.edge_label_index.size(1), method='sparse')

edge_label_index = torch.cat(
    [train.edge_label_index, neg_edge_index],
    dim=-1,
)
edge_label = torch.cat([
    train.edge_label,
    train.edge_label.new_zeros(neg_edge_index.size(1))
], dim=0)

In [None]:
hidden_dim=217
embedding_dim=87
n_layers=3
n_features=93
output_dim=2
dropout_rate=0.057
heads=5

lr=0.0864
epochs=500
batch_size=128

In [None]:
gcn = GCN(
        num_features=n_features,
        hidden_dim=hidden_dim,
        embedding_dim=embedding_dim,
        output_dim=output_dim,
        n_layers=n_layers,
        dropout_rate=dropout_rate
)

In [None]:
z = gcn.encode(train.x, train.edge_index)
out = gcn.link_prediction(z, edge_label_index)

In [None]:
from torch_geometric.loader import NeighborLoader
from multiprocessing import Pool

loader = NeighborLoader(
                data=train, 
                num_neighbors=[-1]*n_layers, 
                input_nodes=train.train_mask, 
                batch_size=batch_size, 
                shuffle=True, 
                num_workers=Pool()._processes
            )

In [None]:
optimizer = torch.optim.SGD(gcn.parameters(), lr=lr)
adadw = AdaDWLoss(T=2)

In [None]:
device = 'cpu'

In [None]:
for epoch in range(0, epochs):

        
    gcn.train()
    optimizer.zero_grad()
    
    z = gcn.encode(train.x.to(device), train.edge_index.to(device))
    
    out_nc = gcn.node_classification(z)
    
    neg_edge_index = negative_sampling(
                edge_index=train.edge_index.to(device), 
                num_nodes=train.num_nodes,
                num_neg_samples=train.edge_label_index.size(1), method='sparse'
            )

    edge_label_index = torch.cat(
        [train.edge_label_index, neg_edge_index],
        dim=-1,
    ).to(device)
    edge_label = torch.cat([
        train.edge_label.squeeze(dim=0),
        train.edge_label.new_zeros(neg_edge_index.size(1))
    ], dim=0).to(device)
    

    out_lp = gcn.link_prediction(z, edge_label_index).view(-1)
    
    break

torch.Size([398406])
torch.Size([398406])
torch.Size([203769, 87])
torch.Size([398406, 87])
torch.Size([398406, 87])


In [49]:
criterion = FocalLoss(gamma=0.5)

In [52]:
torch.randint(0, 1, [203769], dtype=torch.int64)

tensor([0, 0, 0,  ..., 0, 0, 0])

In [59]:
criterion(torch.rand([203769, 2])[:, 1].reshape(-1, 1), torch.randint(0, 1, [203769, 1], dtype=torch.int64))

tensor(0.)