# Setup

In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import authentic_performance as ap
from scipy.stats import uniform, norm
import networkx as nx
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import torch, os
from sklearn.model_selection import train_test_split
import torch_geometric.utils as utils
from torch.utils.data import DataLoader
import torch_geometric as pyg
from torch_geometric.nn import GCNConv
import pickle

In [28]:
with open("data/synthetic-dataset/random-net-dumb-fraudster/test/0.0/G_0.pickle", "rb") as f:
        G = pickle.load(f)
N = G.number_of_nodes()
LEN_TRAIN_DATASET = len(list(os.walk("data/synthetic-dataset/random-net-no-fraudster/train"))[0][2])
LEN_TEST_DATASET = len(list(os.walk("data/synthetic-dataset/random-net-no-fraudster/test"))[0][2])
FRAUDSTER_INDEX = [x for x, y in nx.get_node_attributes(G, "fraud").items() if y][0]
LIST_FRAUD_PROBABILITIES = sorted([float(x) for x in list(os.walk(r"data/synthetic-dataset/random-net-dumb-fraudster/test"))[0][1]])
print(f"N = {N}\nLEN_TRAIN_DATASET = {LEN_TRAIN_DATASET}\nLEN_TEST_DATASET = {LEN_TEST_DATASET}\nFRAUDSTER_INDEX = {FRAUDSTER_INDEX}\nLIST_FRAUD_PROBABILITIES = {LIST_FRAUD_PROBABILITIES}")

N = 100
LEN_TRAIN_DATASET = 100
LEN_TEST_DATASET = 100
FRAUDSTER_INDEX = 50
LIST_FRAUD_PROBABILITIES = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]


In [4]:
l_ranking_performance = []

# 1 Random network

## 1.1 Ranking performance

In [5]:
# delta degrees

l_ranking_distances = []
l_spearman_r = []
a_ranking_true = np.arange(N)
for i in range(LEN_TEST_DATASET):
    with open(f"data/synthetic-dataset/random-net-no-fraudster/test/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    ratings = ap.rate_deltaDegrees(G)
    a_rankings = ap.argsort_robust(ratings)
    kendall_tau_distance = ap.kendall_tauDistance(a_ranking_true, a_rankings)
    l_ranking_distances.append(kendall_tau_distance)
    a_strengths = pd.Series(nx.get_node_attributes(G, "strength")).sort_index().values
    spearman_r = scipy.stats.spearmanr(ratings, a_strengths)
    l_spearman_r.append(spearman_r.correlation)
a_ranking_distances = np.array(l_ranking_distances)
a_spearman_r = np.array(l_spearman_r)
print(f"Kendall tau distance \t mean: {np.mean(a_ranking_distances):.1f}, std:{np.std(a_ranking_distances):.1f}\nSpearman correlation \t mean:{np.mean(a_spearman_r):.3f}, std:{np.std(a_spearman_r):.3f}")

l_ranking_performance.append({"network":"random",
                              "method":"delta degrees",
                              "kendall_tau_distance (mean)":np.mean(a_ranking_distances),
                              "kendall_tau_distance (std)":np.std(a_ranking_distances),
                              "spearman_r (mean)":np.mean(a_spearman_r),
                              "spearman_r (std)":np.std(a_spearman_r)})

Kendall tau distance 	 mean: 710.5, std:55.2
Spearman correlation 	 mean:0.897, std:0.015


In [6]:
# f_alpha_t

l_df = []
for i in range(LEN_TRAIN_DATASET):
    with open(f"data/synthetic-dataset/random-net-no-fraudster/train/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
        df = ap.convert_graphToDataFrame(G)
        l_df.append(df)
df = pd.concat(l_df)
X, y = torch.from_numpy(df[["wins","losses","draws"]].values).float(), torch.from_numpy(df["strength"].values).float()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = ap.WinLossDraw()
criterion = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train_loader = DataLoader(list(zip(X_train, y_train)), batch_size=128, shuffle=True)
l_param = []
l_train_loss = []
l_test_loss = []
for epoch in range(100):
    for i, (data, labels) in enumerate(train_loader):
        pred = model(data).flatten()
        loss_train = criterion(pred, labels)
        loss_train.backward()
        l_train_loss.append(loss_train.item())
        optimizer.step()
        optimizer.zero_grad()
    l_train_loss.append(loss_train.item())
    l_param.append((model.alpha.item(), model.t.item()))
    with torch.no_grad():
        pred = model(X_test).flatten()
        loss_test = criterion(pred, y_test)
        l_test_loss.append(loss_test.item())
        
l_ranking_distances = []
l_spearman_r = []
a_ranking_true = np.arange(N)
for i in range(LEN_TEST_DATASET):
    with open(f"data/synthetic-dataset/random-net-no-fraudster/test/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    t_ratings = model(torch.from_numpy(ap.convert_graphToDataFrame(G)[["wins", "losses", "draws"]].values))
    ratings = t_ratings.detach().numpy()
    a_rankings = ap.argsort_robust(ratings)
    kendall_tau_distance = ap.kendall_tauDistance(a_ranking_true, a_rankings)
    l_ranking_distances.append(kendall_tau_distance)
    a_strengths = pd.Series(nx.get_node_attributes(G, "strength")).sort_index().values
    spearman_r = scipy.stats.spearmanr(ratings, a_strengths)
    l_spearman_r.append(spearman_r.correlation)
a_ranking_distances = np.array(l_ranking_distances)
a_spearman_r = np.array(l_spearman_r)
print(f"Kendall tau distance \t mean: {np.mean(a_ranking_distances):.1f}, std:{np.std(a_ranking_distances):.1f}\nSpearman correlation \t mean:{np.mean(a_spearman_r):.3f}, std:{np.std(a_spearman_r):.3f}")

l_ranking_performance.append({"network":"random",
                              "method":"f_alpha_t",
                              "kendall_tau_distance (mean)":np.mean(a_ranking_distances),
                              "kendall_tau_distance (std)":np.std(a_ranking_distances),
                              "spearman_r (mean)":np.mean(a_spearman_r),
                              "spearman_r (std)":np.std(a_spearman_r)})

model_f_alpha_t = model  # save for later
del model

Kendall tau distance 	 mean: 737.6, std:81.7
Spearman correlation 	 mean:0.872, std:0.034


In [7]:
# MLP

class MLP(torch.nn.Module):
    def __init__(self, input_size, hidden_size) -> None:
        super().__init__()
        self.input_size = input_size
        self.hidden_size  = hidden_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(self.hidden_size, 1)
        self.sigmoid = torch.nn.Sigmoid()
    
    def forward(self, x):
        hidden = self.fc1(x)
        relu = self.relu(hidden)
        output = self.fc2(relu)
        output = self.sigmoid(output)
        return output

model = MLP(3, 100)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

n_epoch = 100

l_train_loss = []
l_test_loss = []
for epoch in range(n_epoch):
    model.train()
    for i, (data, labels) in enumerate(train_loader):
        pred = model(data).flatten()
        loss_train = criterion(pred, labels)
        loss_train.backward()
        l_train_loss.append(loss_train.item())
        optimizer.step()
        optimizer.zero_grad()
    model.eval()
    pred = model(X_test).flatten()
    loss_test = criterion(pred, y_test)
    l_test_loss.append(loss_test.item())
    
l_ranking_distances = []
l_spearman_r = []
a_ranking_true = np.arange(N)
for i in range(LEN_TEST_DATASET):
    with open(f"data/synthetic-dataset/random-net-no-fraudster/test/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    df = ap.convert_graphToDataFrame(G)
    ratings = model(torch.from_numpy(df[["wins", "losses", "draws"]].values).float()).flatten().detach().numpy()
    a_rankings = ap.argsort_robust(ratings)
    kendall_tau_distance = ap.kendall_tauDistance(a_ranking_true, a_rankings)
    l_ranking_distances.append(kendall_tau_distance)
    a_strengths = pd.Series(nx.get_node_attributes(G, "strength")).sort_index().values
    spearman_r = scipy.stats.spearmanr(ratings, a_strengths)
    l_spearman_r.append(spearman_r.correlation)
a_ranking_distances = np.array(l_ranking_distances)
a_spearman_r = np.array(l_spearman_r)
print(f"Kendall tau distance \t mean: {np.mean(a_ranking_distances):.1f}, std:{np.std(a_ranking_distances):.1f}\nSpearman correlation \t mean:{np.mean(a_spearman_r):.3f}, std:{np.std(a_spearman_r):.3f}")

l_ranking_performance.append({"network":"random",
                              "method":"MLP",
                              "kendall_tau_distance (mean)":np.mean(a_ranking_distances),
                              "kendall_tau_distance (std)":np.std(a_ranking_distances),
                              "spearman_r (mean)":np.mean(a_spearman_r),
                              "spearman_r (std)":np.std(a_spearman_r)})

model_mlp = model  # save for later
del model

Kendall tau distance 	 mean: 632.4, std:58.1
Spearman correlation 	 mean:0.913, std:0.016


In [8]:
# MLP neighborhood information

l_X, l_y = [], []
for i in range(LEN_TRAIN_DATASET):
    with open(f"data/synthetic-dataset/random-net-no-fraudster/train/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    adjacency = nx.adjacency_matrix(G).todense()
    adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense()
    wins = np.array(adjacency.T.sum(axis=1)).flatten()
    losses = np.array(adjacency.sum(axis=1)).flatten()
    draws = np.zeros_like(wins)
    neighbourhood_strength = np.array(np.dot(adjacency_undirected, adjacency.T).sum(axis=1)).flatten() / np.array(np.dot(adjacency_undirected, adjacency_undirected).sum(axis=1)).flatten()
    y = np.array(list(nx.get_node_attributes(G, "strength").values()))
    X = np.stack((wins, losses, draws, neighbourhood_strength), axis=1)
    l_X.append(X)
    l_y.append(y)
X = torch.tensor(np.concatenate(l_X), dtype=torch.float)
y = torch.tensor(np.concatenate(l_y), dtype=torch.float)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_loader = DataLoader(list(zip(X_train, y_train)), batch_size=128, shuffle=True)

model = MLP(4, 100)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

n_epoch = 100

l_train_loss = []
l_test_loss = []
for epoch in range(n_epoch):
    model.train()
    for i, (data, labels) in enumerate(train_loader):
        pred = model(data).flatten()
        loss_train = criterion(pred, labels)
        loss_train.backward()
        l_train_loss.append(loss_train.item())
        optimizer.step()
        optimizer.zero_grad()
    model.eval()
    pred = model(X_test).flatten()
    loss_test = criterion(pred, y_test)
    l_test_loss.append(loss_test.item())

# fig, ax = plt.subplots(1,2)
# plt.suptitle("Losses")
# ax[0].plot(l_train_loss, label="train")
# ax[0].set_title("Train")
# ax[1].plot(l_test_loss, label="test")
# ax[1].set_title("Test")

l_ranking_distances = []
l_spearman_r = []
a_ranking_true = np.arange(N)
for i in range(LEN_TEST_DATASET):
    with open(f"data/synthetic-dataset/random-net-no-fraudster/test/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    adjacency = nx.adjacency_matrix(G).todense();
    adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
    wins = np.array(adjacency.T.sum(axis=1)).flatten()
    losses = np.array(adjacency.sum(axis=1)).flatten()
    draws = np.zeros_like(wins)
    neighbourhood_strength = np.array(np.dot(adjacency_undirected, adjacency.T).sum(axis=1)).flatten() / np.array(np.dot(adjacency_undirected, adjacency_undirected).sum(axis=1)).flatten()
    y = np.array(list(nx.get_node_attributes(G, "strength").values()))
    X = np.stack((wins, losses, draws, neighbourhood_strength), axis=1)
    
    # mlp with neighbourhood information
    ratings = model(torch.from_numpy(X).float()).flatten().detach().numpy()
    a_rankings = ap.argsort_robust(ratings)
    kendall_tau_distance = ap.kendall_tauDistance(a_ranking_true, a_rankings)
    l_ranking_distances.append(kendall_tau_distance)
    a_strengths = pd.Series(nx.get_node_attributes(G, "strength")).sort_index().values
    spearman_r = scipy.stats.spearmanr(ratings, a_strengths)
    l_spearman_r.append(spearman_r.correlation)


a_ranking_distances = np.array(l_ranking_distances)
a_spearman_r = np.array(l_spearman_r)

print(f"Kendall tau distance \t mean: {np.mean(a_ranking_distances):.1f}, std:{np.std(a_ranking_distances):.1f}\nSpearman correlation \t mean:{np.mean(a_spearman_r):.3f}, std:{np.std(a_spearman_r):.3f}")
l_ranking_performance.append({"network":"random",
                                "method":"MLP_neighbourhood",
                                "kendall_tau_distance (mean)":np.mean(a_ranking_distances),
                                "kendall_tau_distance (std)":np.std(a_ranking_distances),
                                "spearman_r (mean)":np.mean(a_spearman_r),
                                "spearman_r (std)":np.std(a_spearman_r)})

model_mlp_neighbourhood = model  # save for later
del model

  adjacency = nx.adjacency_matrix(G).todense()
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense()
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.a

Kendall tau distance 	 mean: 483.7, std:45.2
Spearman correlation 	 mean:0.948, std:0.010


  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();


In [9]:
# PageRank
l_ranking_distances = []
l_spearman_r = []
a_ranking_true = np.arange(N)
for i in range(LEN_TEST_DATASET):
    with open(f"data/synthetic-dataset/random-net-no-fraudster/test/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    pr = nx.pagerank(G)
    ratings = np.array(list(pr.values()))
    a_rankings = ap.argsort_robust(ratings)
    kendall_tau_distance = ap.kendall_tauDistance(a_ranking_true, a_rankings)
    l_ranking_distances.append(kendall_tau_distance)
    a_strengths = pd.Series(nx.get_node_attributes(G, "strength")).sort_index().values
    spearman_r = scipy.stats.spearmanr(ratings, a_strengths)
    l_spearman_r.append(spearman_r.correlation)
    
a_ranking_distances = np.array(l_ranking_distances)
a_spearman_r = np.array(l_spearman_r)
print(f"Kendall tau distance \t mean: {np.mean(a_ranking_distances):.1f}, std:{np.std(a_ranking_distances):.1f}\nSpearman correlation \t mean:{np.mean(a_spearman_r):.3f}, std:{np.std(a_spearman_r):.3f}")

l_ranking_performance.append({"network":"random",
                                "method":"PageRank",
                                "kendall_tau_distance (mean)":np.mean(a_ranking_distances),
                                "kendall_tau_distance (std)":np.std(a_ranking_distances),
                                "spearman_r (mean)":np.mean(a_spearman_r),
                                "spearman_r (std)":np.std(a_spearman_r)})

Kendall tau distance 	 mean: 613.5, std:55.9
Spearman correlation 	 mean:0.916, std:0.015


In [10]:
# GCN

class GCN(torch.nn.Module):
    def __init__(self, in_features, hidden_features):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_features, hidden_features)
        # self.conv2 = GCNConv(hidden_features, hidden_features)
        self.conv_out = GCNConv(hidden_features, 1)
    
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = torch.nn.ReLU()(self.conv1(x, edge_index))
        # x = torch.nn.ReLU()(self.conv2(x, edge_index))
        x = self.conv_out(x, edge_index)
        return torch.nn.Sigmoid()(x)

def convert_nxToPyGData(G):
    data = pyg.utils.convert.from_networkx(G)
    data.id = torch.tensor(list(G.nodes()))
    data.x = torch.permute(torch.concat((pyg.utils.degree(data.edge_index[0], num_nodes=data.num_nodes).view(1,-1),
                                        pyg.utils.degree(data.edge_index[1], num_nodes=data.num_nodes).view(1,-1))),(1,0)).float()
    data.y = data.strength.float()
    return data

def evaluate():
    loss = 0
    for data in dataloader_test:
        output = model(data).flatten()
        loss += torch.nn.MSELoss(reduction='sum')(output, data.y)
    return (loss / (len(dataloader_test.sampler)*N)).item()

dataset = []
for i in range(LEN_TRAIN_DATASET):
    with open(f"data/synthetic-dataset/random-net-no-fraudster/train/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    data = convert_nxToPyGData(G)
    dataset.append(data)
dataloader_train = pyg.loader.DataLoader(dataset[:int(len(dataset)*0.8)], batch_size=10, shuffle=True)
dataloader_test = pyg.loader.DataLoader(dataset[int(len(dataset)*0.8):], batch_size=10, shuffle=True)
model = GCN(in_features=data.x.shape[1], hidden_features=100)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

n_epochs = 100

l_train_loss, l_test_loss = [], []
for epoch in range(n_epochs):
    for data in dataloader_train:
        optimizer.zero_grad()
        output = model(data).flatten()
        loss_train = criterion(output, data.y)
        loss_train.backward()
        optimizer.step()
        l_train_loss.append(loss_train.item())
    loss_test = evaluate()
    l_test_loss.append(loss_test)

l_ranking_distances = []
l_spearman_r = []
a_ranking_true = np.arange(N)
for i in range(LEN_TEST_DATASET):
    with open(f"data/synthetic-dataset/random-net-no-fraudster/test/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    data = convert_nxToPyGData(G)
    ratings = model(data).flatten()[torch.argsort(data.id)].detach().numpy()  # reordering so that nodes are ordered by their id
    a_rankings = ap.argsort_robust(ratings)
    kendall_tau_distance = ap.kendall_tauDistance(a_ranking_true, a_rankings)
    l_ranking_distances.append(kendall_tau_distance)
    a_strengths = pd.Series(nx.get_node_attributes(G, "strength")).sort_index().values
    spearman_r = scipy.stats.spearmanr(ratings, a_strengths)
    l_spearman_r.append(spearman_r.correlation)
a_ranking_distances = np.array(l_ranking_distances)
a_spearman_r = np.array(l_spearman_r)
print(f"Kendall tau distance \t mean: {np.mean(a_ranking_distances):.1f}, std:{np.std(a_ranking_distances):.1f}\nSpearman correlation \t mean:{np.mean(a_spearman_r):.3f}, std:{np.std(a_spearman_r):.3f}")

l_ranking_performance.append({"network":"random",
                                "method":"GCN",
                                "kendall_tau_distance (mean)":np.mean(a_ranking_distances),
                                "kendall_tau_distance (std)":np.std(a_ranking_distances),
                                "spearman_r (mean)":np.mean(a_spearman_r),
                                "spearman_r (std)":np.std(a_spearman_r)})

model_gcn = model  # save for later
del model

Kendall tau distance 	 mean: 808.3, std:89.9
Spearman correlation 	 mean:0.865, std:0.029


## 1.2 Fraud sensitivity

### 1.2.1 Dumb fraudster

### 1.2.2 Malicious fraudster

## 1.3 Fraud detection

### 1.3.1 Dumb fraudster

### 1.3.2 Malicious fraudster

# 2 Strength network

## 2.1 Ranking performance

In [12]:
# delta degrees

l_ranking_distances = []
l_spearman_r = []
a_ranking_true = np.arange(N)
for i in range(LEN_TEST_DATASET):
    with open(f"data/synthetic-dataset/strength-net-no-fraudster/test/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    ratings = ap.rate_deltaDegrees(G)
    a_rankings = ap.argsort_robust(ratings)
    kendall_tau_distance = ap.kendall_tauDistance(a_ranking_true, a_rankings)
    l_ranking_distances.append(kendall_tau_distance)
    a_strengths = pd.Series(nx.get_node_attributes(G, "strength")).sort_index().values
    spearman_r = scipy.stats.spearmanr(ratings, a_strengths)
    l_spearman_r.append(spearman_r.correlation)
a_ranking_distances = np.array(l_ranking_distances)
a_spearman_r = np.array(l_spearman_r)
print(f"Kendall tau distance \t mean: {np.mean(a_ranking_distances):.1f}, std:{np.std(a_ranking_distances):.1f}\nSpearman correlation \t mean:{np.mean(a_spearman_r):.3f}, std:{np.std(a_spearman_r):.3f}")

l_ranking_performance.append({"network":"strength",
                              "method":"delta degrees",
                              "kendall_tau_distance (mean)":np.mean(a_ranking_distances),
                              "kendall_tau_distance (std)":np.std(a_ranking_distances),
                              "spearman_r (mean)":np.mean(a_spearman_r),
                              "spearman_r (std)":np.std(a_spearman_r)})

Kendall tau distance 	 mean: 1784.8, std:92.3
Spearman correlation 	 mean:0.397, std:0.053


In [14]:
# f_alpha_t

l_df = []
for i in range(LEN_TRAIN_DATASET):
    with open(f"data/synthetic-dataset/strength-net-no-fraudster/train/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
        df = ap.convert_graphToDataFrame(G)
        l_df.append(df)
df = pd.concat(l_df)
X, y = torch.from_numpy(df[["wins","losses","draws"]].values).float(), torch.from_numpy(df["strength"].values).float()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = ap.WinLossDraw()
criterion = torch.nn.MSELoss(reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train_loader = DataLoader(list(zip(X_train, y_train)), batch_size=128, shuffle=True)
l_param = []
l_train_loss = []
l_test_loss = []
for epoch in range(100):
    for i, (data, labels) in enumerate(train_loader):
        pred = model(data).flatten()
        loss_train = criterion(pred, labels)
        loss_train.backward()
        l_train_loss.append(loss_train.item())
        optimizer.step()
        optimizer.zero_grad()
    l_train_loss.append(loss_train.item())
    l_param.append((model.alpha.item(), model.t.item()))
    with torch.no_grad():
        pred = model(X_test).flatten()
        loss_test = criterion(pred, y_test)
        l_test_loss.append(loss_test.item())
        
l_ranking_distances = []
l_spearman_r = []
a_ranking_true = np.arange(N)
for i in range(LEN_TEST_DATASET):
    with open(f"data/synthetic-dataset/strength-net-no-fraudster/test/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    t_ratings = model(torch.from_numpy(ap.convert_graphToDataFrame(G)[["wins", "losses", "draws"]].values))
    ratings = t_ratings.detach().numpy()
    a_rankings = ap.argsort_robust(ratings)
    kendall_tau_distance = ap.kendall_tauDistance(a_ranking_true, a_rankings)
    l_ranking_distances.append(kendall_tau_distance)
    a_strengths = pd.Series(nx.get_node_attributes(G, "strength")).sort_index().values
    spearman_r = scipy.stats.spearmanr(ratings, a_strengths)
    l_spearman_r.append(spearman_r.correlation)
a_ranking_distances = np.array(l_ranking_distances)
a_spearman_r = np.array(l_spearman_r)
print(f"Kendall tau distance \t mean: {np.mean(a_ranking_distances):.1f}, std:{np.std(a_ranking_distances):.1f}\nSpearman correlation \t mean:{np.mean(a_spearman_r):.3f}, std:{np.std(a_spearman_r):.3f}")

l_ranking_performance.append({"network":"strength",
                              "method":"f_alpha_t",
                              "kendall_tau_distance (mean)":np.mean(a_ranking_distances),
                              "kendall_tau_distance (std)":np.std(a_ranking_distances),
                              "spearman_r (mean)":np.mean(a_spearman_r),
                              "spearman_r (std)":np.std(a_spearman_r)})

model_f_alpha_t = model  # save for later
del model

Kendall tau distance 	 mean: 1791.1, std:100.9
Spearman correlation 	 mean:0.383, std:0.057


In [15]:
# MLP

model = MLP(3, 100)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

n_epoch = 100

l_train_loss = []
l_test_loss = []
for epoch in range(n_epoch):
    model.train()
    for i, (data, labels) in enumerate(train_loader):
        pred = model(data).flatten()
        loss_train = criterion(pred, labels)
        loss_train.backward()
        l_train_loss.append(loss_train.item())
        optimizer.step()
        optimizer.zero_grad()
    model.eval()
    pred = model(X_test).flatten()
    loss_test = criterion(pred, y_test)
    l_test_loss.append(loss_test.item())

l_ranking_distances = []
l_spearman_r = []
a_ranking_true = np.arange(N)
for i in range(LEN_TEST_DATASET):
    with open(f"data/synthetic-dataset/strength-net-no-fraudster/test/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    df = ap.convert_graphToDataFrame(G)
    ratings = model(torch.from_numpy(df[["wins", "losses", "draws"]].values).float()).flatten().detach().numpy()
    a_rankings = ap.argsort_robust(ratings)
    kendall_tau_distance = ap.kendall_tauDistance(a_ranking_true, a_rankings)
    l_ranking_distances.append(kendall_tau_distance)
    a_strengths = pd.Series(nx.get_node_attributes(G, "strength")).sort_index().values
    spearman_r = scipy.stats.spearmanr(ratings, a_strengths)
    l_spearman_r.append(spearman_r.correlation)
a_ranking_distances = np.array(l_ranking_distances)
a_spearman_r = np.array(l_spearman_r)
print(f"Kendall tau distance \t mean: {np.mean(a_ranking_distances):.1f}, std:{np.std(a_ranking_distances):.1f}\nSpearman correlation \t mean:{np.mean(a_spearman_r):.3f}, std:{np.std(a_spearman_r):.3f}")

l_ranking_performance.append({"network":"strength",
                              "method":"MLP",
                              "kendall_tau_distance (mean)":np.mean(a_ranking_distances),
                              "kendall_tau_distance (std)":np.std(a_ranking_distances),
                              "spearman_r (mean)":np.mean(a_spearman_r),
                              "spearman_r (std)":np.std(a_spearman_r)})

model_mlp = model  # save for later
del model

Kendall tau distance 	 mean: 1778.5, std:101.9
Spearman correlation 	 mean:0.399, std:0.057


In [16]:
# MLP neighborhood information

l_X, l_y = [], []
for i in range(LEN_TRAIN_DATASET):
    with open(f"data/synthetic-dataset/strength-net-no-fraudster/train/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    adjacency = nx.adjacency_matrix(G).todense()
    adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense()
    wins = np.array(adjacency.T.sum(axis=1)).flatten()
    losses = np.array(adjacency.sum(axis=1)).flatten()
    draws = np.zeros_like(wins)
    neighbourhood_strength = np.array(np.dot(adjacency_undirected, adjacency.T).sum(axis=1)).flatten() / np.array(np.dot(adjacency_undirected, adjacency_undirected).sum(axis=1)).flatten()
    y = np.array(list(nx.get_node_attributes(G, "strength").values()))
    X = np.stack((wins, losses, draws, neighbourhood_strength), axis=1)
    l_X.append(X)
    l_y.append(y)
X = torch.tensor(np.concatenate(l_X), dtype=torch.float)
y = torch.tensor(np.concatenate(l_y), dtype=torch.float)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_loader = DataLoader(list(zip(X_train, y_train)), batch_size=128, shuffle=True)

model = MLP(4, 100)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

n_epoch = 100

l_train_loss = []
l_test_loss = []
for epoch in range(n_epoch):
    model.train()
    for i, (data, labels) in enumerate(train_loader):
        pred = model(data).flatten()
        loss_train = criterion(pred, labels)
        loss_train.backward()
        l_train_loss.append(loss_train.item())
        optimizer.step()
        optimizer.zero_grad()
    model.eval()
    pred = model(X_test).flatten()
    loss_test = criterion(pred, y_test)
    l_test_loss.append(loss_test.item())

# fig, ax = plt.subplots(1,2)
# plt.suptitle("Losses")
# ax[0].plot(l_train_loss, label="train")
# ax[0].set_title("Train")
# ax[1].plot(l_test_loss, label="test")
# ax[1].set_title("Test")

l_ranking_distances = []
l_spearman_r = []
a_ranking_true = np.arange(N)
for i in range(LEN_TEST_DATASET):
    with open(f"data/synthetic-dataset/strength-net-no-fraudster/test/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    adjacency = nx.adjacency_matrix(G).todense();
    adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
    wins = np.array(adjacency.T.sum(axis=1)).flatten()
    losses = np.array(adjacency.sum(axis=1)).flatten()
    draws = np.zeros_like(wins)
    neighbourhood_strength = np.array(np.dot(adjacency_undirected, adjacency.T).sum(axis=1)).flatten() / np.array(np.dot(adjacency_undirected, adjacency_undirected).sum(axis=1)).flatten()
    y = np.array(list(nx.get_node_attributes(G, "strength").values()))
    X = np.stack((wins, losses, draws, neighbourhood_strength), axis=1)
    
    # mlp with neighbourhood information
    ratings = model(torch.from_numpy(X).float()).flatten().detach().numpy()
    a_rankings = ap.argsort_robust(ratings)
    kendall_tau_distance = ap.kendall_tauDistance(a_ranking_true, a_rankings)
    l_ranking_distances.append(kendall_tau_distance)
    a_strengths = pd.Series(nx.get_node_attributes(G, "strength")).sort_index().values
    spearman_r = scipy.stats.spearmanr(ratings, a_strengths)
    l_spearman_r.append(spearman_r.correlation)


a_ranking_distances = np.array(l_ranking_distances)
a_spearman_r = np.array(l_spearman_r)

print(f"Kendall tau distance \t mean: {np.mean(a_ranking_distances):.1f}, std:{np.std(a_ranking_distances):.1f}\nSpearman correlation \t mean:{np.mean(a_spearman_r):.3f}, std:{np.std(a_spearman_r):.3f}")
l_ranking_performance.append({"network":"strength",
                                "method":"MLP_neighbourhood",
                                "kendall_tau_distance (mean)":np.mean(a_ranking_distances),
                                "kendall_tau_distance (std)":np.std(a_ranking_distances),
                                "spearman_r (mean)":np.mean(a_spearman_r),
                                "spearman_r (std)":np.std(a_spearman_r)})

model_mlp_neighbourhood = model  # save for later
del model

  adjacency = nx.adjacency_matrix(G).todense()
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense()
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.a

Kendall tau distance 	 mean: 1136.3, std:168.8
Spearman correlation 	 mean:0.710, std:0.076


  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx.adjacency_matrix(G).todense();
  adjacency_undirected = nx.adjacency_matrix(G.to_undirected()).todense();
  adjacency = nx

In [17]:
# PageRank
l_ranking_distances = []
l_spearman_r = []
a_ranking_true = np.arange(N)
for i in range(LEN_TEST_DATASET):
    with open(f"data/synthetic-dataset/strength-net-no-fraudster/test/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    pr = nx.pagerank(G)
    ratings = np.array(list(pr.values()))
    a_rankings = ap.argsort_robust(ratings)
    kendall_tau_distance = ap.kendall_tauDistance(a_ranking_true, a_rankings)
    l_ranking_distances.append(kendall_tau_distance)
    a_strengths = pd.Series(nx.get_node_attributes(G, "strength")).sort_index().values
    spearman_r = scipy.stats.spearmanr(ratings, a_strengths)
    l_spearman_r.append(spearman_r.correlation)
    
a_ranking_distances = np.array(l_ranking_distances)
a_spearman_r = np.array(l_spearman_r)
print(f"Kendall tau distance \t mean: {np.mean(a_ranking_distances):.1f}, std:{np.std(a_ranking_distances):.1f}\nSpearman correlation \t mean:{np.mean(a_spearman_r):.3f}, std:{np.std(a_spearman_r):.3f}")

l_ranking_performance.append({"network":"strength",
                                "method":"PageRank",
                                "kendall_tau_distance (mean)":np.mean(a_ranking_distances),
                                "kendall_tau_distance (std)":np.std(a_ranking_distances),
                                "spearman_r (mean)":np.mean(a_spearman_r),
                                "spearman_r (std)":np.std(a_spearman_r)})

Kendall tau distance 	 mean: 870.0, std:147.4
Spearman correlation 	 mean:0.816, std:0.065


In [18]:
# GCN

dataset = []
for i in range(LEN_TRAIN_DATASET):
    with open(f"data/synthetic-dataset/strength-net-no-fraudster/train/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    data = convert_nxToPyGData(G)
    dataset.append(data)
dataloader_train = pyg.loader.DataLoader(dataset[:int(len(dataset)*0.8)], batch_size=10, shuffle=True)
dataloader_test = pyg.loader.DataLoader(dataset[int(len(dataset)*0.8):], batch_size=10, shuffle=True)
model = GCN(in_features=data.x.shape[1], hidden_features=100)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

n_epochs = 100

l_train_loss, l_test_loss = [], []
for epoch in range(n_epochs):
    for data in dataloader_train:
        optimizer.zero_grad()
        output = model(data).flatten()
        loss_train = criterion(output, data.y)
        loss_train.backward()
        optimizer.step()
        l_train_loss.append(loss_train.item())
    loss_test = evaluate()
    l_test_loss.append(loss_test)

l_ranking_distances = []
l_spearman_r = []
a_ranking_true = np.arange(N)
for i in range(LEN_TEST_DATASET):
    with open(f"data/synthetic-dataset/strength-net-no-fraudster/test/G_{i}.pickle", "rb") as f:
        G = pickle.load(f)
    data = convert_nxToPyGData(G)
    ratings = model(data).flatten()[torch.argsort(data.id)].detach().numpy()  # reordering so that nodes are ordered by their id
    a_rankings = ap.argsort_robust(ratings)
    kendall_tau_distance = ap.kendall_tauDistance(a_ranking_true, a_rankings)
    l_ranking_distances.append(kendall_tau_distance)
    a_strengths = pd.Series(nx.get_node_attributes(G, "strength")).sort_index().values
    spearman_r = scipy.stats.spearmanr(ratings, a_strengths)
    l_spearman_r.append(spearman_r.correlation)
a_ranking_distances = np.array(l_ranking_distances)
a_spearman_r = np.array(l_spearman_r)
print(f"Kendall tau distance \t mean: {np.mean(a_ranking_distances):.1f}, std:{np.std(a_ranking_distances):.1f}\nSpearman correlation \t mean:{np.mean(a_spearman_r):.3f}, std:{np.std(a_spearman_r):.3f}")

l_ranking_performance.append({"network":"strength",
                                "method":"GCN",
                                "kendall_tau_distance (mean)":np.mean(a_ranking_distances),
                                "kendall_tau_distance (std)":np.std(a_ranking_distances),
                                "spearman_r (mean)":np.mean(a_spearman_r),
                                "spearman_r (std)":np.std(a_spearman_r)})

model_gcn = model  # save for later
del model

Kendall tau distance 	 mean: 1320.6, std:257.9
Spearman correlation 	 mean:0.627, std:0.129


## 2.2 Fraud sensitivity

### 2.2.1 Dumb fraudster

### 2.2.2 Malicious fraudster

## 2.3 Fraud detection

### 2.3.1 Dumb fraudster

### 2.3.2 Malicious fraudster

# Staging

In [20]:
pd.DataFrame(l_ranking_performance).sort_values(by=["network", "kendall_tau_distance (mean)"])

Unnamed: 0,network,method,kendall_tau_distance (mean),kendall_tau_distance (std),spearman_r (mean),spearman_r (std)
3,random,MLP_neighbourhood,483.69,45.156106,0.948379,0.009724
4,random,PageRank,613.52,55.900891,0.916269,0.015411
2,random,MLP,632.42,58.057416,0.912927,0.016271
0,random,delta degrees,710.55,55.233391,0.897128,0.014895
1,random,f_alpha_t,737.59,81.748651,0.872419,0.034411
5,random,GCN,808.31,89.947173,0.864886,0.029142
10,strength,PageRank,870.05,147.364607,0.816121,0.064625
9,strength,MLP_neighbourhood,1136.29,168.81305,0.709632,0.075551
11,strength,GCN,1320.59,257.884668,0.627328,0.128885
8,strength,MLP,1778.54,101.867111,0.399265,0.056685


# <>

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]