In [42]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch.nn import Linear

import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.data import HeteroData
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, average_precision_score

In [2]:
# NOTE(review): absolute, machine-specific path — the notebook only runs on the
# author's machine as written; consider a configurable data directory.
DATASET_CSV = 'C:/Users/ruben/OneDrive/Desktop/Datasets/feature_engineering_dataset.csv'

# Load the engineered transaction table and discard the two raw identifier columns.
dataset = pd.read_csv(DATASET_CSV).drop(columns=['cc_num', 'merchant'])

# Index, cc_user and merchant_num are unique IDs that will be used for the construction of the graph.
# Gender and is_fraud are binary categorical variables. None of these are rescaled.
exclude_columns = ['index', 'gender', 'is_fraud', 'cc_user', 'merchant_num']
columns_to_normalize = [name for name in dataset.columns if name not in exclude_columns]

# Min-max scale every remaining column, fitting on the full table.
# NOTE(review): the scaler is fitted before any train/val/test split is made —
# confirm that this is acceptable for the evaluation.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(dataset[columns_to_normalize])
dataset[columns_to_normalize] = scaled_values

dataset.head()

Unnamed: 0,index,category,gender,city,state,job,is_fraud,cc_user,merchant_num,hour,day,month,weekday,age,distance_km,hours_diff_bet_trans,amt_log
0,1017,0.249912,0,0.627451,0.20358,0.206099,0,0.0,293.0,0.521739,0.0,0.0,0.166667,0.231707,0.838856,0.0,0.148169
1,2724,1.0,0,0.627451,0.20358,0.206099,0,0.0,43.0,0.347826,0.033333,0.0,0.333333,0.231707,0.725115,0.051075,0.343911
2,2726,1.0,0,0.627451,0.20358,0.206099,0,0.0,399.0,0.347826,0.033333,0.0,0.333333,0.231707,0.14313,0.0,0.388997
3,2882,0.795682,0,0.627451,0.20358,0.206099,0,0.0,126.0,0.521739,0.033333,0.0,0.333333,0.231707,0.573176,0.008065,0.301093
4,2907,0.906266,0,0.627451,0.20358,0.206099,0,0.0,41.0,0.565217,0.033333,0.0,0.333333,0.231707,0.487771,0.0,0.27614


In [25]:
# Probe for an accelerator: CUDA first, then Apple's MPS backend (only present in
# newer torch builds, hence the attribute lookup with a default), else CPU.
mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif mps_backend is not None and mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# NOTE(review): the probed device is overwritten here, so everything runs on the
# CPU regardless of the hardware detected above — presumably a temporary
# override; confirm before removing it.
device = torch.device('cpu')
print(device)

cpu


In [26]:
# Start from an empty heterogeneous graph; node and edge stores are filled below.
data = HeteroData()

# Give every distinct raw ID a consecutive node index, numbered in order of
# first appearance in the transaction table.
user_mapping = {
    raw_id: node_idx
    for node_idx, raw_id in enumerate(dataset['cc_user'].unique())
}
merchant_mapping = {
    raw_id: node_idx
    for node_idx, raw_id in enumerate(dataset['merchant_num'].unique())
}
data['user'].num_nodes = len(user_mapping)
data['merchant'].num_nodes = len(merchant_mapping)

# One edge per transaction row, pointing from the paying user to the merchant.
src = list(map(user_mapping.__getitem__, dataset['cc_user']))
dst = list(map(merchant_mapping.__getitem__, dataset['merchant_num']))
edge_index = torch.tensor([src, dst])

# The per-transaction fraud flag becomes the label of the corresponding edge.
fraud = torch.from_numpy(dataset['is_fraud'].values).long()

pays_edges = data['user', 'pays', 'merchant']
pays_edges.edge_index = edge_index
pays_edges.edge_label = fraud

print(data)

HeteroData(
  user={ num_nodes=983 },
  merchant={ num_nodes=693 },
  (user, pays, merchant)={
    edge_index=[2, 1296675],
    edge_label=[1296675],
  }
)


In [27]:
# Node attributes are taken from the first transaction row in which each
# user / merchant appears, indexed by the raw ID.
user_features_df = (
    dataset
    .drop_duplicates(subset='cc_user')
    .set_index('cc_user')
    .loc[:, ['job', 'gender', 'age', 'state', 'city']]
)
merchant_features_df = (
    dataset
    .drop_duplicates(subset='merchant_num')
    .set_index('merchant_num')
    .loc[:, ['category']]
)

# Row i of each feature matrix must describe the node with index i. Dicts keep
# insertion order, so listing the mapping's keys yields the raw IDs ordered by
# node index; a single label-based gather then lines the rows up.
user_features_mapped = torch.tensor(
    user_features_df.loc[list(user_mapping)].to_numpy(),
    dtype=torch.float,
)
merchant_features_mapped = torch.tensor(
    merchant_features_df.loc[list(merchant_mapping)].to_numpy(),
    dtype=torch.float,
)

In [28]:
# Attach the node feature matrices and remove the explicit `num_nodes` entries
# now that `x` is set on both node types.
data['user'].x = user_features_mapped
data['merchant'].x = merchant_features_mapped
del data['user'].num_nodes, data['merchant'].num_nodes

# Add the (merchant, rev_pays, user) reverse edges so that messages can flow in
# both directions between users and merchants.
data = T.ToUndirected()(data)
del data['merchant', 'rev_pays', 'user'].edge_label  # Remove "reverse" label.
# Optional per-transaction edge attributes (currently disabled):
# data['user', 'pays', 'merchant'].edge_attr = torch.tensor(dataset[['hour', 'day', 'month', 'weekday', 'amt_log', 'distance_km', 'hours_diff_bet_trans']].values, dtype=torch.float)

# Fix the global torch RNG so that the random edge split (and the model
# initialisation that follows) is reproducible across kernel restarts.
torch.manual_seed(42)

# Perform a link-level split into training, validation, and test edges.
# No negative edges are sampled: the labels are the existing fraud flags.
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'pays', 'merchant')],
    rev_edge_types=[('merchant', 'rev_pays', 'user')],
)(data)

# Move the full graph AND the three splits to the target device. The model is
# fed the splits, not `data`, so leaving them behind would cause a device
# mismatch as soon as `device` is anything other than the CPU.
data = data.to(device)
train_data, val_data, test_data = (
    split.to(device) for split in (train_data, val_data, test_data)
)
print(data)

HeteroData(
  user={ x=[983, 5] },
  merchant={ x=[693, 1] },
  (user, pays, merchant)={
    edge_index=[2, 1296675],
    edge_label=[1296675],
  },
  (merchant, rev_pays, user)={ edge_index=[2, 1296675] }
)


In [29]:
# Sanity-check the assembled heterogeneous graph before training; the value
# displayed as the cell output is the result of that validation.
data.validate()

True

In [30]:
# Inverse-frequency class weights computed on the training labels: the most
# frequent label gets weight 1, every other label gets
# (largest label count) / (its own count).
label_counts = torch.bincount(train_data['user', 'merchant'].edge_label)
weight = label_counts.max() / label_counts

In [31]:
def weighted_mse_loss(pred, target, weight=None):
    """Mean squared error with an optional per-class weight.

    Args:
        pred: Tensor of predictions.
        target: Tensor of integer class labels; it is cast to ``pred``'s dtype
            for the error term and used as an index into ``weight``.
        weight: Optional tensor indexed by class label. When ``None`` every
            sample has weight 1.

    Returns:
        Scalar tensor: the mean over all samples of
        ``weight[target] * (pred - target) ** 2``.
    """
    squared_error = (pred - target.to(pred.dtype)).pow(2)
    if weight is None:
        sample_weight = 1.
    else:
        # Look up each sample's weight from its class label.
        sample_weight = weight[target].to(pred.dtype)
    return (sample_weight * squared_error).mean()

In [32]:
class GNNEncoder(torch.nn.Module):
    """Two-layer GraphSAGE encoder that turns node features into embeddings.

    Both layers are declared with ``(-1, -1)`` input sizes, i.e. the input
    feature dimensions are not fixed here.

    Args:
        hidden_channels: Output width of the first layer.
        out_channels: Output width of the second layer (the embedding size).
    """

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        """Apply conv1 -> ReLU -> conv2 and return the resulting embeddings."""
        hidden = self.conv1(x, edge_index)
        hidden = hidden.relu()
        return self.conv2(hidden, edge_index)

In [33]:
class EdgeDecoder(torch.nn.Module):
    """Two-layer MLP that scores (user, merchant) edges from node embeddings.

    Args:
        hidden_channels: Size of each node embedding; the first linear layer
            consumes the concatenation of a user and a merchant embedding.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        """Return one score per edge listed in ``edge_label_index``.

        Args:
            z_dict: Mapping with ``'user'`` and ``'merchant'`` embedding tensors.
            edge_label_index: Two-row index; the first row selects user
                embeddings, the second row selects merchant embeddings.

        Returns:
            1-D tensor with one score per edge.
        """
        user_idx, merchant_idx = edge_label_index
        user_emb = z_dict['user'][user_idx]
        merchant_emb = z_dict['merchant'][merchant_idx]
        pair_emb = torch.cat([user_emb, merchant_emb], dim=-1)

        hidden = self.lin1(pair_emb).relu()
        scores = self.lin2(hidden)
        # Flatten the trailing singleton dimension produced by lin2.
        return scores.view(-1)

In [34]:
class Model(torch.nn.Module):
    """Heterogeneous GNN encoder followed by an edge-level decoder.

    Args:
        hidden_channels: Width of the encoder layers, of the node embeddings
            and of the decoder's hidden layer.
        metadata: Optional graph metadata (as returned by
            ``HeteroData.metadata()``) used to convert the encoder to its
            heterogeneous form. When omitted, the metadata of the
            notebook-level ``data`` graph is used, which keeps
            ``Model(hidden_channels=...)`` working exactly as before.
    """

    def __init__(self, hidden_channels, metadata=None):
        super().__init__()
        if metadata is None:
            # Backward-compatible default: fall back to the global graph.
            metadata = data.metadata()
        homogeneous_encoder = GNNEncoder(hidden_channels, hidden_channels)
        # Duplicate the encoder per edge type; messages arriving at a node type
        # from several edge types are combined by summation.
        self.encoder = to_hetero(homogeneous_encoder, metadata, aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        """Encode all nodes, then score the edges in ``edge_label_index``.

        Args:
            x_dict: Node features, passed to the encoder.
            edge_index_dict: Edge indices, passed to the encoder.
            edge_label_index: Index of the (user, merchant) edges to score.

        Returns:
            Whatever the decoder returns for the requested edges.
        """
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)


model = Model(hidden_channels=32).to(device)

# The encoder's SAGEConv layers are declared with lazy (-1) input sizes, so
# their weights do not exist until the first forward pass. Run one
# gradient-free pass to materialise them before handing the parameters to the
# optimizer.
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [39]:
def train():
    """Run one full-batch optimisation step on the training split.

    Uses the notebook-level ``model``, ``optimizer``, ``train_data`` and class
    ``weight``.

    Returns:
        The class-weighted MSE training loss as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    supervision = train_data['user', 'merchant']
    scores = model(
        train_data.x_dict,
        train_data.edge_index_dict,
        supervision.edge_label_index,
    )
    step_loss = weighted_mse_loss(scores, supervision.edge_label, weight)

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

@torch.no_grad()
def test(data):
    """Evaluate the notebook-level ``model`` on one split without gradients.

    Args:
        data: A graph split exposing ``x_dict``, ``edge_index_dict`` and the
            ``('user', 'merchant')`` edge store with ``edge_label_index`` and
            ``edge_label``.

    Returns:
        Tuple ``(rmse, pred, target)``: the RMSE as a Python float, the
        predictions clamped to ``[0, 1]``, and the labels cast to float.
    """
    model.eval()
    edge_store = data['user', 'merchant']
    scores = model(data.x_dict, data.edge_index_dict, edge_store.edge_label_index)
    # NOTE(review): the clamped scores are also what callers receive, so any
    # rank-based metric computed on them sees ties at 0 and 1 — confirm intended.
    scores = scores.clamp(min=0, max=1)
    labels = edge_store.edge_label.float()
    rmse = torch.sqrt(F.mse_loss(scores, labels))
    return rmse.item(), scores, labels

In [44]:
# Full-batch training for 300 epochs. After every step the model is evaluated
# on all three splits, and ROC-AUC is reported for validation and test.
for epoch in range(1, 301):
    loss = train()

    train_rmse, train_preds, train_targets = test(train_data)
    val_rmse, val_preds, val_targets = test(val_data)
    test_rmse, test_preds, test_targets = test(test_data)

    # scikit-learn expects NumPy arrays, so bring the tensors back to the host.
    val_auc, test_auc = (
        roc_auc_score(labels.cpu().numpy(), scores.cpu().numpy())
        for labels, scores in ((val_targets, val_preds), (test_targets, test_preds))
    )

    print(
        f'Epoch: {epoch:03d}, Loss: {loss:.4f}, '
        f'Train: {train_rmse:.4f}, Val: {val_rmse:.4f}, Test: {test_rmse:.4f}, '
        f'Val AUC: {val_auc:.4f}, Test AUC: {test_auc:.4f}'
    )

Epoch: 001, Loss: 0.4301, Train: 0.4989, Val: 0.4988, Test: 0.4993, Val AUC: 0.7216, Test AUC: 0.7004
Epoch: 002, Loss: 0.4294, Train: 0.4248, Val: 0.4248, Test: 0.4253, Val AUC: 0.7228, Test AUC: 0.7016
Epoch: 003, Loss: 0.4265, Train: 0.4590, Val: 0.4590, Test: 0.4594, Val AUC: 0.7228, Test AUC: 0.7012
Epoch: 004, Loss: 0.4244, Train: 0.4720, Val: 0.4720, Test: 0.4724, Val AUC: 0.7229, Test AUC: 0.7011
Epoch: 005, Loss: 0.4249, Train: 0.4183, Val: 0.4182, Test: 0.4187, Val AUC: 0.7237, Test AUC: 0.7021
Epoch: 006, Loss: 0.4267, Train: 0.4922, Val: 0.4921, Test: 0.4926, Val AUC: 0.7236, Test AUC: 0.7019
Epoch: 007, Loss: 0.4272, Train: 0.3964, Val: 0.3964, Test: 0.3969, Val AUC: 0.7248, Test AUC: 0.7029
Epoch: 008, Loss: 0.4308, Train: 0.5533, Val: 0.5533, Test: 0.5537, Val AUC: 0.7230, Test AUC: 0.7002
Epoch: 009, Loss: 0.4460, Train: 0.3291, Val: 0.3290, Test: 0.3297, Val AUC: 0.7251, Test AUC: 0.7031
Epoch: 010, Loss: 0.4607, Train: 0.5266, Val: 0.5265, Test: 0.5270, Val AUC: 0.723

KeyboardInterrupt: 