# DNN
Our best result is generated by this script. The first line of each block briefly explain the code. 

### Install and import package

In [1]:
# !pip install pandas
# !pip install gdown

In [2]:
import numpy as np
import pandas as pd
import csv
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, random_split
import math

### Data

In [5]:
# Download processed data

!gdown 1vHDhZSPmhithLVRRzkNw0ak_kk7PhInu # V_trade
!gdown 145T8z3XXlsaISzWdrJgJVANGokG1XuFI # V_remit
!gdown 1AubrUmeNUgpgiOu4tay6Gwl8O3lBaokF # V_info
!gdown 1zZo9RLt3mMmJZxEETSY2g9ND31qkZIn0 # V_cred
!gdown 1uFCx21bqE3FnrdfvN_mwtw2-nvogTEex # V_cons
!gdown 1ZOXGT_rIdEGIliHGKEH3ha77ZlZyq1Gn # train_y
!gdown 1qjEwmi97OWdshSNdgQj2ccXnoM4UvT25 # V_trade_public
!gdown 1g8trBiC6OxuoTU94u_UMygVrA-fSASpB # V_remit_public
!gdown 14KTfY56Mz2xBXdP27GvGB2HVeb_4Ks4T # V_info_public
!gdown 1EaIWnjQxUl4KRgVCqYT7AB4PaSNvc_GL # V_cred_public
!gdown 1owf1urxHZywAxJfCgVXCpXEMQ6VZGhnO # V_cons_public

In [6]:
# Read data as csv

V_cons = pd.read_csv('V_cons.csv').iloc[:, 1:]
V_cred = pd.read_csv('V_cred.csv').iloc[:, 1:]
V_info = pd.read_csv('V_info.csv').iloc[:, 1:]
V_remit = pd.read_csv('V_remit.csv').iloc[:, 1:]
V_trade = pd.read_csv('V_trade.csv').iloc[:, 1:]
train_y = pd.read_csv('train_y.csv').iloc[:, 1:]

V_cons_public = pd.read_csv('V_cons_public.csv').iloc[:, 1:]
V_cred_public = pd.read_csv('V_cred_public.csv').iloc[:, 1:]
V_info_public = pd.read_csv('V_info_public.csv').iloc[:, 1:]
V_remit_public = pd.read_csv('V_remit_public.csv').iloc[:, 1:]
V_trade_public = pd.read_csv('V_trade_public.csv').iloc[:, 1:]

public_x_alert_date = pd.read_csv('public_x_alert_date.csv')
all_keys = pd.read_csv('all_keys.csv')

In [7]:
# concatenate dataframes to get the entire training/testing data
# some values are missing, fill them with 0

V_overall = pd.concat([V_info, V_cred, V_cons, V_remit, V_trade], axis=1).fillna(0)
V_overall_public = pd.concat([V_info_public, V_cred_public, V_cons_public, V_remit_public, V_trade_public], axis=1).fillna(0)

In [8]:
# verify the shape of dataframes

print(V_info.shape)
print(V_cred.shape)
print(V_cons.shape)
print(V_remit.shape)
print(V_trade.shape)
print(V_overall.shape)
print()
print(V_info_public.shape)
print(V_cred_public.shape)
print(V_cons_public.shape)
print(V_remit_public.shape)
print(V_trade_public.shape)
print(V_overall_public.shape)
print()
print(train_y.shape)
print()
print(public_x_alert_date.shape)
print(all_keys.shape)

(23906, 4)
(23906, 117)
(23906, 1965)
(23906, 1572)
(23906, 3537)
(23906, 7195)

(1845, 4)
(1845, 117)
(1845, 1965)
(1845, 1572)
(1845, 3537)
(1845, 7195)

(23906, 1)

(1845, 2)
(25751, 1)


### Settings
There are three choice of cost function. The best result is generated by "linear". 

In [9]:
# set parameters

n_epoch = 500  # number of epochs
batch = 128  # batch size
lr = 0.0000001  # learning rate
w = 1  # penalty weight for false negative
d = 99  # duplicate d times for SAR_flag == 1 (oversampling)

In [10]:
# define network

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.network = nn.Sequential( # 7195 -> 1
            nn.Linear(7195, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid() # last one must be sigmoid 
        )
                
    def forward(self, x):
        x = self.network(x)
        return x

In [11]:
# loss function (linear)

def loss_function(prob, ans):
    return (w * (1 - prob) * ans + (prob) * (1 - ans))

In [12]:
# loss function (quadratic)

# def loss_function(prob, ans):
#     # a * x**n
#     a = 2
#     n = 2
#     return (w * a * (1 - prob)**n * ans + a * (prob)**n * (1 - ans))

In [13]:
# loss function (log)

# prob = torch.minimum(prob, torch.full((prob.size(dim=0), 1), 0.9999999).to(device))
# prob = torch.maximum(prob, torch.full((prob.size(dim=0), 1), 0.0000001).to(device))
# return (w * torch.log(1 - prob) * ans + torch.log(prob) * (1 - ans))

### Main

In [14]:
# training process

def train(train_data, val_data, model, n_epoch, batch, lr, device):
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    best_loss = 1000000
    for epoch in range(n_epoch):
        total_loss = 0
        model.train()
        idx = 0
        for data, ans in train_data:
            data, ans = data.to(device), ans.to(device)
            prob = model(data)
            loss = torch.sum(loss_function(prob, ans)) / batch
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += (loss.item() / len(train_data))
            print('[Epoch %d | %d/%d] loss: %.4f' % ((epoch+1), idx*batch, len(train_data) * batch, loss.item()), end='\r')
            idx += 1
        print("\n  Training  | Loss:%.4f " % total_loss)

        # validation
        model.eval()
        total_loss = 0
        idx = 0 
        with torch.no_grad():
            for data, ans in val_data:
                data, ans = data.to(device), ans.to(device)
                prob = model(data)
                loss = torch.sum(loss_function(prob, ans)) / batch
                total_loss += (loss.item() / len(val_data))
                idx += 1
            print(" Validation | Loss:%.4f " % total_loss)
            
        # save model
        if total_loss < best_loss:
            best_loss = total_loss
            print("saving model with loss %.4f...\n" % total_loss)
            torch.save(model.state_dict(), "%s" % "model.pth")
                
    return model

In [15]:
# Define the whole dataset
# Oversampling: we duplicate the samples with SAR_flag=1 for d times
# d=99 let the number of samples of flag=0 and flag=1 matches (SAR_rate = 0.5)

class TrainValDataset(Dataset):
    def __init__(self):
        # oversampling
        V_overall_list = V_overall.values.tolist()
        train_y_list = train_y.values.tolist()
        l = len(train_y_list)
        s = sum(sum(train_y_list,[]))
        print("Before oversampling: total=:", l, "flag=0:", l - s, "flag=1:", s, "SAR_rate=", s / l)        
        for i in range(l):
            if train_y_list[i][0] == 1:
                V_overall_list.extend([V_overall_list[i] for j in range(d)])
                train_y_list.extend([train_y_list[i] for j in range(d)])
        l = len(train_y_list)
        s = sum(sum(train_y_list,[]))
        print("After oversampling: total=:", l, "flag=0:", l - s, "flag=1:", s, "SAR_rate=", s / l)    
        
        self.X = torch.tensor(V_overall_list).to(torch.float32)
        self.Y = torch.tensor(train_y_list).to(torch.float32)
  
    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

In [16]:
# Define testing dataset

class TestDataset(Dataset):
    def __init__(self):
        self.X = torch.tensor(V_overall_public.values).to(torch.float32)

    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        return self.X[idx]

In [17]:
# split the whole dataset into training and validation set

dataset = TrainValDataset()
trainset, valset = random_split(dataset, [int(len(dataset) * 0.9), len(dataset) - int(len(dataset) * 0.9)])
train_dataloader = DataLoader(trainset, batch, True)
val_dataloader = DataLoader(valset, batch, True)

Before oversampling: total=: 23906 flag=0: 23672 flag=1: 234 SAR_rate= 0.009788337655818623
After oversampling: total=: 47072 flag=0: 23672 flag=1: 23400 SAR_rate= 0.4971108089734874


In [18]:
# start training! 

device = 'cuda:0'
model = Net().to(device) 
model = train(train_dataloader, val_dataloader, model, n_epoch, batch, lr, device)

[Epoch 1 | 42240/42368] loss: 0.5176
  Training  | Loss:0.5134 
 Validation | Loss:0.5140 
saving model with loss 0.5140...

[Epoch 2 | 42240/42368] loss: 0.4742
  Training  | Loss:0.5132 
 Validation | Loss:0.5138 
saving model with loss 0.5138...

[Epoch 3 | 42240/42368] loss: 0.5084
  Training  | Loss:0.5131 
 Validation | Loss:0.5137 
saving model with loss 0.5137...

[Epoch 4 | 42240/42368] loss: 0.5341
  Training  | Loss:0.5129 
 Validation | Loss:0.5135 
saving model with loss 0.5135...

[Epoch 5 | 42240/42368] loss: 0.5659
  Training  | Loss:0.5127 
 Validation | Loss:0.5133 
saving model with loss 0.5133...

[Epoch 6 | 42240/42368] loss: 0.3732
  Training  | Loss:0.5125 
 Validation | Loss:0.5131 
saving model with loss 0.5131...

[Epoch 7 | 42240/42368] loss: 0.4864
  Training  | Loss:0.5124 
 Validation | Loss:0.5129 
saving model with loss 0.5129...

[Epoch 8 | 42240/42368] loss: 0.5431
  Training  | Loss:0.5122 
 Validation | Loss:0.5127 
saving model with loss 0.5127...



[Epoch 66 | 42240/42368] loss: 0.3920
  Training  | Loss:0.4883 
 Validation | Loss:0.4862 
saving model with loss 0.4862...

[Epoch 67 | 42240/42368] loss: 0.4559
  Training  | Loss:0.4879 
 Validation | Loss:0.4857 
saving model with loss 0.4857...

[Epoch 68 | 42240/42368] loss: 0.4475
  Training  | Loss:0.4874 
 Validation | Loss:0.4850 
saving model with loss 0.4850...

[Epoch 69 | 42240/42368] loss: 0.5114
  Training  | Loss:0.4868 
 Validation | Loss:0.4841 
saving model with loss 0.4841...

[Epoch 70 | 42240/42368] loss: 0.4614
  Training  | Loss:0.4863 
 Validation | Loss:0.4834 
saving model with loss 0.4834...

[Epoch 71 | 42240/42368] loss: 0.5656
  Training  | Loss:0.4858 
 Validation | Loss:0.4827 
saving model with loss 0.4827...

[Epoch 72 | 42240/42368] loss: 0.4577
  Training  | Loss:0.4854 
 Validation | Loss:0.4822 
saving model with loss 0.4822...

[Epoch 73 | 42240/42368] loss: 0.4466
  Training  | Loss:0.4850 
 Validation | Loss:0.4818 
saving model with loss 0.4

[Epoch 131 | 42240/42368] loss: 0.4267
  Training  | Loss:0.4542 
 Validation | Loss:0.4490 
saving model with loss 0.4490...

[Epoch 132 | 42240/42368] loss: 0.4110
  Training  | Loss:0.4539 
 Validation | Loss:0.4487 
saving model with loss 0.4487...

[Epoch 133 | 42240/42368] loss: 0.4872
  Training  | Loss:0.4536 
 Validation | Loss:0.4484 
saving model with loss 0.4484...

[Epoch 134 | 42240/42368] loss: 0.4408
  Training  | Loss:0.4534 
 Validation | Loss:0.4482 
saving model with loss 0.4482...

[Epoch 135 | 42240/42368] loss: 0.4241
  Training  | Loss:0.4531 
 Validation | Loss:0.4480 
saving model with loss 0.4480...

[Epoch 136 | 42240/42368] loss: 0.4770
  Training  | Loss:0.4529 
 Validation | Loss:0.4478 
saving model with loss 0.4478...

[Epoch 137 | 42240/42368] loss: 0.5340
  Training  | Loss:0.4527 
 Validation | Loss:0.4476 
saving model with loss 0.4476...

[Epoch 138 | 42240/42368] loss: 0.4428
  Training  | Loss:0.4526 
 Validation | Loss:0.4474 
saving model with 

[Epoch 196 | 42240/42368] loss: 0.3794
  Training  | Loss:0.4418 
 Validation | Loss:0.4350 
saving model with loss 0.4350...

[Epoch 197 | 42240/42368] loss: 0.3973
  Training  | Loss:0.4406 
 Validation | Loss:0.4347 
saving model with loss 0.4347...

[Epoch 198 | 42240/42368] loss: 0.4387
  Training  | Loss:0.4402 
 Validation | Loss:0.4345 
saving model with loss 0.4345...

[Epoch 199 | 42240/42368] loss: 0.4748
  Training  | Loss:0.4400 
 Validation | Loss:0.4345 
saving model with loss 0.4345...

[Epoch 200 | 42240/42368] loss: 0.4366
  Training  | Loss:0.4399 
 Validation | Loss:0.4344 
saving model with loss 0.4344...



### Predict

In [19]:
# load model and predict testing data

best_model = model
best_model.load_state_dict(torch.load("model.pth"))
best_model = best_model.eval()

testset = TestDataset()
test_dataloader = DataLoader(testset, 1, False)
result = []
for x in test_dataloader:
    x = x.to(device)
    result.append(best_model(x).item())

In [20]:
# generate (key, probability) pairs

keys_to_predict = sorted(public_x_alert_date['alert_key'].values.tolist())
pairs = np.array(list(zip(keys_to_predict, result)))
sorted_pairs = np.flip(pairs[pairs[:, 1].argsort()], 0)

In [21]:
# generate output file 

example_keys = []
with open('example.csv', newline='') as example:
    rows = csv.reader(example)
    headers = next(rows)
    for row in rows:
        example_keys.append(int(row[0]))
        
with open('predict.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['alert_key','probability'])
    for row in sorted_pairs:
        writer.writerow([int(row[0]), row[1]])
    for key in example_keys:
        if key not in keys_to_predict:
            writer.writerow([key, 0])