# DNN
Our result is generated by this script. The first line of each block briefly explain the code. 

### Install and import package

In [5]:
# !pip install pandas
# !pip install gdown

In [6]:
import numpy as np
import pandas as pd
import csv
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader, random_split
import math

### Data

In [7]:
# Download processed data

!gdown 1vHDhZSPmhithLVRRzkNw0ak_kk7PhInu # V_trade
!gdown 145T8z3XXlsaISzWdrJgJVANGokG1XuFI # V_remit
!gdown 1AubrUmeNUgpgiOu4tay6Gwl8O3lBaokF # V_info
!gdown 1zZo9RLt3mMmJZxEETSY2g9ND31qkZIn0 # V_cred
!gdown 1uFCx21bqE3FnrdfvN_mwtw2-nvogTEex # V_cons
!gdown 1ZOXGT_rIdEGIliHGKEH3ha77ZlZyq1Gn # train_y
!gdown 1qjEwmi97OWdshSNdgQj2ccXnoM4UvT25 # V_trade_public
!gdown 1g8trBiC6OxuoTU94u_UMygVrA-fSASpB # V_remit_public
!gdown 14KTfY56Mz2xBXdP27GvGB2HVeb_4Ks4T # V_info_public
!gdown 1EaIWnjQxUl4KRgVCqYT7AB4PaSNvc_GL # V_cred_public
!gdown 1owf1urxHZywAxJfCgVXCpXEMQ6VZGhnO # V_cons_public

Downloading...
From: https://drive.google.com/uc?id=1vHDhZSPmhithLVRRzkNw0ak_kk7PhInu
To: /workspace/V_trade.csv
100%|█████████████████████████████████████████| 230M/230M [00:00<00:00, 240MB/s]
Downloading...
From: https://drive.google.com/uc?id=145T8z3XXlsaISzWdrJgJVANGokG1XuFI
To: /workspace/V_remit.csv
100%|███████████████████████████████████████| 90.4M/90.4M [00:00<00:00, 149MB/s]
Downloading...
From: https://drive.google.com/uc?id=1AubrUmeNUgpgiOu4tay6Gwl8O3lBaokF
To: /workspace/V_info.csv
100%|████████████████████████████████████████| 557k/557k [00:00<00:00, 28.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1zZo9RLt3mMmJZxEETSY2g9ND31qkZIn0
To: /workspace/V_cred.csv
100%|███████████████████████████████████████| 12.3M/12.3M [00:00<00:00, 186MB/s]
Downloading...
From: https://drive.google.com/uc?id=1uFCx21bqE3FnrdfvN_mwtw2-nvogTEex
To: /workspace/V_cons.csv
100%|█████████████████████████████████████████| 105M/105M [00:00<00:00, 179MB/s]
Downloading...
From: https://driv

In [8]:
# Read data as csv

V_cons = pd.read_csv('V_cons.csv').iloc[:, 1:]
V_cred = pd.read_csv('V_cred.csv').iloc[:, 1:]
V_info = pd.read_csv('V_info.csv').iloc[:, 1:]
V_remit = pd.read_csv('V_remit.csv').iloc[:, 1:]
V_trade = pd.read_csv('V_trade.csv').iloc[:, 1:]
train_y = pd.read_csv('train_y.csv').iloc[:, 1:]

V_cons_public = pd.read_csv('V_cons_public.csv').iloc[:, 1:]
V_cred_public = pd.read_csv('V_cred_public.csv').iloc[:, 1:]
V_info_public = pd.read_csv('V_info_public.csv').iloc[:, 1:]
V_remit_public = pd.read_csv('V_remit_public.csv').iloc[:, 1:]
V_trade_public = pd.read_csv('V_trade_public.csv').iloc[:, 1:]

public_x_alert_date = pd.read_csv('public_x_alert_date.csv')
all_keys = pd.read_csv('all_keys.csv')

In [9]:
# concatenate dataframes to get the entire training/testing data
# some values are missing, fill them with 0

V_overall = pd.concat([V_info, V_cred, V_cons, V_remit, V_trade], axis=1).fillna(0)
V_overall_public = pd.concat([V_info_public, V_cred_public, V_cons_public, V_remit_public, V_trade_public], axis=1).fillna(0)

In [10]:
# verify the shape of dataframes

print(V_info.shape)
print(V_cred.shape)
print(V_cons.shape)
print(V_remit.shape)
print(V_trade.shape)
print(V_overall.shape)
print()
print(V_info_public.shape)
print(V_cred_public.shape)
print(V_cons_public.shape)
print(V_remit_public.shape)
print(V_trade_public.shape)
print(V_overall_public.shape)
print()
print(train_y.shape)
print()
print(public_x_alert_date.shape)
print(all_keys.shape)

(23906, 4)
(23906, 117)
(23906, 1965)
(23906, 1572)
(23906, 3537)
(23906, 7195)

(1845, 4)
(1845, 117)
(1845, 1965)
(1845, 1572)
(1845, 3537)
(1845, 7195)

(23906, 1)

(1845, 2)
(25751, 1)


### Settings
There are three choice of cost function. The best result is generated by "linear". 

In [11]:
# set parameters

n_epoch = 33  # number of epochs
batch = 128  # batch size
lr = 0.0000001  # learning rate
w = 1  # penalty weight for false negative
d = 99  # duplicate d times for SAR_flag == 1 (oversampling)

In [12]:
# define network

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.network = nn.Sequential( # 7195 -> 1
            nn.Linear(7195, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid() # last one must be sigmoid 
        )
                
    def forward(self, x):
        x = self.network(x)
        return x

In [13]:
# loss function (linear)

def loss_function(prob, ans):
    return (w * (1 - prob) * ans + (prob) * (1 - ans))

In [14]:
# loss function (quadratic)

# def loss_function(prob, ans):
#     # a * x**n
#     a = 2
#     n = 2
#     return (w * a * (1 - prob)**n * ans + a * (prob)**n * (1 - ans))

In [15]:
# loss function (log)

# prob = torch.minimum(prob, torch.full((prob.size(dim=0), 1), 0.9999999).to(device))
# prob = torch.maximum(prob, torch.full((prob.size(dim=0), 1), 0.0000001).to(device))
# return (w * torch.log(1 - prob) * ans + torch.log(prob) * (1 - ans))

### Main

In [16]:
SEED = 1234
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)

In [17]:
# training process

def train(train_data, val_data, model, n_epoch, batch, lr, device):
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    best_loss = 1000000
    for epoch in range(n_epoch):
        total_loss = 0
        model.train()
        idx = 0
        for data, ans in train_data:
            data, ans = data.to(device), ans.to(device)
            prob = model(data)
            loss = torch.sum(loss_function(prob, ans)) / batch
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += (loss.item() / len(train_data))
            print('[Epoch %d | %d/%d] loss: %.4f' % ((epoch+1), idx*batch, len(train_data) * batch, loss.item()), end='\r')
            idx += 1
        print("\n  Training  | Loss:%.4f " % total_loss)

        # validation
        model.eval()
        total_loss = 0
        idx = 0 
        with torch.no_grad():
            for data, ans in val_data:
                data, ans = data.to(device), ans.to(device)
                prob = model(data)
                loss = torch.sum(loss_function(prob, ans)) / batch
                total_loss += (loss.item() / len(val_data))
                idx += 1
            print(" Validation | Loss:%.4f " % total_loss)
            
        torch.save(model.state_dict(), "%s" % "model.pth")
    return model

In [18]:
# Define the whole dataset
# Oversampling: we duplicate the samples with SAR_flag=1 for d times
# d=99 let the number of samples of flag=0 and flag=1 matches (SAR_rate = 0.5)

class TrainDataset(Dataset):
    def __init__(self):
        # oversampling
        V_overall_list = V_overall.values.tolist()
        train_y_list = train_y.values.tolist()
        l = len(train_y_list)
        s = sum(sum(train_y_list,[]))
        print("Before oversampling: total=:", l, "flag=0:", l - s, "flag=1:", s, "SAR_rate=", s / l)        
        for i in range(l):
            if train_y_list[i][0] == 1:
                V_overall_list.extend([V_overall_list[i] for j in range(d)])
                train_y_list.extend([train_y_list[i] for j in range(d)])
        l = len(train_y_list)
        s = sum(sum(train_y_list,[]))
        print("After oversampling: total=:", l, "flag=0:", l - s, "flag=1:", s, "SAR_rate=", s / l)    
        
        self.X = torch.tensor(V_overall_list).to(torch.float32)
        self.Y = torch.tensor(train_y_list).to(torch.float32)
  
    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

In [19]:
# Define testing dataset

class TestDataset(Dataset):
    def __init__(self):
        self.X = torch.tensor(V_overall_public.values).to(torch.float32)

    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        return self.X[idx]

Please note that, since we do oversampling with a huge multiplicity (d=99), if we split the whole data set into training set and validation set randomly, many samples will occur in both set, so the validation is biased. Therefore, we do not do validation here. 

However, we keep the validation part in the script anyway. In the block below, the validation set is set to the same as training set. You can just ignore it. During training process, we evaluate it but do not use it to determine when to stop the programm since the validation loss is meaningless. 

In [20]:
trainset = TrainDataset()
train_dataloader = DataLoader(trainset, batch, True)
val_dataloader = DataLoader(trainset, batch, True)

Before oversampling: total=: 23906 flag=0: 23672 flag=1: 234 SAR_rate= 0.009788337655818623
After oversampling: total=: 47072 flag=0: 23672 flag=1: 23400 SAR_rate= 0.4971108089734874


In [21]:
# start training! 

device = 'cuda:0'
model = Net().to(device) 
model = train(train_dataloader, val_dataloader, model, n_epoch, batch, lr, device)

[Epoch 1 | 46976/47104] loss: 0.3203
  Training  | Loss:0.4802 
 Validation | Loss:0.4789 
[Epoch 2 | 46976/47104] loss: 0.3640
  Training  | Loss:0.4775 
 Validation | Loss:0.4759 
[Epoch 3 | 46976/47104] loss: 0.3712
  Training  | Loss:0.4746 
 Validation | Loss:0.4728 
[Epoch 4 | 46976/47104] loss: 0.2969
  Training  | Loss:0.4712 
 Validation | Loss:0.4692 
[Epoch 5 | 46976/47104] loss: 0.3868
  Training  | Loss:0.4671 
 Validation | Loss:0.4635 
[Epoch 6 | 46976/47104] loss: 0.3781
  Training  | Loss:0.4618 
 Validation | Loss:0.4602 
[Epoch 7 | 46976/47104] loss: 0.2877
  Training  | Loss:0.4591 
 Validation | Loss:0.4578 
[Epoch 8 | 46976/47104] loss: 0.2974
  Training  | Loss:0.4567 
 Validation | Loss:0.4553 
[Epoch 9 | 46976/47104] loss: 0.3886
  Training  | Loss:0.4537 
 Validation | Loss:0.4521 
[Epoch 10 | 46976/47104] loss: 0.3698
  Training  | Loss:0.4506 
 Validation | Loss:0.4486 
[Epoch 11 | 46976/47104] loss: 0.3312
  Training  | Loss:0.4467 
 Validation | Loss:0.444

### Predict

In [22]:
# load model and predict testing data

best_model = model
best_model.load_state_dict(torch.load("model.pth"))
best_model = best_model.eval()

testset = TestDataset()
test_dataloader = DataLoader(testset, 1, False)
result = []
for x in test_dataloader:
    x = x.to(device)
    result.append(best_model(x).item())

In [23]:
# generate (key, probability) pairs

keys_to_predict = sorted(public_x_alert_date['alert_key'].values.tolist())
pairs = np.array(list(zip(keys_to_predict, result)))
sorted_pairs = np.flip(pairs[pairs[:, 1].argsort()], 0)

In [24]:
# generate output file 

example_keys = []
with open('example.csv', newline='') as example:
    rows = csv.reader(example)
    headers = next(rows)
    for row in rows:
        example_keys.append(int(row[0]))
        
with open('predict.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['alert_key','probability'])
    for row in sorted_pairs:
        writer.writerow([int(row[0]), row[1]])
    for key in example_keys:
        if key not in keys_to_predict:
            writer.writerow([key, 0])

### Score
In this part we load the answer of public testcase published by the competition just to calculate the score. The answer is not used to train the model. 

In [25]:
ESun_public_y_answer = pd.read_csv('ESun_public_y_answer.csv')

index_list = []
SAR_count = 0
for key, flag in ESun_public_y_answer.values.tolist():
    if flag == 1:
        SAR_count += 1
        for idx in range(len(sorted_pairs)):
            if key == sorted_pairs[idx][0]:
                index_list.append(idx + 1)
                break
index_list.sort()
print("score: ", str((SAR_count - 1) / index_list[-2]))

score:  0.010351966873706004
