In [46]:
import numpy as np
import pandas as pd
import csv
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import math

# Read data

In [47]:
# !gdown 1vHDhZSPmhithLVRRzkNw0ak_kk7PhInu
# !gdown 145T8z3XXlsaISzWdrJgJVANGokG1XuFI
# !gdown 1AubrUmeNUgpgiOu4tay6Gwl8O3lBaokF
# !gdown 1zZo9RLt3mMmJZxEETSY2g9ND31qkZIn0
# !gdown 1uFCx21bqE3FnrdfvN_mwtw2-nvogTEex
# !gdown 1ZOXGT_rIdEGIliHGKEH3ha77ZlZyq1Gn

In [48]:
train_y = pd.read_csv('train_y.csv').iloc[:, 1:]
V_cons = pd.read_csv('V_cons.csv').iloc[:, 1:]
V_cred = pd.read_csv('V_cred.csv').iloc[:, 1:]
V_info = pd.read_csv('V_info.csv').iloc[:, 1:]
V_remit = pd.read_csv('V_remit.csv').iloc[:, 1:]
V_trade = pd.read_csv('V_trade.csv').iloc[:, 1:]
public_ans = pd.read_csv('ESun_public_y_answer.csv')
all_keys = pd.read_csv('all_keys.csv')

In [49]:
V_overall = pd.concat([V_info, V_cred, V_cons, V_remit, V_trade], axis=1)

In [50]:
print(train_y.shape)
print(V_info.shape)
print(V_cred.shape)
print(V_cons.shape)
print(V_remit.shape)
print(V_trade.shape)
print(V_overall.shape)
print(public_ans.shape)
print(all_keys.shape)

(23906, 1)
(23906, 4)
(23906, 117)
(23906, 1965)
(23906, 1572)
(23906, 3537)
(23906, 7195)
(1845, 2)
(25751, 1)


In [51]:
V_info

Unnamed: 0,0,1,2,3
0,3,12.0,241719.0,3
1,3,13.0,599497.0,6
2,1,19.0,51160.0,4
3,3,9.0,3634343.0,6
4,1,17.0,4076287.0,4
...,...,...,...,...
23901,1,17.0,12207.0,2
23902,1,17.0,259985.0,4
23903,3,19.0,928963.0,3
23904,3,19.0,21647.0,4


In [52]:
public_ans

Unnamed: 0,alert_key,sar_flag
0,352342,0
1,352866,0
2,352696,0
3,352330,0
4,352683,0
...,...,...
1840,364472,0
1841,364788,0
1842,364673,0
1843,364626,0


In [53]:
all_keys

Unnamed: 0,alert_key
0,352249
1,352253
2,352254
3,352280
4,352282
...,...
25746,352123
25747,352124
25748,352125
25749,352128


# Main

In [138]:
# set parameters
n_epoch = 20
batch = 1 #
lr = 0.01
w = 5

In [139]:
# define network
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.network = nn.Sequential( # 7195 -> 1
            nn.Linear(7195, 1024),
            nn.Sigmoid(),
            nn.Linear(1024, 256),
            nn.Sigmoid(),
            nn.Linear(256, 64),
            nn.Sigmoid(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
                
    def forward(self, x):
        #print(x)
        x = self.network(x)
        return x

In [140]:
def loss_function(prob, ans):
    #print(prob, ans)
    # return - w * math.log(1 - prob) * ans - math.log(prob) * (1 - ans)
    return - w * (1 - prob) * ans - (prob) * (1 - ans)

In [141]:
def train(train_data, val_data, model, n_epoch, batch, lr, device):
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    best_loss = 1000000
    for epoch in range(n_epoch):
        total_loss = 0
        model.train()
        idx = 0
        for data, ans in train_data:
            data, ans = data.to(device), ans.to(device)
            #print(data, ans)
            prob = model(data)
            loss = loss_function(prob, ans)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += (loss.item() / len(train_data))
            print('[Epoch %d | %d/%d] loss: %.4f' % ((epoch+1), idx*batch, len(train_data) * batch, loss.item()), end='\r')
            idx += 1
        print("\n  Training  | Loss:%.4f " % total_loss)

        # validation set
        model.eval()
        total_loss = 0
        idx = 0 
        with torch.no_grad():
            for data, ans in val_data:
                data, ans = data.to(device), ans.to(device)
                prob = model(data)
                loss = loss_function(prob, ans)
                total_loss += (loss.item() / len(val_data))
                idx += 1
            print(" Validation | Loss:%.4f " % total_loss)
        # save model
        if total_loss < best_loss:
                best_loss = total_loss
                print("saving model with loss %.4f...\n" % total_loss)
                torch.save(model.state_dict(), "%s" % "model.pth")
    return model

In [142]:
class TrainDataset(Dataset):
    def __init__(self):
        self.X = torch.tensor(V_overall.values).to(torch.float32)
        self.Y = torch.tensor(train_y.values).to(torch.float32)
  
    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

# class ValDataset(Dataset):
#     def __init__(self):
#         keys_to_predict = public_ans['alert_key'].values.tolist()
#         all_keys_sorted = sorted(all_keys['alert_key'].values.tolist())
#         print(all_keys_sorted)
#         X = []
#         for target in keys_to_predict:
#             print(target)
#             idx = all_keys_sorted.index(target)
#             print(idx)
#             X.append(V_overall.iloc[idx].values.tolist())
#         Y = public_ans['sar_flag'].values.tolist()
#         self.X = torch.tensor(X).to(torch.float32)
#         self.Y = torch.tensor(Y).to(torch.float32)

#     def __len__(self):
#         return self.X.size(0)

#     def __getitem__(self, idx):
#         return self.X[idx], self.Y[idx]

In [143]:
trainset = TrainDataset()
# valset = ValDataset()
train_dataloader = DataLoader(trainset, batch, True)
# val_dataloader = DataLoader(valset, batch, False)

In [144]:
device = 'cuda:0'
model = Net().to(device) 
# model = train(train_dataloader, val_dataloader, model, n_epoch, batch, lr, device)
model = train(train_dataloader, train_dataloader, model, n_epoch, batch, lr, device)

[Epoch 1 | 23905/23906] loss: nan
  Training  | Loss:nan 


NameError: name 'val_x' is not defined

# Result

In [None]:
best_model = model
best_model.load_state_dict(torch.load("model.pth"))
best_model = best_model.eval()

result = []
for x, _ in val_dataloader:
    x = x.to(device)
    result.append(best_model(x))

In [None]:
keys_to_prerdict = public_ans['alert_key'].values.tolist()
pairs = np.array(zip(keys_to_predict, result))
sorted_pairs = np.flip(pairs[pairs[:, 2].argsort()], 0)
print(sorted_pairs)

In [None]:
index_list = []
SAR_count = 0
for key, flag in public_ans.iterrows():
    if flag == 1:
        SAR_count += 1
        for idx in range(len(sorted_pairs)):
            if key == sorted_pairs[idx][0]:
                index_list.append(idx + 1)
                break
print(len(index_list) == SAR_count)
index_list.sort()
print(index_list[-2])
print((SAR_count - 1) / index_list[-2])