# For Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Import Packages :

In [2]:
# basic stuffs
import csv
import time
import sys
import os
import math
import random as rand
from typing import Dict

# other library
import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split

# visualization tools
import tqdm
import matplotlib.pyplot as plt

# PyTorch library
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data 
from torch.utils.data import Dataset, DataLoader

# imbalanced
from imblearn.over_sampling import SVMSMOTE

# Fix Randomization Seed :

In [3]:
SEED = 5566 # Do not modify
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
rand.seed(SEED)
np.random.seed(SEED)

#Parameters :

In [4]:
TIME_FRAME_SIZE = 20
EPOCH_NUM = 300
batch_size = 3000
HIDEN_SIZE = 256
LR = 3.4e-4

#setting
pd.set_option('precision', 4)
pd.set_option("display.max_columns",100)

# load prerprocessed data :

In [5]:
train = pd.read_csv('/content/gdrive/MyDrive/Fintech/esun_dataset/training_data_complete_5.csv')
train_y = pd.read_csv('/content/gdrive/MyDrive/Fintech/esun_dataset/training_data_labels_5.csv').values
test = pd.read_csv('/content/gdrive/MyDrive/Fintech/esun_dataset/testing_data_complete_5.csv')
test_y = pd.read_csv('/content/gdrive/MyDrive/Fintech/esun_dataset/public_y_answer.csv').values
test_alert_keys = pd.read_csv('/content/gdrive/MyDrive/Fintech/esun_dataset/testing_alert_key_5.csv').values
all_alert_keys = pd.read_csv('/content/gdrive/MyDrive/Fintech/esun_dataset/sample_submission.csv')

In [6]:
train = train.drop('Unnamed: 0',axis=1)

In [7]:
non_time_related_cols = []
cols_1 = []
cols_2 = []
cols_3 = []
cols_4 = []
cols_5 = []
for col in train.columns:
  if col[-1]=='1':
    cols_1.append(col)
  elif col[-1]=='2':
    cols_2.append(col)
  elif col[-1]=='3':
    cols_3.append(col)
  elif col[-1]=='4':
    cols_4.append(col)
  elif col[-1]=='5':
    cols_5.append(col)
  else :
    non_time_related_cols.append(col)
cols_1 = cols_1+non_time_related_cols
cols_2 = cols_2+non_time_related_cols
cols_3 = cols_3+non_time_related_cols
cols_4 = cols_4+non_time_related_cols
cols_5 = cols_5+non_time_related_cols

In [8]:
# non_time_related_cols

In [9]:
df_1 = train[cols_1]
df_2 = train[cols_2]
df_3 = train[cols_3]
df_4 = train[cols_4]
df_5 = train[cols_5]

In [10]:
train_x = []
for i, row in df_1.iterrows():
  train_x.append(row.values) 
for i, row in df_2.iterrows():
  train_x[i]=np.append(train_x[i],row.values) 
for i, row in df_3.iterrows():
  train_x[i]=np.append(train_x[i],row.values) 
for i, row in df_4.iterrows():
  train_x[i]=np.append(train_x[i],row.values) 
for i, row in df_5.iterrows():
  train_x[i]=np.append(train_x[i],row.values) 
train_x = np.array(train_x)
train_x = np.reshape(train_x,[df_1.shape[0],5,df_1.shape[1]])

In [11]:
df_1 = test[cols_1]
df_2 = test[cols_2]
df_3 = test[cols_3]
df_4 = test[cols_4]
df_5 = test[cols_5]

In [12]:
test_x = []
for i, row in df_1.iterrows():
  test_x.append(row.values) 
for i, row in df_2.iterrows():
  test_x[i]=np.append(test_x[i],row.values) 
for i, row in df_3.iterrows():
  test_x[i]=np.append(test_x[i],row.values) 
for i, row in df_4.iterrows():
  test_x[i]=np.append(test_x[i],row.values) 
for i, row in df_5.iterrows():
  test_x[i]=np.append(test_x[i],row.values) 
test_x = np.array(test_x)
test_x = np.reshape(test_x,[df_1.shape[0],5,df_1.shape[1]])

In [13]:
train_y = train_y[:,1]
train_y = np.reshape(train_y,[train_y.size,1])

In [14]:
test_y = test_y[:,1]
test_y = np.reshape(test_y,[test_y.size,1])

In [15]:
train_x = np.nan_to_num(train_x, nan=0)

In [16]:
test_x = np.nan_to_num(test_x, nan=0)

# load dataset :

In [17]:
class Datasets(torch.utils.data.Dataset):
    def __init__(
        self,
        data
    ):  
        super(Datasets).__init__()
        self.input = torch.from_numpy(data[0]).float()
        self.targets = torch.from_numpy(data[1]).float()
        
    def __getitem__(self,index):
        return (self.input[index], self.targets[index])
    def __len__(self):
        return len(self.input)
    def collate_fn(self, samples):
        inputs = torch.vstack([sample[0] for sample in samples])
        targets = torch.vstack([sample[1] for sample in samples])
        return inputs,targets

In [18]:
train_x, val_x, train_y, val_y = train_test_split(train_x,train_y,test_size=0.2,stratify=train_y)
# val_x = train_x
# val_y = train_y

In [19]:
nsamples, nx, ny = train_x.shape
nsamples_, nx_, ny_ = test_x.shape
nsamples__, nx__, ny__ = val_x.shape

train_x = train_x.reshape((nsamples,nx*ny))
test_x = test_x.reshape((nsamples_,nx_*ny_))
val_x = val_x.reshape((nsamples__,nx__*ny__))
normalizer = Normalizer().fit(train_x)
train_x = normalizer.transform(train_x)
test_x = normalizer.transform(test_x)
val_x = normalizer.transform(val_x)
train_x = train_x.reshape((-1,nx,ny))
test_x = test_x.reshape((-1,nx_,ny_))
val_x = test_x.reshape((-1,nx__,ny__))

In [20]:
buf = train_x.reshape((nsamples,nx*ny))
train_x_over, train_y_over = SVMSMOTE().fit_resample(buf, train_y)
train_x_over = train_x_over.reshape((-1,nx,ny))

In [21]:
train_dataset = Datasets((train_x_over,train_y_over))
dev_dataset = Datasets((val_x,val_y))
test_dataset = Datasets((test_x,test_y))

output_dataset = Datasets((test_x,test_alert_keys))

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
output_loader = DataLoader(output_dataset, batch_size=batch_size, shuffle=False)

# LSTM :

In [22]:
class LSTM(torch.nn.Module):
  def __init__(self,input_size,hidden_size):

    super(LSTM,self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_layers = 12

    self.LSTM_layer = nn.LSTM(input_size,hidden_size,num_layers=self.num_layers, batch_first=True, bidirectional=True)
    self.encoder_layer = nn.TransformerEncoderLayer(d_model=input_size, nhead=2, dropout=0.1, batch_first=True)
    self.net = torch.nn.TransformerEncoder(self.encoder_layer, 7)
    self.DNN_classifier = torch.nn.Sequential(
                nn.BatchNorm1d(2*hidden_size),
                nn.Linear(2*hidden_size,256),
                nn.Linear(256,256),
                nn.Linear(256,2))    
  
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):

    x = self.net(x)
    h0 = torch.zeros(2*self.num_layers, x.shape[0], self.hidden_size).to(device)
    c0 = torch.zeros(2*self.num_layers, x.shape[0], self.hidden_size).to(device)
    x, (hn, cn) = self.LSTM_layer(x,(h0,c0))

    x = x[:,-1,:]
    x = self.relu(x)
    x = self.DNN_classifier(x)
    probabilities = torch.nn.functional.softmax(x)
    return probabilities[:,1]

In [23]:
class FScoreLoss(nn.Module):
  def __init__(self, beta=2, eps=1e-7):
    super(FScoreLoss, self).__init__()
    self.beta = beta
    self.eps = eps
  def forward(self, output, target):
    tp = (target * output).sum().to(torch.float32)
    fn = ((1 - target) * output).sum().to(torch.float32)
    fp = (target * (1 - output)).sum().to(torch.float32)

    precision = tp / (tp + fp + self.eps)
    recall = tp / (tp + fn + self.eps)

    f_score_loss = (1 + self.beta ** 2) * (precision * recall) / ((self.beta**2)*precision + recall + self.eps)
    return f_score_loss

In [24]:
# aquire np format
def recall_n(output, target):
    comb = list(zip(output, target))
    comb.sort(key=lambda x:x[0])
    flag = False
    for i, (out, gt) in enumerate(comb):
      try:
        if gt[0] == 1:
          if flag:
              break
          flag = True
      except:
        if gt == 1:
          if flag:
              break
          flag = True
    return (sum(np.squeeze(target))-1) / (len(target)-i)

In [None]:
model = LSTM(input_size=train_x.shape[2],hidden_size=HIDEN_SIZE)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# loss_fn = FScoreLoss()
loss_fn = nn.BCELoss()
# loss_fn = nn.HingeEmbeddingLoss()

loss = []
recall = []
max_recall = 0

for epoch in range(EPOCH_NUM):
  # training
  model.train()
  batch_losses = []
  outputs = []
  targets = []
  for i, (x_batch, y_batch) in enumerate(train_loader):
    x_batch = x_batch.to(device)
    y_batch = y_batch.to(device)
    yhat = model(x_batch)
    batch_loss = loss_fn( yhat, y_batch)
    optimizer.zero_grad()
    batch_loss.backward()            
    optimizer.step()
    batch_losses.append(batch_loss.cpu().detach().numpy())
  train_loss = np.mean(batch_losses)
  with torch.no_grad():

    # valid
    outputs = []
    targets = []
    batch_val_losses = []
    for i, (x_val, y_val) in enumerate(dev_loader):
      x_val = x_val.to(device)
      y_val = y_val.to(device)
      model.eval()
      yhat = model(x_val).unsqueeze(1)
      val_loss = loss_fn(y_val, yhat)
      batch_val_losses.append(val_loss.cpu().detach().numpy())
      outputs += yhat.detach().cpu().numpy().tolist()
      targets += y_val.detach().cpu().numpy().tolist()
    valid_loss = np.mean(batch_val_losses)
    loss.append([train_loss,valid_loss])
    recall_val = recall_n(outputs, targets)

    # public test
    outputs = []
    targets = []
    for i, (x_test, y_test) in enumerate(test_loader):
      x_test = x_test.to(device)
      y_test = y_test.to(device)
      model.eval()
      yhat = model(x_test).unsqueeze(1)
      outputs += yhat.detach().cpu().numpy().tolist()
      targets += y_test.detach().cpu().numpy().tolist()
    recall_public = recall_n(outputs, targets)
    recall.append([recall_val,recall_public])

    # print("Epoch: {}, train Loss: {:.4f}".format(epoch + 1, np.mean(train_loss)))
    # print("Epoch: {}, valid Loss: {:.4f}".format(epoch + 1, np.mean(valid_loss)))
    print("Epoch: {}, Recall_n val: {:.4f}".format(epoch + 1, recall_val))
    print("Epoch: {}, Recall_n pub: {:.4f}".format(epoch + 1, recall_public))
    if recall_public > max_recall :
      torch.save(model,'model.pth')
      max_recall = recall_public
      print('save!')
    print()

# loss = pd.DataFrame(loss)
# loss.columns=(["train","validation"])
# loss.plot()

recall = pd.DataFrame(recall)
recall.columns=(["validation","public"])
recall.plot()

  probabilities = torch.nn.functional.softmax(x)


Epoch: 1, Recall_n val: 0.0100
Epoch: 1, Recall_n pub: 0.0075
save!

Epoch: 2, Recall_n val: 0.0106
Epoch: 2, Recall_n pub: 0.0081
save!

Epoch: 3, Recall_n val: 0.0096
Epoch: 3, Recall_n pub: 0.0085
save!



In [None]:
recall.iloc[:100].plot(grid = True)

In [None]:
recall = pd.DataFrame(recall)
recall.columns=(["validation","public"])
recall.plot()

In [None]:
recall.sort_values(['validation'],ascending=False).head(10)

In [None]:
recall.sort_values(['public'],ascending=False).head(10)

In [None]:
loss = pd.DataFrame(loss)
loss.columns=(["train","validation"])
loss.plot()

In [None]:
from google.colab import runtime
runtime.unassign()

In [None]:
model = model.to(device)
model.eval()

outputs = {}

for i, (x_test, alert_keys) in enumerate(output_loader):
  x_test = x_test.to(device)
  alert_keys = alert_keys
  output = model(x_test)
  output = output.detach().cpu().numpy().tolist()
  for alert_key, out in zip(alert_keys, output):
      outputs[alert_key[1].item()] = out

for i, row in all_alert_keys.iterrows():
  if row['alert_key'] not in outputs:
      outputs[row['alert_key']] = 0
    
submit = pd.DataFrame(
  data={
      'alert_key': list(outputs.keys()), 
      'probability': list(outputs.values())
  }
)

submit['alert_key'] = submit['alert_key'].astype(int)
submit.sort_values(by='probability',ascending=False, inplace=True)
submit.to_csv(f'submission.csv', index=None)