# For Google Drive

In [1]:
# from google.colab import drive
# drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Import Packages :

In [2]:
# basic stuffs
import csv
import time
import sys
import os
import math
import random as rand
from typing import Dict

# other library
import numpy as np
import pandas as pd

# visualization tools
import tqdm
import matplotlib.pyplot as plt

# PyTorch library
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils import data 
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


# Fix Randomization Seed :

In [3]:
SEED = 42 # Do not modify
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
rand.seed(SEED)
np.random.seed(SEED)

#Parameters :

In [4]:
TIME_FRAME_SIZE = 5

#setting
pd.set_option('precision', 4)
pd.set_option("display.max_columns",100)

# load data from google cloud :

In [3]:
# Youchen datapath

# ccba = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/ccba.csv')
# custinfo = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/custinfo.csv')

# cdtx = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/cdtx.csv')
# dp = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/dp.csv')
# remit1 = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/remit.csv')

train_alert_date = pd.read_csv('../Preprocessing/dataset/train_x_alert_date.csv')
y = pd.read_csv('../Preprocessing/dataset/train_y_answer.csv')

training_data = pd.read_csv('./training_data_complete_5.csv')
training_label = pd.read_csv('./training_data_labels_5.csv')

testing_data = pd.read_csv('./testing_data_complete_5.csv')
testing_key = pd.read_csv('./testing_alert_key_5.csv')
# testing_ans = pd.read_csv('/content/gdrive/MyDrive/Fintech_final/public_y_answer.csv')

sample_output = pd.read_csv('../Preprocessing/dataset/sample_output.csv')

# testing_key = pd.merge(testing_key, testing_ans)

In [105]:
testing_key.head()

Unnamed: 0.1,Unnamed: 0,alert_key,sar_flag
0,0,352342,0
1,1,352866,0
2,2,352696,0
3,3,352330,0
4,4,352683,0


# Perform Onehot Pooling 

In [106]:
# Drop columns
drop_cols = ['Unnamed: 0', 
      'remit_transtime_diff_1',
      'remit_transtime_diff_2',
      'remit_transtime_diff_3',
      'remit_transtime_diff_4',
      'remit_transtime_diff_5',
      'remit_transtime_avg'
      ]
      
train = training_data.drop(columns=drop_cols)
test = testing_data.drop(columns=drop_cols)

train_label = training_label.drop(columns='Unnamed: 0')
test_keys = testing_key.drop(columns='Unnamed: 0')

In [107]:
# Determine categorical column names
cat_cols = ['occupation_code',
       'cur_type_1', 'country_1', 
       'cur_type_2', 'country_2', 
       'cur_type_3', 'country_3', 
       'cur_type_4', 'country_4', 
       'cur_type_5', 'country_5', 
       'debit_credit_1', 'tx_type_1', 'info_asset_code_1', 'fiscTxId_1', 'cross_bank_1', 'ATM_1', 'txbranch_1', 
       'debit_credit_2', 'tx_type_2', 'info_asset_code_2', 'fiscTxId_2', 'cross_bank_2', 'ATM_2', 'txbranch_2', 
       'debit_credit_3', 'tx_type_3', 'info_asset_code_3', 'fiscTxId_3', 'cross_bank_3', 'ATM_3', 'txbranch_3', 
       'debit_credit_4', 'tx_type_4', 'info_asset_code_4', 'fiscTxId_4', 'cross_bank_4', 'ATM_4', 'txbranch_4', 
       'debit_credit_5', 'tx_type_5', 'info_asset_code_5', 'fiscTxId_5', 'cross_bank_5', 'ATM_5', 'txbranch_5', 
       'trans_no_1', 'trans_no_2', 'trans_no_3', 'trans_no_4', 'trans_no_5'
       ]

# Determine pooling column names
pool_cols = ['cur_type', 'debit_credit', 'tx_type', 'info_asset_code', 'txbranch', 'country', 
        'fiscTxId', 'cross_bank', 'ATM', 'trans_no']

In [108]:
# Get train/test onehot
train[cat_cols] = train[cat_cols].astype(int)
train_onehot = pd.get_dummies(train, columns=cat_cols)

test[cat_cols] = test[cat_cols].astype(int)
test_onehot = pd.get_dummies(test, columns=cat_cols)

In [None]:
# Get existing onehot keys -> count
#  'debit_credit': [0, 1],
#  'tx_type': [1, 2, 3]
count = {}
for pool in pool_cols:
  first = True
  for c in train.columns:
    if pool in c:
      index = train[c].value_counts().index
      if first:
        count[pool] = []
        for i in range(len(index)):
          count[pool].append(index[i])
        first = False
      else:
        for i in range(len(index)):
          if index[i] not in count[pool]:
            count[pool].append(index[i])
  count[pool] = sorted(count[pool])
count

In [110]:
# Training set Pooling
train_col_names = []
train_col_datas = []
for key in count.keys():
  for value in count[key]:
    temp = None
    temp_name = key+'_'+str(value)
    for i in range(1, 6):
      col_name = key+'_'+str(i)+'_'+str(value)
      # if col_name not in train_onehot.columns:
      #   continue
      try:
        if temp is None:
          temp = train_onehot[col_name].copy()
        else:
          temp += train_onehot[col_name].copy()
      except:
        pass
    if temp is not None:
      train_col_datas.append(temp.to_numpy())
    else:
      train_col_datas.append(np.zeros(len(test)))
    train_col_names.append(temp_name)

# temp.to_numpy()
# temp_name

In [111]:
# Testing set Pooling
test_col_names = []
test_col_datas = []
for key in count.keys():
  for value in count[key]:
    temp = None
    temp_name = key+'_'+str(value)
    for i in range(1, 6):
      col_name = key+'_'+str(i)+'_'+str(value)
      # if col_name not in train_onehot.columns:
      #   continue
      try:
        if temp is None:
          temp = test_onehot[col_name].copy()
        else:
          temp += test_onehot[col_name].copy()
      except:
        pass
    if temp is not None:
      test_col_datas.append(temp.to_numpy())
    else:
      test_col_datas.append(np.zeros(len(test)).astype(int))
    test_col_names.append(temp_name)

In [112]:
# Categorical features
all_cat = pd.DataFrame(np.array(train_col_datas).T, columns=train_col_names)
test_cat = pd.DataFrame(np.array(test_col_datas).T, columns=test_col_names)

# Numerical features
all_num = train.drop(columns=cat_cols)
test_num = test.drop(columns=cat_cols)

# To ensure same dim when rerunning
all_label = training_label.drop(columns='Unnamed: 0')

# Train/val split
from sklearn.model_selection import train_test_split
train_cat, val_cat, train_num, val_num, train_label, val_label \
      = train_test_split(all_cat, all_num, all_label, test_size=0.2, random_state=42)

In [113]:
# Normalization of numerical feats
def normalize(X, preMax=None, preMin=None, is_train=True):
  if is_train:
    Max = X.max()
    Min = X.min()
    X_norm = (X - Min) / (Max - Min)
    
    return X_norm, Max, Min
  else:
    X_norm = (X - preMin) / (preMax - preMin)
    
    return X_norm, preMax, preMin

train_num_norm, train_max, train_min = normalize(train_num, is_train=True)
val_num_norm, _, _ = normalize(val_num, preMax=train_max, preMin = train_min, is_train=False)
test_num_norm, _, _ = normalize(test_num, preMax=train_max, preMin = train_min, is_train=False)

all_num_norm, _, _ = normalize(all_num, is_train=True)

In [114]:
train_set = pd.merge(train_cat.reset_index(), train_num_norm.reset_index()).drop(columns='index')
val_set = pd.merge(val_cat.reset_index(), val_num_norm.reset_index()).drop(columns='index')
test_set = pd.merge(test_cat.reset_index(), test_num_norm.reset_index()).drop(columns='index')

all_set = pd.merge(all_cat.reset_index(), all_num_norm.reset_index()).drop(columns='index')

# Resampling

In [None]:
train_x_y = pd.concat([train, train_label], axis=1)
train_x_y['labels'].value_counts()

0.0    18898
1.0      226
Name: labels, dtype: int64

In [None]:
from sklearn.utils import resample

def resampling(data, y_col, ratio='100_100'):
  ##################################################
  #  data: target DataFrame          #
  #  y_col: the name of y column     　#
  #  ratio: expected ratio of two classes .#
  ##################################################
  # String process
  [ratio_0, ratio_1] = ratio.split('_')
  ratio_0, ratio_1 = int(ratio_0), int(ratio_1)
  num_0, num_1 = data[y_col].value_counts()
  
  # Initialization
  n_samples = [0, 0]
  group = {}

  # Num of samples of class 0/1
  # (Switch n_samples values if minor class is 0)
  max_n = max(num_0, num_1)    # Bigger number of sample
  seg = max_n // 100       # Cut this number into 100 pieces
  n_samples[0] = num_0 if ratio_0 == 100 \
              else seg * ratio_0  # Use all samples if ratio_0 == 100, downsample if not
  n_samples[1] = seg * ratio_1  # Oversampling of minor class
  
  # Resample
  for i in [0, 1]:
    g = data[data[y_col] == i]
    group[str(i)] = resample(g, replace=True, n_samples=n_samples[i])

  # Concat two class into a DataFrame
  # (Shuffle as you wish)
  up = pd.concat(group.values())

  return up

# XGBoost

## Data

In [None]:
# upsampled_data = resampling(train_set, 'labels', ratio='70_50')
# upsampled_data['labels'].value_counts()

In [16]:
!pip install xgboost==1.7.2
import xgboost as xgb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xgboost==1.7.2
  Downloading xgboost-1.7.2-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[K     |████████████████████████████████| 193.6 MB 80 kB/s 
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 0.90
    Uninstalling xgboost-0.90:
      Successfully uninstalled xgboost-0.90
Successfully installed xgboost-1.7.2


In [None]:
# Reassign if resampled
# train_set = upsampled_data

In [115]:
train_X, train_y = train_set, train_label
val_X, val_y = val_set, val_label

all_X, all_y = all_set, all_label

## Test Chi2 (NO USE)

In [None]:
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import chi2

# chi2_selector = SelectKBest(chi2, k=250)
# X_kbest = chi2_selector.fit_transform(all_X, all_y)
# test_kbest = chi2_selector.transform(test_set)
# # print(X_kbest.shape)
# # print(chi2_selector.get_feature_names_out())

## Training

In [116]:
from sklearn.metrics import recall_score

In [117]:
def recall_n(output, target):

  comb = list(zip(output, target))
  comb.sort(key=lambda x:x[0])
  flag = False
  for i, (out, gt) in enumerate(comb):
      if gt == 1:
          if flag:
              break
          flag = True
  return (sum(target)-1) / (len(target)-i)

In [139]:
# XGBoost grid search
params = {
        'n_estimators': [400, 600, 800, 1000],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [142]:
from sklearn.feature_selection import SelectFromModel

n_rand = 20
best_recall = -1
best_model = None
best_params = None
train_used = train_X
val_used = val_X
test_used =  test_set

# Grid search
for i in range(n_rand):
  print(f'========== Trying {i+1}/{n_rand} ==========')
  # Get random hyps
  param_copy = params.copy()
  for key in params:
    param_copy[key] = params[key][np.random.randint(0, len(params[key]))]
  print(param_copy)

  # Train XGB
  xgbModel = xgb.XGBClassifier(max_delta_step=1, random_state=0)
  xgbModel.set_params(**param_copy)
  xgbModel.fit(train_used, train_y)

  # Validation
  val_pred = xgbModel.predict_proba(val_used)
  current_recall = recall_n(val_pred[:, 1].reshape(-1, 1), val_y.to_numpy())
  print('Val Recall_n =', current_recall)

  # Test
  test_pred = xgbModel.predict_proba(test_used)
  test_recall = recall_n(test_pred[:, 1].reshape(-1, 1), test_keys['sar_flag'].to_numpy())
  print('Test Recall =', test_recall)

  # Substitution
  if test_recall > best_recall:
    best_recall = test_recall
    best_model = xgbModel
    best_params = param_copy
    print("Updated")

{'n_estimators': 400, 'min_child_weight': 5, 'gamma': 1.5, 'subsample': 1.0, 'colsample_bytree': 1.0, 'max_depth': 5}
Val Recall_n = [0.02256532]
Test Recall = 0.010526315789473684
Updated
{'n_estimators': 800, 'min_child_weight': 10, 'gamma': 1.5, 'subsample': 1.0, 'colsample_bytree': 0.6, 'max_depth': 4}
Val Recall_n = [0.01536388]
Test Recall = 0.01179245283018868
Updated
{'n_estimators': 600, 'min_child_weight': 5, 'gamma': 1.5, 'subsample': 1.0, 'colsample_bytree': 1.0, 'max_depth': 3}
Val Recall_n = [0.02583862]
Test Recall = 0.011933174224343675
Updated
{'n_estimators': 800, 'min_child_weight': 1, 'gamma': 2, 'subsample': 0.8, 'colsample_bytree': 1.0, 'max_depth': 3}
Val Recall_n = [0.01751152]
Test Recall = 0.012903225806451613
Updated
{'n_estimators': 400, 'min_child_weight': 10, 'gamma': 1, 'subsample': 0.8, 'colsample_bytree': 0.6, 'max_depth': 3}
Val Recall_n = [0.02439024]
Test Recall = 0.016
Updated
{'n_estimators': 400, 'min_child_weight': 1, 'gamma': 2, 'subsample': 0.8

In [119]:
xgbModel = xgb.XGBClassifier(max_delta_step=1, random_state=0)
# xgbModel.set_params(**param_copy)
xgbModel.fit(train_X, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=1, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [27]:
# XGBoost train + val

xgbModel = xgb.XGBClassifier(max_delta_step=1, random_state=0)
xgbModel.set_params(**{'n_estimators': 600, 'min_child_weight': 5, 'gamma': 1, 'subsample': 0.8, 'colsample_bytree': 0.6, 'max_depth': 5})
xgbModel.fit(all_X, all_y)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=1, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=1, max_depth=5, max_leaves=0, min_child_weight=5,
              missing=nan, monotone_constraints='()', n_estimators=600,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [135]:
test_keys['alert_key'].to_numpy()

array([352342, 352866, 352696, ..., 364673, 364626, 364986])

In [133]:
test_pred = best_model.predict_proba(test_used)
test_recall = recall_n(test_pred[:, 1].reshape(-1, 1), test_keys['sar_flag'].to_numpy())
test_recall

0.022935779816513763

In [136]:
# Predict probability
output = []

# for i, _x in enumerate(random_search.predict_proba(test)):
for i, _x in enumerate(best_model.predict_proba(test_set)):
    output.append([test_keys['alert_key'].to_numpy()[i], _x[1]])
output = sorted(output, reverse=True, key= lambda s: s[1])
print(output)

# 考慮private alert key部分，滿足上傳條件
public_private_alert_key = sample_output['alert_key'].values
print(len(public_private_alert_key))

# For alert key not in public, add zeros
for key in public_private_alert_key:
  # print(key)
  if key not in test_keys['alert_key'].to_numpy():
    output.append([key, 0])

print(len(output))

predict_alert_key, predict_probability = [], []
for key, prob in output:
  predict_alert_key.append(key)
  predict_probability.append(prob)

df_predicted = pd.DataFrame({
    "alert_key": predict_alert_key,
    "probability": predict_probability
})

df_predicted.to_csv('prediction_baseline.csv', index=False)

[[363337, 0.17530043], [358988, 0.1666941], [364626, 0.15636669], [361145, 0.14563857], [358005, 0.13293348], [358457, 0.1297754], [357510, 0.12756294], [355198, 0.12202708], [355633, 0.121297345], [356628, 0.120788], [361299, 0.11613604], [361569, 0.11480286], [353084, 0.10816047], [353872, 0.107232906], [353566, 0.10686824], [353582, 0.10650947], [356388, 0.106335245], [360041, 0.106335245], [364698, 0.1047027], [363033, 0.100562796], [361011, 0.09870499], [360839, 0.09808823], [352501, 0.097785465], [354449, 0.09722119], [357098, 0.09562195], [364223, 0.09480139], [355801, 0.09192406], [363168, 0.09166245], [361303, 0.08876661], [358467, 0.08730461], [358235, 0.086821206], [353413, 0.08635598], [364332, 0.079964735], [355724, 0.07977168], [362483, 0.079552226], [359384, 0.078546524], [354814, 0.07747777], [357788, 0.07692047], [363446, 0.07601278], [354045, 0.074970625], [358229, 0.07462394], [352700, 0.07380451], [354143, 0.07341481], [353494, 0.07308405], [364633, 0.07308254], [35