In [1]:
import numpy as np
import pandas as pd
import importlib
import sys
# import keras
from tqdm import tqdm
import pickle
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.optimizers import SGD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.utils import class_weight
from sklearn.utils import resample
from collections import defaultdict
from collections import Counter

from classifiers.cnn2d import Classifier_CNN2D
from classifiers.fcn2d import Classifier_FCN2D
from classifiers.resnet import Classifier_RESNET
importlib.reload(sys.modules[Classifier_CNN2D.__module__])
importlib.reload(sys.modules[Classifier_FCN2D.__module__])
importlib.reload(sys.modules[Classifier_RESNET.__module__])

<module 'classifiers.resnet' from '/home/aasleptsov98/Coursework/cnn_vs_rnn/classifiers/resnet.py'>

In [2]:
data_name = "pos_cash.data.pkl"

In [3]:
app_train = pd.read_csv("input/application_train.csv")
pos_cash_balance = pd.read_csv("input/POS_CASH_balance.csv")

In [4]:
pos_cash_balance = pos_cash_balance.fillna(0)

In [5]:
ohe = preprocessing.OneHotEncoder()
categories = np.array(list(set(pos_cash_balance["NAME_CONTRACT_STATUS"].astype(str).values))).reshape(-1,1)
ohe.fit(categories)

OneHotEncoder()

In [6]:
encoded = np.array(ohe.transform(pos_cash_balance["NAME_CONTRACT_STATUS"].values.reshape(-1,1)).todense())
temp_df = pd.DataFrame(encoded, columns=range(encoded.shape[1]))

In [7]:
pos_cash_balance = pd.concat([pos_cash_balance.drop(["NAME_CONTRACT_STATUS"], axis=1), temp_df], axis=1)

In [11]:
n_timesteps = 40
n_credits = 5
n_features = pos_cash_balance.shape[1] - 2

In [9]:
targets = dict()
for index, row in tqdm(app_train.iterrows()):
    targets[row["SK_ID_CURR"]] = row["TARGET"]

307511it [00:54, 5596.45it/s]


In [13]:
credits_X = defaultdict(list)
credits_y = defaultdict(int)

for id_cur, df_id_cur in tqdm(pos_cash_balance.groupby(by=["SK_ID_PREV", "SK_ID_CURR"])):
    if len(df_id_cur) > 0 and id_cur[1] in targets:
        credits_X[id_cur[1]].append(df_id_cur.sort_values(by=['MONTHS_BALANCE'])[-n_timesteps:].drop(["SK_ID_PREV", "SK_ID_CURR"], axis=1).values)
        credits_y[id_cur[1]] = targets[id_cur[1]]

100%|██████████| 936325/936325 [24:37<00:00, 633.59it/s] 


In [None]:
X = []
y = []

for key in credits_X.keys():
    X.append(credits_X[key])
    y.append(credits_y[key])

In [18]:
with open(data_name, "wb") as f:
    pickle.dump((X, y), f)

In [20]:
for i in range(len(X)):
    for j in range(len(X[i])):
        X[i][j] = np.append(np.zeros((max(n_timesteps - X[i][j].shape[0], 0), X[i][j].shape[1]), dtype=int), X[i][j], axis=0)
    for j in range(n_credits - len(X[i])):
        X[i].append(np.zeros((n_timesteps, n_features), dtype=int))
    if len(X[i]) > n_credits:
        X[i] = X[i][:n_credits]

In [21]:
X = np.array(X)
y = np.array(y)
bincount_classes = np.bincount(y)
weights = {0: bincount_classes[0] / np.max(bincount_classes), 1: bincount_classes[1] / np.max(bincount_classes)}

In [None]:
#upsample
minority_class = X[y==1]
bc = np.bincount(y)
upsampled_minority_class = resample(minority_class, replace=True, n_samples=bc[0]-bc[1], random_state=123)
upsampled_X = np.append(X, upsampled_minority_class, axis=0)
upsampled_y = np.append(y, np.ones(bc[0]-bc[1], dtype=int))
weights = {0: 1, 1: 1}

In [22]:
train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 - train_ratio)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio))

In [29]:
batch_size = 64
epochs = 30

# FCN

In [None]:
model_fcn = Classifier_FCN2D(X[0].shape, nb_classes=2)
model_fcn.fit(X_train, y_train, X_val, y_val, batch_size=batch_size, epochs=epochs)

Model: "functional_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 80, 27)]          0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 80, 128)           27776     
_________________________________________________________________
batch_normalization_3 (Batch (None, 80, 128)           512       
_________________________________________________________________
activation_3 (Activation)    (None, 80, 128)           0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 80, 256)           164096    
_________________________________________________________________
batch_normalization_4 (Batch (None, 80, 256)           1024      
_________________________________________________________________
activation_4 (Activation)    (None, 80, 256)          

In [None]:
y_pred_fcn = model_fcn.predict(X_test)
roc_auc_score(y_test, y_pred_fcn)

# ResNet

In [None]:
model_resnet = Classifier_RESNET(X[0].shape, nb_classes=2, verbose=True)
model_resnet.fit(X_train, y_train, X_val, y_val)

In [None]:
y_pred_resnet = model_resnet.predict(X_test)
roc_auc_score(y_test, y_pred_resnet)