In [19]:
import numpy as np
import pandas as pd
import importlib
import sys
# import keras
from tqdm import tqdm
import pickle
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.optimizers import SGD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.utils import class_weight
from sklearn.utils import resample
from collections import defaultdict
from collections import Counter

from classifiers.cnn2d import Classifier_CNN2D
from classifiers.fcn2d import Classifier_FCN2D
from classifiers.resnet import Classifier_RESNET
importlib.reload(sys.modules[Classifier_CNN2D.__module__])
importlib.reload(sys.modules[Classifier_FCN2D.__module__])
importlib.reload(sys.modules[Classifier_RESNET.__module__])

<module 'classifiers.resnet' from '/home/aasleptsov98/Coursework/cnn_vs_rnn/classifiers/resnet.py'>

In [3]:
data_name = "balance_2d.data.pkl"

In [4]:
app_train = pd.read_csv("input/application_train.csv")
credit_balance = pd.read_csv("input/credit_card_balance.csv")

In [5]:
credit_balance = credit_balance.fillna(0)

In [6]:
ohe = preprocessing.OneHotEncoder()
categories = np.array(list(set(credit_balance["NAME_CONTRACT_STATUS"].astype(str).values))).reshape(-1,1)
ohe.fit(categories)

OneHotEncoder()

In [7]:
encoded = np.array(ohe.transform(credit_balance["NAME_CONTRACT_STATUS"].values.reshape(-1,1)).todense())
temp_df = pd.DataFrame(encoded, columns=range(encoded.shape[1]))

In [8]:
credit_balance = pd.concat([credit_balance.drop(["NAME_CONTRACT_STATUS"], axis=1), temp_df], axis=1)

In [9]:
n_timesteps = 80
n_features = credit_balance.shape[1] - 2
n_credits = 3

In [10]:
targets = dict()
for index, row in tqdm(app_train.iterrows()):
    targets[row["SK_ID_CURR"]] = row["TARGET"]

307511it [00:29, 10361.50it/s]


In [11]:
credits_X = defaultdict(list)
credits_y = defaultdict(int)

for id_cur, df_id_cur in tqdm(credit_balance.groupby(by=["SK_ID_PREV", "SK_ID_CURR"])):
    if len(df_id_cur) > 0 and id_cur[1] in targets:
#         X.append(df_id_cur.sort_values(by=['MONTHS_BALANCE'])[-n_timesteps:].drop(["SK_ID_PREV", "SK_ID_CURR"], axis=1).values)
#         y.append(targets[id_cur[1]])
        
        credits_X[id_cur[1]].append(df_id_cur.sort_values(by=['MONTHS_BALANCE'])[-n_timesteps:].drop(["SK_ID_PREV", "SK_ID_CURR"], axis=1).values)
        credits_y[id_cur[1]] = targets[id_cur[1]]

100%|██████████| 104307/104307 [02:07<00:00, 819.98it/s]


In [12]:
X = []
y = []

for key in credits_X.keys():
    X.append(credits_X[key])
    y.append(credits_y[key])

In [12]:
with open(data_name, "wb") as f:
    pickle.dump((X, y), f)

In [19]:
lens = []
for data in X:
    lens.append(len(data))
print(max(lens), min(lens), sum(lens) / len(lens))

80 1 34.376675204683714


In [13]:
for i in range(len(X)):
    for j in range(len(X[i])):
        X[i][j] = np.append(np.zeros((max(n_timesteps - X[i][j].shape[0], 0), X[i][j].shape[1]), dtype=int), X[i][j], axis=0)
    for j in range(n_credits - len(X[i])):
        X[i].append(np.zeros((n_timesteps, n_features), dtype=int))
    if len(X[i]) > n_credits:
        X[i] = X[i][:n_credits]

In [14]:
X = np.rollaxis(np.array(X), 1, 4)
y = np.array(y)
bincount_classes = np.bincount(y)
weights = {0: bincount_classes[0] / np.max(bincount_classes), 1: bincount_classes[1] / np.max(bincount_classes)}

In [None]:
#upsample
minority_class = X[y==1]
bc = np.bincount(y)
upsampled_minority_class = resample(minority_class, replace=True, n_samples=bc[0]-bc[1], random_state=123)
upsampled_X = np.append(X, upsampled_minority_class, axis=0)
upsampled_y = np.append(y, np.ones(bc[0]-bc[1], dtype=int))
weights = {0: 1, 1: 1}

In [205]:
np.rollaxis(X, 3, 2).shape

(86905, 3, 27, 80)

In [15]:
train_ratio = 0.8
validation_ratio = 0.1
test_ratio = 0.1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 - train_ratio)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio))

In [16]:
batch_size = 64
epochs = 30

In [215]:
X[0].shape

(80, 27, 3)

In [None]:
model_cnn = Classifier_CNN2D(X[0].shape, nb_classes=2, verbose=True)
model_cnn.fit(X_train, y_train, X_val, y_val, batch_size=batch_size, epochs=epochs)

# FCN

In [None]:
model_fcn = Classifier_FCN2D(X[0].shape, nb_classes=2)
model_fcn.fit(X_train, y_train, X_val, y_val, batch_size=batch_size, epochs=epochs)

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 80, 27, 3)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 80, 27, 128)       24704     
_________________________________________________________________
batch_normalization (BatchNo (None, 80, 27, 128)       512       
_________________________________________________________________
activation (Activation)      (None, 80, 27, 128)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 80, 27, 256)       819456    
_________________________________________________________________
batch_normalization_1 (Batch (None, 80, 27, 256)       1024      
_________________________________________________________________
activation_1 (Activation)    (None, 80, 27, 256)      

In [31]:
y_pred_fcn = model_fcn.predict(X_test)
roc_auc_score(y_test, y_pred_fcn) #0.6824 - imbalanced, 0.7854 - balanced (upsampled)

0.6727910449383746

# ResNet

In [None]:
model_resnet = Classifier_RESNET(X[0].shape, nb_classes=2, verbose=True)
model_resnet.fit(X_train, y_train, X_val, y_val)

In [None]:
y_pred_resnet = model_resnet.predict(X_test)
roc_auc_score(y_test, y_pred_resnet)