In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import math
import random
import sys
import os
import logging
import datetime

import tensorflow as tf
from tensorflow import keras

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [2]:
# Default figure size (width, height in inches) for every plot in this notebook.
mpl.rc('figure', figsize=(12, 10))
# Colours of the active Matplotlib property cycle, in cycle order.
colors = [entry['color'] for entry in plt.rcParams['axes.prop_cycle']]

In [3]:
# Directory holding the Home Credit CSV files. The original cluster path stays the
# default, so existing runs are unchanged; set HOME_CREDIT_DATA_DIR to run elsewhere.
data_dir = os.environ.get(
    "HOME_CREDIT_DATA_DIR", "/ssd003/projects/pets/datasets/home_credit"
)
# Other CSVs under the same directory (reads currently disabled):
# home_credit_train_df = pd.read_csv(f"{data_dir}/home_credit_train.csv")
# credit_bureau_train_df = pd.read_csv(f"{data_dir}/credit_bureau_train.csv")
data_df = pd.read_csv(os.path.join(data_dir, "train.csv"))

In [4]:
## Dataset preparation

In [5]:
# Two-stage split with a fixed seed: 20% of all rows are held out for testing,
# then 20% of the remaining rows are held out for validation.
train_df, test_df = train_test_split(
    data_df,
    random_state=42,
    test_size=0.2,
)
train_df, val_df = train_test_split(
    train_df,
    random_state=42,
    test_size=0.2,
)

In [6]:
# Pull the 'target' column out of each split as the label vector.
# NOTE: pop() removes the column from the DataFrames in place, so this cell
# cannot be re-run without re-running the split cell first.
train_labels, val_labels, test_labels = (
    np.array(frame.pop('target')) for frame in (train_df, val_df, test_df)
)

# Whatever columns remain after the pop become the feature matrices.
train_features, val_features, test_features = (
    np.array(frame) for frame in (train_df, val_df, test_df)
)

In [7]:
# normalize data

In [12]:
# Fit the scaler on the training split only, then apply the same transform to
# every split and clip the standardized values to the range [-5, 5].
# NOTE: the feature arrays are overwritten, so re-running this cell rescales
# already-scaled data.
scaler = StandardScaler()
scaler.fit(train_features)

train_features, val_features, test_features = (
    np.clip(scaler.transform(split), -5, 5)
    for split in (train_features, val_features, test_features)
)

In [13]:
# Oversampling: synthesize additional samples with SMOTE on the training split.
from imblearn.over_sampling import SMOTE

# A fixed seed (the same value used for the train/val/test splits) makes the
# synthetic samples, and therefore everything trained on them, reproducible.
oversample = SMOTE(random_state=42)

X_train_oversampled, y_oversampled = oversample.fit_resample(train_features, train_labels)

In [32]:
# Aliases for the oversampled training set; these are the names the training code reads.
resampled_features, resampled_labels = X_train_oversampled, y_oversampled

## data handling


In [42]:
# Experiment configuration constants.
TRAIN_PERC = 0.9
BATCH_SIZE = 1024
NUM_TEACHERS = 15
TEACHER_EPOCHS = 20
STUDENT_EPOCHS = 20
EPOCHS = 100

# Stop once validation AUC has gone 10 epochs without improving (higher is better)
# and restore the weights from the best epoch.
# Alternative metric to monitor: 'val_prc'.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_auc",
    mode='max',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

In [43]:
def get_loaders(data, num_teachers, batch_size):
    """
    Partition ``data`` into one student subset and ``num_teachers`` teacher subsets.

    The data is cut into ``num_teachers + 1`` contiguous, non-overlapping chunks of
    ``len(data) // (num_teachers + 1)`` rows each. Chunk 0 goes to the student and
    chunks 1..num_teachers go to the teachers. Rows left over after the last full
    chunk are not assigned to anyone.

    :param data: Sliceable sequence (e.g. a NumPy array) supporting ``len`` and ``data[a:b]``
    :param num_teachers: Number of teacher models; must be at least 1
    :param batch_size: Unused. No batching happens here; the parameter is kept so
        existing call sites keep working

    :return: ``(teacher_subsets, student_subset)`` where ``teacher_subsets`` is a
        list of ``num_teachers`` slices of ``data`` and ``student_subset`` is one slice
    :raises ValueError: If ``num_teachers`` is less than 1
    """
    if num_teachers < 1:
        raise ValueError("num_teachers must be at least 1")

    sample_size = len(data) // (num_teachers + 1)

    # Slice with the half-open range [start, start + sample_size) so the last row
    # of every chunk is kept, and build num_teachers + 1 chunks so each teacher
    # actually receives one after the student's chunk is taken out.
    chunks = [
        data[i * sample_size:(i + 1) * sample_size]
        for i in range(num_teachers + 1)
    ]

    return chunks[1:], chunks[0]  # teachers, student

In [44]:
# Split the training features into the teacher subsets and the student subset.
# NOTE(review): only the feature matrix is partitioned here; train_labels is not
# split alongside it — confirm how labels are meant to be matched to each subset.
t_loaders, s_loader = get_loaders(
    data=train_features,
    num_teachers=NUM_TEACHERS,
    batch_size=BATCH_SIZE,
)
# val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, drop_last=True) # Loader to validate in Train Ensemble and Train Student Model

In [45]:
# Metrics tracked during training. The `name` values become the keys in the
# training history and are what callbacks monitor (prefixed with `val_` for
# validation data).
METRICS = [
    # Confusion-matrix counts.
    keras.metrics.TruePositives(name='tp'),
    keras.metrics.FalsePositives(name='fp'),
    keras.metrics.TrueNegatives(name='tn'),
    keras.metrics.FalseNegatives(name='fn'),
    # Threshold-based scores.
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    # Area under the ROC curve and under the precision-recall curve.
    keras.metrics.AUC(name='auc'),
    keras.metrics.AUC(name='prc', curve='PR'),
]

def make_model(metrics=METRICS, output_bias=None):
    """
    Build and compile a small binary classifier.

    Architecture: Dense(16, relu) -> Dropout(0.5) -> Dense(1, sigmoid). The input
    width is read from the notebook-level ``train_features`` array. The model is
    compiled with Adam (learning rate 1e-3) and binary cross-entropy.

    :param metrics: Keras metrics passed to ``compile``; defaults to ``METRICS``
    :param output_bias: Optional constant used to initialise the output layer's
        bias. When omitted, the Dense layer's own default bias initializer is used

    :return: The compiled ``keras.Sequential`` model
    """
    # Only pass bias_initializer when a value was requested. Passing
    # bias_initializer=None explicitly overrides Dense's default ("zeros") and
    # leaves the choice to Keras' generic weight fallback instead.
    output_kwargs = {}
    if output_bias is not None:
        output_kwargs['bias_initializer'] = tf.keras.initializers.Constant(output_bias)

    # Original author's note on this architecture: "underfitting".
    n_features = train_features.shape[-1]
    model = keras.Sequential([
        keras.layers.Dense(16, activation='relu', input_shape=(n_features,)),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid', **output_kwargs),
    ])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)

    return model

In [46]:
# Build one freshly initialised model per teacher in the ensemble.
# Each model is compiled with its own optimizer inside make_model, so no
# separate optimizer list is needed. Former PyTorch equivalent, for reference:
# opts = [torch.optim.Adam(model.parameters(), lr=.001,  betas=(0.9, 0.999)) for model in models]
models = [make_model() for _ in range(NUM_TEACHERS)]


In [47]:
def train_model(resampled_model):
    """
    Recompile ``resampled_model`` and fit it on the oversampled training set.

    Reads the notebook-level names ``resampled_features``, ``resampled_labels``,
    ``val_features``, ``val_labels``, ``EPOCHS``, ``early_stopping`` and ``METRICS``.

    :param resampled_model: Keras model whose last layer has a single-element bias
        (the bias is overwritten with ``[0]`` before training)

    :return: The history object returned by ``fit``, so callers can inspect or
        plot the training curves (it was previously discarded)
    """
    resampled_model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=METRICS)

    # Reset the bias to zero, since this dataset is balanced.
    output_layer = resampled_model.layers[-1]
    output_layer.bias.assign([0])

    # steps_per_epoch is deliberately left unset (an earlier experiment used 20).
    resampled_history = resampled_model.fit(
        resampled_features, resampled_labels,
        epochs=EPOCHS,
        callbacks=[early_stopping],
        validation_data=(val_features, val_labels))

    return resampled_history

In [None]:
# models[0].compile(
#         optimizer=keras.optimizers.Adam(learning_rate=1e-3),
#         loss=keras.losses.BinaryCrossentropy(),
#         metrics=METRICS)
# output_layer = models[0].layers[-1] 
# output_layer.bias.assign([0])
# # resampled_steps_per_epoch = 20

# resampled_history = models[0].fit(
#     resampled_features, resampled_labels,
#     epochs=EPOCHS,
# #     steps_per_epoch=resampled_steps_per_epoch,
#     callbacks=[early_stopping],
#     validation_data=(val_features, val_labels))