In [None]:
import os
import sys
import mlflow
import warnings
import time
import json
import shap
import gc
import subprocess
import tensorflow

import pandas as pd
import seaborn as sns
import numpy as np

from hyperopt import fmin, tpe, hp, Trials, space_eval
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

sys.path.append("../")

from models.scorer import home_credit_scoring_fn, home_credit_scorer, home_credit_fn_keras

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Apply the matplotlib style sheet first: a style sheet can define its own colour cycle,
# so the palette has to be installed afterwards to take effect.
plt.style.use('Solarize_Light2')

# sns.color_palette() only *returns* a palette (the original call discarded it, so it
# had no effect); sns.set_palette() is the call that installs it as the default cycle.
sns.set_palette('colorblind')

# Setting default DPI, pulling it from dotenv if it exists, setting it on 100 if not

def parse_dpi(raw_value, default=100):
    """
    Convert a raw DPI setting into an integer.

    Args:
    - raw_value: value read from the environment (a string), or None when unset.
    - default: DPI to use when raw_value is missing or is not a valid integer.

    Returns:
    - int(raw_value) when the conversion succeeds, `default` otherwise.
    """
    if raw_value is None:
        return default
    try:
        return int(raw_value)
    except (TypeError, ValueError):
        # A malformed setting (e.g. "high" or an empty string) should not abort the
        # notebook: report it and fall back to the default instead.
        warnings.warn(f"Ignoring invalid DPI value {raw_value!r}; falling back to {default}.")
        return default


pc_dpi = parse_dpi(os.getenv('DPI'))

# Point MLflow at the tracking URI named by the MLFLOW_TRACKING_URI environment variable.
# os.getenv() returns None when the variable is unset, and that None is passed through as-is.
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI"))


In [None]:
# set_experiment() creates the experiment when it does not exist yet and, in both cases,
# makes it the active experiment. The previous create/except pattern left the freshly
# created experiment inactive on the very first run and treated every MlflowException
# (including unrelated ones) as "experiment already exists".
mlflow.set_experiment(experiment_name="home_credit_model")


In [None]:
# Load the modelling table from a pickle stored in the project's data folder.
# NOTE(review): unpickling can execute arbitrary code — only load files you produced or trust.
df_model = pd.read_pickle(filepath_or_buffer="../data/df_hc_nm_imputed.pkl")


# Light preprocessing

In [None]:
# Binary columns are redefined here as columns that only use 0 and 1, optionally
# together with a sentinel value (-1 by default) marking "missing / not applicable".

def detect_binary_cols_with_sentinel(dataframe: pd.DataFrame, sentinel: int = -1) -> list:
    """
    Detects binary columns in a pandas dataframe, tolerating a sentinel value.

    A column qualifies when, ignoring NaN and the sentinel, its distinct values
    are exactly 0 and 1. The sentinel may be present but is not required, so
    plain 0/1 columns are detected as well.

    Args:
    - dataframe: pandas dataframe to inspect.
    - sentinel: extra value allowed next to 0 and 1 (default -1).

    Returns:
    - list of binary column names, in the dataframe's column order.
    """

    binary_values = {0, 1}
    binary_cols = []

    for col in dataframe.columns:
        unique_vals = dataframe[col].dropna().unique()

        # Cheap guard: 0, 1 and the sentinel make at most three distinct values, so
        # high-cardinality columns are skipped without building a set.
        if len(unique_vals) > len(binary_values) + 1:
            continue

        if set(unique_vals) - {sentinel} == binary_values:
            binary_cols.append(col)

    return binary_cols


## Identifying the binary columns (values 0 / 1, with -1 as a sentinel): these are not scaled

In [None]:
# Collect the names of the binary flag columns so they can be left out of the scaling step.
binary_cols = detect_binary_cols_with_sentinel(dataframe=df_model)


In [None]:
# Every column that is not a binary flag gets standardised. The label column is excluded
# explicitly so it can never be scaled, even if binary detection were to miss it
# (it is used as the classification target and for stratification further down).
numeric_to_scale = [
    col for col in df_model.columns
    if col != "TARGET" and col not in binary_cols
]


## Scaling the numeric columns for Keras

In [None]:
# NOTE(review): this instance is never used — the next cell creates a fresh StandardScaler
# and rebinds `scaler` before fitting, so this cell could be removed.
scaler = StandardScaler()


In [None]:
# Standardise the non-binary columns and write the result back into df_model in place.
# NOTE(review): the scaler is fitted on the full frame *before* the train/validation/test
# split further down, so its mean/std include validation and test rows (data leakage).
# Fitting on the training rows only would require moving the split ahead of this cell.
scaler = StandardScaler().fit(df_model[numeric_to_scale])
df_model[numeric_to_scale] = scaler.transform(df_model[numeric_to_scale])


In [None]:
# Quick look at the first rows after scaling.
df_model.head()


In [None]:
# Step 1: hold out 30% of the rows as the test set, stratified on TARGET so the
# class ratio is preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df_model.drop(columns="TARGET"),
    df_model["TARGET"],
    stratify=df_model["TARGET"],
    test_size=0.3,
    random_state=123,
)

# Step 2: carve the validation set out of the remaining 70% (25% of it), again
# stratified; X_train / y_train are rebound to the reduced training part.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    test_size=0.25,
    random_state=123,
)


# DNN : 

- Input layer sized to the number of features
- Dense layer, 512 units (ReLU)
- Dropout, 30%
- Dense layer, 256 units (ReLU)
- Dropout, 20%
- Single sigmoid output unit for binary classification


## Create the DNN

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers, each followed by dropout,
# and a single sigmoid unit as output.
n_features = X_train.shape[1]
keras_layers = tensorflow.keras.layers

model = tensorflow.keras.Sequential([
    keras_layers.Dense(512, activation="relu", input_shape=(n_features,)),
    keras_layers.Dropout(0.3),
    keras_layers.Dense(256, activation="relu"),
    keras_layers.Dropout(0.2),
    keras_layers.Dense(1, activation="sigmoid"),
])

# Adam optimiser with binary cross-entropy loss; recall is tracked as the metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=[tensorflow.keras.metrics.Recall()],
)


In [None]:
# Print the layer-by-layer overview of the compiled network.
model.summary()


## Run the DNN

In [None]:
# Train for 50 epochs in mini-batches of 64, evaluating on the validation split after
# each epoch; the returned object is kept in `history`.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=64,
    epochs=50,
)
