In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

from ML.credit_risk.pipeline.data_operations import *
from ML.credit_risk.pipeline.read_data import read_data_train, read_data_test
from ML.credit_risk.pipeline import make_prepare_pipeline, make_train_pipeline, get_predict_pipeline

# Single seed shared by the train/test split and the cross-validation calls below.
random_seed = 42

In [3]:
# Raw training data. The pipeline cell below works on a deep copy of this
# object rather than on the object itself.
data_original = read_data_train()

In [4]:
# Build the training pipeline and fit/transform a deep copy of the raw data,
# so the pipeline never receives data_original itself.
train_pipeline = make_train_pipeline()
data_train = train_pipeline.fit_transform(data_original.copy(deep=True))

In [12]:
from sklearn.linear_model import LogisticRegressionCV
# Candidate model 2: LogisticRegressionCV with every hyperparameter left at the library default.
credit_default_model2 = LogisticRegressionCV()

In [13]:
from sklearn.naive_bayes import ComplementNB
# Candidate model 3: Complement Naive Bayes with every hyperparameter left at the library default.
credit_default_model3 = ComplementNB()

In [5]:
from sklearn.ensemble import RandomForestClassifier
# Candidate model 4: a shallow random forest (100 trees, depth capped at 5).
# random_state is pinned to the notebook-wide seed so that re-running the
# notebook rebuilds the same forest instead of a different one each time.
credit_default_model4 = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=random_seed)

In [10]:
# Label column name(s), kept as a list because the split / x-y helper calls
# below receive the whole list.
y_columns = ["TARGET"]

def train_model(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training split and report its F1 score on the test split.

    Args:
        model: Estimator exposing ``fit(x, y)``; it is fitted in place.
        x_train: Training features.
        x_test: Test features, forwarded to ``show_f1_score``.
        y_train: Training target. A 2-D target with exactly one column is
            flattened to 1-D before fitting; any other shape is passed as-is.
        y_test: Test target, forwarded unchanged to ``show_f1_score``.

    Returns:
        None. The evaluation is reported by ``show_f1_score``.
    """
    # The target is presumably selected with a list of column names, which
    # yields a one-column 2-D frame. scikit-learn estimators expect shape
    # (n_samples,) and warn (DataConversionWarning) on every fit otherwise.
    if np.ndim(y_train) == 2 and np.shape(y_train)[1] == 1:
        y_train = np.asarray(y_train).ravel()
    model.fit(x_train, y_train)
    show_f1_score(model, x_test, y_test)

def keras_train_model(model, x_train, x_test, y_train, y_test, batch_size=500, epochs=10, validation_split=0.2):
    """Fit a Keras-style model, plot its loss history and report the test F1 score.

    Args:
        model: Object whose ``fit`` accepts ``batch_size``, ``epochs`` and
            ``validation_split`` keywords and returns a history object.
        x_train: Training features.
        x_test: Test features, forwarded to ``show_f1_score``.
        y_train: Training target.
        y_test: Test target, forwarded to ``show_f1_score``.
        batch_size: Forwarded to ``model.fit``.
        epochs: Forwarded to ``model.fit``.
        validation_split: Forwarded to ``model.fit``. Defaults to 0.2, the
            value that used to be hard-coded here.

    Returns:
        None. The history is plotted and the score is reported by the helpers.
    """
    history = model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=validation_split,
    )
    show_keras_history(history)
    show_f1_score(model, x_test, y_test)

def show_keras_history(history):
    """Plot the per-epoch loss curves stored on a Keras-style history object.

    Args:
        history: Object with a ``history`` mapping. ``history.history['loss']``
            is required; ``history.history['val_loss']`` is plotted only when
            present, so a run without validation data no longer raises
            ``KeyError``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    curves = history.history
    # Label each curve so training and validation loss can be told apart.
    plt.plot(curves['loss'], label='training loss')
    if 'val_loss' in curves:
        plt.plot(curves['val_loss'], label='validation loss')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
    plt.show()

In [15]:
# Baseline for model 2: fit on a fresh, seeded split of data_train with no upsampling applied.
train_model(credit_default_model2, *split_train_data(data_train, y_columns, random_seed))

  return f(*args, **kwargs)
true positive: 0
true negative: 56538
false positive: 0
false negative: 4965
f1 score: 0


In [16]:
# Baseline for model 3: same seeded split of data_train, no upsampling applied.
train_model(credit_default_model3, *split_train_data(data_train, y_columns, random_seed))

true positive: 2669
true negative: 26261
false positive: 30277
false negative: 2296
f1 score: 0.14080346073698924
  return f(*args, **kwargs)


In [17]:
# Baseline for model 4: same seeded split of data_train, no upsampling applied.
train_model(credit_default_model4, *split_train_data(data_train, y_columns, random_seed))

  model.fit(x_train, y_train)
true positive: 0
true negative: 56538
false positive: 0
false negative: 4965
f1 score: 0


In [11]:
from ML.credit_risk.pipeline import UpsampleByTarget

# Split first; only the training part is handed to the upsampler, while
# x_ts / y_ts are used for evaluation exactly as returned by the split.
x_tr, x_ts, y_tr, y_ts = split_train_data(data_train, y_columns, random_seed)
# Re-join training features and target column-wise before handing the frame to
# UpsampleByTarget (presumably rebalances rows by the target column — confirm
# in ML.credit_risk.pipeline).
data_upsampled = UpsampleByTarget(y_columns[0]).transform(pd.concat([x_tr, y_tr], axis="columns"))

# Separate the upsampled frame back into features and target.
x_tr_upsampled, y_tr_upsampled = get_data_xy(data_upsampled, y_columns)

In [19]:
# Refit model 2 on the upsampled training data; evaluation still uses the test split x_ts / y_ts.
train_model(credit_default_model2, x_tr_upsampled, x_ts, y_tr_upsampled, y_ts)

  return f(*args, **kwargs)
true positive: 415
true negative: 51202
false positive: 5336
false negative: 4550
f1 score: 0.07745427398282942


In [20]:
# Refit model 3 on the upsampled training data; evaluation still uses the test split x_ts / y_ts.
train_model(credit_default_model3, x_tr_upsampled, x_ts, y_tr_upsampled, y_ts)

true positive: 2659
true negative: 26365
false positive: 30173
false negative: 2306
f1 score: 0.14069899727491603
  return f(*args, **kwargs)


In [12]:
# Refit model 4 on the upsampled training data; evaluation still uses the test split x_ts / y_ts.
train_model(credit_default_model4, x_tr_upsampled, x_ts, y_tr_upsampled, y_ts)

  model.fit(x_train, y_train)
true positive: 3403
true negative: 36913
false positive: 19625
false negative: 1562
f1 score: 0.24313221162433465


In [22]:
# Score model 4 on the whole prepared frame.
# NOTE(review): data_train is also the source of the training split the model
# was fitted on, so this figure is not a held-out estimate.
show_f1_score(credit_default_model4, *get_data_xy(data_train, y_columns))

true positive: 15706
true negative: 160328
false positive: 122358
false negative: 9119
f1 score: 0.19284297896113306


In [23]:
# Cross-validate model 2 on the upsampled training data with the shared seed.
# NOTE(review): if upsampling duplicates rows, copies of the same row can land
# in both the fitting and the validation folds — confirm how cross_validate splits.
cross_validate(credit_default_model2, x_tr_upsampled, y_tr_upsampled, random_state=random_seed)

(0.5145010175607085,
 array([0.51912463, 0.51861958, 0.51664599, 0.51415799, 0.51248287,
        0.51147771, 0.50977293, 0.5111657 , 0.51319222, 0.51837056]))

In [24]:
# Cross-validate model 3 on the upsampled training data with the shared seed.
# NOTE(review): if upsampling duplicates rows, copies of the same row can land
# in both the fitting and the validation folds — confirm how cross_validate splits.
cross_validate(credit_default_model3, x_tr_upsampled, y_tr_upsampled, random_state=random_seed)

(0.5059378477229585,
 array([0.50931625, 0.50850807, 0.50911728, 0.50820081, 0.500283  ,
        0.50281677, 0.50140446, 0.50527104, 0.50532198, 0.50913883]))

In [9]:
# Cross-validate model 4 on the upsampled training data with the shared seed.
# NOTE(review): if upsampling duplicates rows, copies of the same row can land
# in both the fitting and the validation folds — confirm how cross_validate splits.
cross_validate(credit_default_model4, x_tr_upsampled, y_tr_upsampled, random_state=random_seed)

In [13]:
import pickle
from pathlib import Path

# Destination for the serialized random forest. The path is relative, so it
# resolves against the kernel's current working directory.
model_path = Path("ML/credit_risk/random_forest.model")

In [15]:
# Persist the fitted random forest. The context manager guarantees the file
# handle is flushed and closed even if pickling raises; the bare
# model_path.open("wb") used before was never closed.
with model_path.open("wb") as model_file:
    pickle.dump(credit_default_model4, model_file)