In [61]:
import numpy as np
import pandas as pd

# Data splitting modules
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Model building modules
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier, Pool, cv

# Hyperparameter tuning modules
from sklearn.model_selection import GridSearchCV

# Model evaluation modules
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Data sampling modules
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

# Model saving module
import pickle

In [2]:
# Setting random num
RANDOM_NUM = 42

In [3]:
# Loading train and test data
train = pd.read_csv('data/processed/processed_train_data.csv')
test = pd.read_csv('data/processed/processed_test_data.csv')

X_train = train.drop(['is_fraud'], axis=1)
y_train = train['is_fraud']

X_test = test.drop(['is_fraud'], axis=1)
y_test = test['is_fraud']

## 1 Logistic Regression

### 1.1 Baseline model - Imbalanced data

In [4]:
# Baseline model
model_name = 'Baseline - LogisticRegression / Imbalanced data'
lr = LogisticRegression(random_state=RANDOM_NUM, max_iter=1000)

lr_model = lr.fit(X_train, y_train)
y_train_pred = lr_model.predict(X_train)
y_test_pred = lr_model.predict(X_test)

In [5]:
# Getting model evalution statistics
model_evalution_df = pd.DataFrame(columns=['Model Name', 'Training Score', 'Testing Score', 'Accuracy', 'F1 Score', 'Precision', 'Recall'])

def add_model_evalution_stat(model_name, model, X_train, X_test, y_train, y_test, y_train_pred, y_test_pred, model_evalution_df):
    """Function for adding model evalution statistics to DataFrame

    Args:
        model_name (str): Model name
        model (object): Object of fitted model
        X_train (DataFrame): Train dataset
        X_test (DataFrame): Test dataset
        y_train (DataFrame): Train labels
        y_test (DataFrame): Test labels
        y_train_pred (DataFrame): Predicted train labels
        y_test_pred (DataFrame): Predicted test labels
        model_evalution_df (DataFrame): DataFrame with evalution statistics

    Returns:
        DataFrame: DataFrame with added statistics
    """
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    accuracy = metrics.accuracy_score(y_test, y_test_pred)
    f1_score = metrics.f1_score(y_test, y_test_pred, average='weighted')
    precision = metrics.precision_score(y_test, y_test_pred)
    recall = metrics.recall_score(y_test, y_test_pred)
    model_evalutions_stats = [model_name, train_score, test_score, accuracy, f1_score, precision, recall]
    model_evalution_dict = {model_evalution_df.columns[i]:model_evalutions_stats[i] for i in range(len(model_evalutions_stats))}
    model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)
    return model_evalution_df

In [6]:
# Adding baseline model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, lr_model, X_train, X_test, y_train, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [7]:
# Display model evalution statistics data frame
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566


### 1.2 Random Under Sampling data

In [8]:
# Random Under Sampling data implimentation
random_under_sampler = RandomUnderSampler(random_state=RANDOM_NUM)
X_rus, y_rus = random_under_sampler.fit_resample(X_train, y_train)

y_rus.value_counts()

0.0    7506
1.0    7506
Name: is_fraud, dtype: int64

In [9]:
# Train-Test splitting
X_train_u_sampled, X_test_u_sampled, y_train_u_sampled, y_test_u_sampled = train_test_split(X_rus, y_rus, test_size=0.25, random_state=RANDOM_NUM, stratify=y_rus)
y_train_u_sampled.value_counts()

1.0    5630
0.0    5629
Name: is_fraud, dtype: int64

In [10]:
# Model with random under sampled data
model_name = 'LogisticRegression / Random Under Sampled data'
lr = LogisticRegression(random_state=RANDOM_NUM, max_iter=1000)

lr_model_u_sampled = lr.fit(X_train_u_sampled, y_train_u_sampled)
y_train_pred = lr_model_u_sampled.predict(X_train_u_sampled)
y_test_pred = lr_model_u_sampled.predict(X_test)

In [11]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, lr_model_u_sampled, X_train_u_sampled, X_test, y_train_u_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [12]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063


### 1.3 Random Over Sampling data

In [13]:
random_over_sampler = RandomOverSampler(random_state=RANDOM_NUM)
X_ros, y_ros = random_over_sampler.fit_resample(X_train, y_train)

y_ros.value_counts()

0.0    1287873
1.0    1287873
Name: is_fraud, dtype: int64

In [14]:
# Train-Test splitting
X_train_o_sampled, X_test_o_sampled, y_train_o_sampled, y_test_o_sampled = train_test_split(X_ros, y_ros, test_size=0.25, random_state=RANDOM_NUM, stratify=y_ros)
y_train_o_sampled.value_counts()

1.0    965905
0.0    965904
Name: is_fraud, dtype: int64

In [15]:
# Model with random over sampled data
model_name = 'LogisticRegression / Random Over Sampled data'
lr = LogisticRegression(random_state=RANDOM_NUM, max_iter=1000)

lr_model_o_sampled = lr.fit(X_train_o_sampled, y_train_o_sampled)
y_train_pred = lr_model_o_sampled.predict(X_train_o_sampled)
y_test_pred = lr_model_o_sampled.predict(X_test)

In [16]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, lr_model_o_sampled, X_train_o_sampled, X_test, y_train_o_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [17]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259


### 1.4 SMOTE Over Sampling data

In [18]:
smote_sampler = SMOTE(random_state=RANDOM_NUM)
X_sm, y_sm = smote_sampler.fit_resample(X_train.astype('float'), y_train)

y_sm.value_counts()

0.0    1287873
1.0    1287873
Name: is_fraud, dtype: int64

In [19]:
X_train_s_sampled, X_test_s_sampled, y_train_s_sampled, y_test_s_sampled = train_test_split(X_sm, y_sm, test_size=0.25, random_state=RANDOM_NUM, stratify=y_sm)
y_train_s_sampled.value_counts()

1.0    965905
0.0    965904
Name: is_fraud, dtype: int64

In [20]:
# Model with SMOTE over sampled data
model_name = 'LogisticRegression / SMOTE Over Sampled data'
lr = LogisticRegression(random_state=RANDOM_NUM, max_iter=1000)

lr_model_s_sampled = lr.fit(X_train_s_sampled, y_train_s_sampled)
y_train_pred = lr_model_s_sampled.predict(X_train_s_sampled)
y_test_pred = lr_model_s_sampled.predict(X_test)

In [21]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, lr_model_s_sampled, X_train_s_sampled, X_test, y_train_s_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [22]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986


### 1.5 Random Under Sampling data with optimized parameters

In [23]:
# Preparing Grid Search 
param_grid = [
              {'penalty': ['l2', 'none'], 
              'solver': ['lbfgs', 'sag'],
               'C': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1]},
              
              {'penalty': ['l1', 'l2'] ,
              'solver': ['liblinear', 'saga'],
               'C': [0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 1]}
]

grid_search_lr = GridSearchCV(
    estimator=LogisticRegression(random_state=RANDOM_NUM, max_iter=1000), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs=-1,
    scoring='recall'
)

model_name = 'Tuned LogisticRegression / Random Under Sampled data'
best_params_lr = grid_search_lr.fit(X_train_u_sampled, y_train_u_sampled)

y_train_pred = best_params_lr.best_estimator_.predict(X_train_u_sampled)
y_test_pred = best_params_lr.best_estimator_.predict(X_test)

In [24]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, best_params_lr.best_estimator_, X_train_u_sampled, X_test, y_train_u_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [25]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986
4,Tuned LogisticRegression / Random Under Sample...,0.854428,0.954815,0.954815,0.973484,0.061251,0.747319


## 2 Decision Tree

### 2.1 Decision Tree - Imbalanced data

In [26]:
# Decision Tree model with imbalanced data
model_name = 'DecisionTreeClassifier / Imbalanced data'
dt = DecisionTreeClassifier(
    max_depth=10,
    random_state=RANDOM_NUM
)

dt_model = dt.fit(X_train, y_train)
y_train_pred = dt_model.predict(X_train)
y_test_pred = dt_model.predict(X_test)

In [27]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, dt_model, X_train, X_test, y_train, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [28]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986
4,Tuned LogisticRegression / Random Under Sample...,0.854428,0.954815,0.954815,0.973484,0.061251,0.747319
5,DecisionTreeClassifier / Imbalanced data,0.998505,0.998544,0.998544,0.998497,0.854942,0.750117


### 2.2 Decision Tree - Random Under Sampling data

In [29]:
# Decision Tree model with Random Under Sampled data
model_name = 'DecisionTreeClassifier / Random Under Sampled data'
dt = DecisionTreeClassifier(
    max_depth=10,
    random_state=RANDOM_NUM
)

dt_model_u_sampled = dt.fit(X_train_u_sampled, y_train_u_sampled)
y_train_pred = dt_model_u_sampled.predict(X_train_u_sampled)
y_test_pred = dt_model_u_sampled.predict(X_test)

In [30]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, dt_model_u_sampled, X_train_u_sampled, X_test, y_train_u_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [31]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986
4,Tuned LogisticRegression / Random Under Sample...,0.854428,0.954815,0.954815,0.973484,0.061251,0.747319
5,DecisionTreeClassifier / Imbalanced data,0.998505,0.998544,0.998544,0.998497,0.854942,0.750117
6,DecisionTreeClassifier / Random Under Sampled ...,0.981259,0.956838,0.956838,0.974657,0.080449,0.976224


### 2.3 Decision Tree - Random Over Sampling data

In [32]:
# Decision Tree model with Random Over Sampled data
model_name = 'DecisionTreeClassifier / Random Over Sampled data'
dt = DecisionTreeClassifier(
    max_depth=10,
    random_state=RANDOM_NUM
)

dt_model_o_sampled = dt.fit(X_train_o_sampled, y_train_o_sampled)
y_train_pred = dt_model_o_sampled.predict(X_train_o_sampled)
y_test_pred = dt_model_o_sampled.predict(X_test)

In [33]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, dt_model_o_sampled, X_train_o_sampled, X_test, y_train_o_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [34]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986
4,Tuned LogisticRegression / Random Under Sample...,0.854428,0.954815,0.954815,0.973484,0.061251,0.747319
5,DecisionTreeClassifier / Imbalanced data,0.998505,0.998544,0.998544,0.998497,0.854942,0.750117
6,DecisionTreeClassifier / Random Under Sampled ...,0.981259,0.956838,0.956838,0.974657,0.080449,0.976224
7,DecisionTreeClassifier / Random Over Sampled data,0.977335,0.964095,0.964095,0.978521,0.093907,0.959907


### 2.3 Decision Tree - SMOTE Over Sampling data

In [35]:
# Decision Tree model with SMOTE Over Sampled data
model_name = 'DecisionTreeClassifier / SMOTE Over Sampled data'
dt = DecisionTreeClassifier(
    max_depth=10,
    random_state=RANDOM_NUM
)

dt_model_s_sampled = dt.fit(X_train_s_sampled, y_train_s_sampled)
y_train_pred = dt_model_s_sampled.predict(X_train_s_sampled)
y_test_pred = dt_model_s_sampled.predict(X_test)

In [36]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, dt_model_s_sampled, X_train_s_sampled, X_test, y_train_s_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [37]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986
4,Tuned LogisticRegression / Random Under Sample...,0.854428,0.954815,0.954815,0.973484,0.061251,0.747319
5,DecisionTreeClassifier / Imbalanced data,0.998505,0.998544,0.998544,0.998497,0.854942,0.750117
6,DecisionTreeClassifier / Random Under Sampled ...,0.981259,0.956838,0.956838,0.974657,0.080449,0.976224
7,DecisionTreeClassifier / Random Over Sampled data,0.977335,0.964095,0.964095,0.978521,0.093907,0.959907
8,DecisionTreeClassifier / SMOTE Over Sampled data,0.97509,0.96506,0.96506,0.979032,0.095504,0.950583


### 2.5 Decision Tree - Imbala Random Under sampled data - Optimized Hyperparameters

In [38]:
# Max depth of tree
max_depth = list(np.linspace(start=4, stop=20, num=5, dtype=int))
# Number of samples to split a node
min_samples_split = list(np.linspace(start=2, stop=12, num=6, dtype=int))
# Number of samplet at leaf node
min_samples_leaf = list(np.linspace(start=2, stop=12, num=6, dtype=int))
# Type of criterion
criterion = ['gini', 'entropy']


In [39]:
# Preparing Grid Search
param_grid = {
    'max_depth':max_depth,
    'min_samples_leaf':min_samples_leaf,
    'min_samples_split':min_samples_split,
    'criterion':criterion
}

grid_search_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=RANDOM_NUM), 
    param_grid=param_grid, 
    cv=5, 
    n_jobs=-1,
    scoring='recall'
)

model_name = 'Tuned DecisionTreeClassifier / Random Under sampled data'
best_params_dt = grid_search_tree.fit(X_train_u_sampled, y_train_u_sampled)

y_train_pred = best_params_dt.best_estimator_.predict(X_train_u_sampled)
y_test_pred = best_params_dt.best_estimator_.predict(X_test)

In [40]:
best_params_dt.best_estimator_

In [41]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, best_params_dt.best_estimator_, X_train_u_sampled, X_test, y_train_u_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [42]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986
4,Tuned LogisticRegression / Random Under Sample...,0.854428,0.954815,0.954815,0.973484,0.061251,0.747319
5,DecisionTreeClassifier / Imbalanced data,0.998505,0.998544,0.998544,0.998497,0.854942,0.750117
6,DecisionTreeClassifier / Random Under Sampled ...,0.981259,0.956838,0.956838,0.974657,0.080449,0.976224
7,DecisionTreeClassifier / Random Over Sampled data,0.977335,0.964095,0.964095,0.978521,0.093907,0.959907
8,DecisionTreeClassifier / SMOTE Over Sampled data,0.97509,0.96506,0.96506,0.979032,0.095504,0.950583
9,Tuned DecisionTreeClassifier / Random Under sa...,0.983391,0.966947,0.966947,0.980048,0.101694,0.965501


## 3 Random Forest

### 3.1 Random Forest - Imbalanced data

In [43]:
# Random Forest model with imbalanced data
model_name = 'RandomForestClassifier / Imbalanced data'
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=RANDOM_NUM
)

rf_model = rf.fit(X_train, y_train)
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

In [44]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, rf_model, X_train, X_test, y_train, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [45]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986
4,Tuned LogisticRegression / Random Under Sample...,0.854428,0.954815,0.954815,0.973484,0.061251,0.747319
5,DecisionTreeClassifier / Imbalanced data,0.998505,0.998544,0.998544,0.998497,0.854942,0.750117
6,DecisionTreeClassifier / Random Under Sampled ...,0.981259,0.956838,0.956838,0.974657,0.080449,0.976224
7,DecisionTreeClassifier / Random Over Sampled data,0.977335,0.964095,0.964095,0.978521,0.093907,0.959907
8,DecisionTreeClassifier / SMOTE Over Sampled data,0.97509,0.96506,0.96506,0.979032,0.095504,0.950583
9,Tuned DecisionTreeClassifier / Random Under sa...,0.983391,0.966947,0.966947,0.980048,0.101694,0.965501


### 3.2 Random Forest - Random Under Sampling data

In [46]:
# Random Forest model with Random Under Sampling data
model_name = 'RandomForestClassifier / Random Under Sampling data'
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=RANDOM_NUM
)

rf_u_model = rf.fit(X_train_u_sampled, y_train_u_sampled)
y_train_pred = rf_u_model.predict(X_train_u_sampled)
y_test_pred = rf_u_model.predict(X_test)

In [47]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, rf_u_model, X_train_u_sampled, X_test, y_train_u_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [48]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986
4,Tuned LogisticRegression / Random Under Sample...,0.854428,0.954815,0.954815,0.973484,0.061251,0.747319
5,DecisionTreeClassifier / Imbalanced data,0.998505,0.998544,0.998544,0.998497,0.854942,0.750117
6,DecisionTreeClassifier / Random Under Sampled ...,0.981259,0.956838,0.956838,0.974657,0.080449,0.976224
7,DecisionTreeClassifier / Random Over Sampled data,0.977335,0.964095,0.964095,0.978521,0.093907,0.959907
8,DecisionTreeClassifier / SMOTE Over Sampled data,0.97509,0.96506,0.96506,0.979032,0.095504,0.950583
9,Tuned DecisionTreeClassifier / Random Under sa...,0.983391,0.966947,0.966947,0.980048,0.101694,0.965501


### 3.3 Random Forest - Random Over Sampling data

In [49]:
# Random Forest model with Random Over Sampling data
model_name = 'RandomForestClassifier / Random Over Sampling data'
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=RANDOM_NUM
)

rf_o_model = rf.fit(X_train_o_sampled, y_train_o_sampled)
y_train_pred = rf_o_model.predict(X_train_o_sampled)
y_test_pred = rf_o_model.predict(X_test)

In [50]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, rf_o_model, X_train_o_sampled, X_test, y_train_o_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [51]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986
4,Tuned LogisticRegression / Random Under Sample...,0.854428,0.954815,0.954815,0.973484,0.061251,0.747319
5,DecisionTreeClassifier / Imbalanced data,0.998505,0.998544,0.998544,0.998497,0.854942,0.750117
6,DecisionTreeClassifier / Random Under Sampled ...,0.981259,0.956838,0.956838,0.974657,0.080449,0.976224
7,DecisionTreeClassifier / Random Over Sampled data,0.977335,0.964095,0.964095,0.978521,0.093907,0.959907
8,DecisionTreeClassifier / SMOTE Over Sampled data,0.97509,0.96506,0.96506,0.979032,0.095504,0.950583
9,Tuned DecisionTreeClassifier / Random Under sa...,0.983391,0.966947,0.966947,0.980048,0.101694,0.965501


### 3.4 Random Forest - SMOTE Over Sampling data

In [52]:
# Random Forest model with Random Over Sampling data
model_name = 'RandomForestClassifier / SMOTE Over Sampling data'
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=RANDOM_NUM
)

rf_s_model = rf.fit(X_train_s_sampled, y_train_s_sampled)
y_train_pred = rf_s_model.predict(X_train_s_sampled)
y_test_pred = rf_s_model.predict(X_test)

In [53]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, rf_s_model, X_train_s_sampled, X_test, y_train_s_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [54]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986
4,Tuned LogisticRegression / Random Under Sample...,0.854428,0.954815,0.954815,0.973484,0.061251,0.747319
5,DecisionTreeClassifier / Imbalanced data,0.998505,0.998544,0.998544,0.998497,0.854942,0.750117
6,DecisionTreeClassifier / Random Under Sampled ...,0.981259,0.956838,0.956838,0.974657,0.080449,0.976224
7,DecisionTreeClassifier / Random Over Sampled data,0.977335,0.964095,0.964095,0.978521,0.093907,0.959907
8,DecisionTreeClassifier / SMOTE Over Sampled data,0.97509,0.96506,0.96506,0.979032,0.095504,0.950583
9,Tuned DecisionTreeClassifier / Random Under sa...,0.983391,0.966947,0.966947,0.980048,0.101694,0.965501


### 3.5 Random Forest - Random Under Sampling data - Tuned hyperparameters

In [55]:
# Number of trees in Random Forest
n_estimators = list(np.linspace(start=100, stop=300, num=5, dtype=int))
# Max depth of trees in Random Forest
max_depth = list(np.linspace(start=4, stop=20, num=5, dtype=int))
# Number of samples to split a node
min_samples_split = list(np.linspace(start=2, stop=12, num=6, dtype=int))
# Number of samplet at leaf node
min_samples_leaf = list(np.linspace(start=2, stop=12, num=6, dtype=int))
# Type of criterion
criterion = ['gini', 'entropy']

param_grid = {
    'n_estimators':n_estimators,
    'max_depth':max_depth,
    'min_samples_leaf':min_samples_leaf,
    'min_samples_split':min_samples_split,
    'criterion':criterion
}

In [56]:
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=RANDOM_NUM),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='recall'
)


model_name = 'Tuned RandomForestClassifier / Random Under Sampled data'
best_params_rf = grid_search_rf.fit(X_train_u_sampled, y_train_u_sampled)

y_train_pred = best_params_rf.best_estimator_.predict(X_train_u_sampled)
y_test_pred = best_params_rf.best_estimator_.predict(X_test)


In [57]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, best_params_rf.best_estimator_, X_train_u_sampled, X_test, y_train_u_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [58]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986
4,Tuned LogisticRegression / Random Under Sample...,0.854428,0.954815,0.954815,0.973484,0.061251,0.747319
5,DecisionTreeClassifier / Imbalanced data,0.998505,0.998544,0.998544,0.998497,0.854942,0.750117
6,DecisionTreeClassifier / Random Under Sampled ...,0.981259,0.956838,0.956838,0.974657,0.080449,0.976224
7,DecisionTreeClassifier / Random Over Sampled data,0.977335,0.964095,0.964095,0.978521,0.093907,0.959907
8,DecisionTreeClassifier / SMOTE Over Sampled data,0.97509,0.96506,0.96506,0.979032,0.095504,0.950583
9,Tuned DecisionTreeClassifier / Random Under sa...,0.983391,0.966947,0.966947,0.980048,0.101694,0.965501


## 4 CatBoost 

### 4.1 CatBoost - Random Under Sampling data

In [59]:
# Preparing dataset
pool = Pool(data=X_train_u_sampled, label=y_train_u_sampled)

In [62]:
# cv
params = {
    'loss_function': 'Logloss',
    'iterations': 300,
    'custom_loss': 'Recall',
    'random_seed': RANDOM_NUM,
    'learning_rate': 0.15
}

cv_data = cv(
    params=params,
    pool=Pool(data=X_train_u_sampled, label=y_train_u_sampled),
    fold_count=5, # separating to 5 folds
    shuffle=True,
    partition_random_seed=RANDOM_NUM,
    stratified=True, 
    verbose=False
)

# Best score printing
best_value = np.min(cv_data['test-Logloss-mean'])
best_iter = np.argmin(cv_data['test-Logloss-mean'])
print("Best validation Logloss score, stratified: {:.4f}+/-{:.3f} on step {}".format(best_value, cv_data['test-Logloss-std'][best_iter], best_iter))

Training on fold [0/5]

bestTest = 0.07199108017
bestIteration = 180

Training on fold [1/5]

bestTest = 0.06232231934
bestIteration = 176

Training on fold [2/5]

bestTest = 0.05760073377
bestIteration = 159

Training on fold [3/5]

bestTest = 0.06325512234
bestIteration = 160

Training on fold [4/5]

bestTest = 0.06663855236
bestIteration = 142

Best validation Logloss score, stratified: 0.0648+/-0.005 on step 163


In [63]:
# Creating model object
cb_model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.15,
    eval_metric='Recall'
)

model_name = 'CatBoostClassifier / Random Under Sampled data'
cb_model.fit(X_train_u_sampled, y_train_u_sampled,
         eval_set=(X_test, y_test),
         verbose=50,
         early_stopping_rounds=20,
)

y_train_pred = cb_model.predict(X_train_u_sampled)
y_test_pred = cb_model.predict(X_test)

0:	learn: 0.9085258	test: 0.8941725	best: 0.8941725 (0)	total: 20.7ms	remaining: 6.2s
50:	learn: 0.9788632	test: 0.9650350	best: 0.9655012 (47)	total: 1.03s	remaining: 5.01s
100:	learn: 0.9882771	test: 0.9724942	best: 0.9724942 (99)	total: 1.95s	remaining: 3.84s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9752913753
bestIteration = 107

Shrink model to first 108 iterations.


In [64]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, cb_model, X_train_u_sampled, X_test, y_train_u_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [65]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986
4,Tuned LogisticRegression / Random Under Sample...,0.854428,0.954815,0.954815,0.973484,0.061251,0.747319
5,DecisionTreeClassifier / Imbalanced data,0.998505,0.998544,0.998544,0.998497,0.854942,0.750117
6,DecisionTreeClassifier / Random Under Sampled ...,0.981259,0.956838,0.956838,0.974657,0.080449,0.976224
7,DecisionTreeClassifier / Random Over Sampled data,0.977335,0.964095,0.964095,0.978521,0.093907,0.959907
8,DecisionTreeClassifier / SMOTE Over Sampled data,0.97509,0.96506,0.96506,0.979032,0.095504,0.950583
9,Tuned DecisionTreeClassifier / Random Under sa...,0.983391,0.966947,0.966947,0.980048,0.101694,0.965501


### 4.2 CatBoost - Random Over Sampling data

In [66]:
# Creating model object
cb_o_model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.15,
    eval_metric='Recall'
)

model_name = 'CatBoostClassifier / Random Over Sampled data'
cb_o_model.fit(X_train_o_sampled, y_train_o_sampled,
         eval_set=(X_test, y_test),
         verbose=50,
         early_stopping_rounds=20,
)

y_train_pred = cb_o_model.predict(X_train_o_sampled)
y_test_pred = cb_o_model.predict(X_test)

0:	learn: 0.9083202	test: 0.8974359	best: 0.8974359 (0)	total: 373ms	remaining: 1m 51s
50:	learn: 0.9785279	test: 0.9659674	best: 0.9659674 (49)	total: 18.3s	remaining: 1m 29s
100:	learn: 0.9898323	test: 0.9678322	best: 0.9701632 (88)	total: 36s	remaining: 1m 10s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9701631702
bestIteration = 88

Shrink model to first 89 iterations.


In [67]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, cb_o_model, X_train_o_sampled, X_test, y_train_o_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


### 4.3 CatBoost - SMOTE Over Sampling data

In [68]:
# Creating model object
cb_s_model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.15,
    eval_metric='Recall'
)

model_name = 'CatBoostClassifier / SMOTE Over Sampled data'
cb_s_model.fit(X_train_s_sampled, y_train_s_sampled,
         eval_set=(X_test, y_test),
         verbose=50,
         early_stopping_rounds=20,
)

y_train_pred = cb_s_model.predict(X_train_s_sampled)
y_test_pred = cb_s_model.predict(X_test)

0:	learn: 0.9031468	test: 0.8871795	best: 0.8871795 (0)	total: 441ms	remaining: 2m 11s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9389277389
bestIteration = 12

Shrink model to first 13 iterations.


In [69]:
# Adding model evalution statistics
model_evalution_df = add_model_evalution_stat(model_name, cb_s_model, X_train_s_sampled, X_test, y_train_s_sampled, y_test, y_train_pred, y_test_pred, model_evalution_df)

  model_evalution_df = model_evalution_df.append(model_evalution_dict, ignore_index=True)


In [70]:
model_evalution_df

Unnamed: 0,Model Name,Training Score,Testing Score,Accuracy,F1 Score,Precision,Recall
0,Baseline - LogisticRegression / Imbalanced data,0.994677,0.995019,0.995019,0.994677,0.308308,0.233566
1,LogisticRegression / Random Under Sampled data,0.817657,0.919994,0.919994,0.954764,0.034764,0.737063
2,LogisticRegression / Random Over Sampled data,0.819278,0.916694,0.916694,0.952963,0.033594,0.741259
3,LogisticRegression / SMOTE Over Sampled data,0.818825,0.920573,0.920573,0.95508,0.035135,0.73986
4,Tuned LogisticRegression / Random Under Sample...,0.854428,0.954815,0.954815,0.973484,0.061251,0.747319
5,DecisionTreeClassifier / Imbalanced data,0.998505,0.998544,0.998544,0.998497,0.854942,0.750117
6,DecisionTreeClassifier / Random Under Sampled ...,0.981259,0.956838,0.956838,0.974657,0.080449,0.976224
7,DecisionTreeClassifier / Random Over Sampled data,0.977335,0.964095,0.964095,0.978521,0.093907,0.959907
8,DecisionTreeClassifier / SMOTE Over Sampled data,0.97509,0.96506,0.96506,0.979032,0.095504,0.950583
9,Tuned DecisionTreeClassifier / Random Under sa...,0.983391,0.966947,0.966947,0.980048,0.101694,0.965501


In [71]:
# Saving fitted model (CatBoostClassifier / Random Under Sampled data)
with open('models/model.pkl', 'wb') as output:
    pickle.dump(cb_model, output)

# Conclusion

В ходе построения модели классификации было построено три типа моделей: логистическая регрессия, дерево решений и случайный лес. Кроме того, в целях исключения негативного влияния на результат моделей было реализовано три метода борьбы с дисбалансом данных: Random Under Sampling, Random Over Sampling, SMOTE Over Sampling. Также была построена модель классификации с помощью библиотеки CatBoost. 

Таким образом, было построено 18 различных моделей. Из всех построенных моделей по показателю целевой метрики (Recall) наилучшими являются "DecisionTreeClassifier / Random Under Sampled" и "CatBoostClassifier / Random Under Sampled data". 
В качестве итоговой модели выбрана модель "CatBoostClassifier / Random Under Sampled data", показатель Recall для которой составляет 0.975, а показатель F1-score составляет 0.984.