# Code

In [1]:
import pandas as pd
import numpy as np
import pickle as pkl

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_curve

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from time import time
import warnings

warnings.filterwarnings('ignore')

In [2]:
def eda(X_train, X_test):
    # So there are no problems with indexes during cross-validation
    X_train.reset_index(drop=True, inplace=True)
    X_test.reset_index(drop=True, inplace=True)
    # Feature extraction
    numerical_features = ['Age', 'Debt', 'YearsEmployed', 'ZipCode', 'Income']
    categorical_features = [
        'Gender', 'Married', 'BankCustomer', 'Industry', 'Ethnicity',
        'PriorDefault', 'Employed', 'CreditScore', 'DriversLicense', 'Citizen'
    ]
    # Logarithmic transformation of a highly skewed variables into a more normalized view
    X_train[numerical_features] = np.log(X_train[numerical_features].replace(0, np.nan)).fillna(0)
    X_test[numerical_features] = np.log(X_test[numerical_features].replace(0, np.nan)).fillna(0)
    return X_train, X_test, numerical_features, categorical_features

In [89]:
def preprocessing(X_train, X_test, numerical_features, categorical_features):
    # Features Encoding
    # Some models are sensitive to scaling of numerical features
    numerical_transformer = Pipeline(steps=[('scalar', StandardScaler())])
    categorical_transformer = Pipeline(steps=[(
        'onehot', OneHotEncoder(drop='if_binary', sparse=False, handle_unknown='ignore')
    )])

    ct = ColumnTransformer([
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ], remainder='passthrough')

    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)
    return X_train, X_test

In [98]:
def model_selection(X, y):
    # Starter set of classifiers
    classifiers = {
        'KNN': KNeighborsClassifier(),
        'LogReg': LogisticRegression(),
        'SVC': SVC(),
        'RandomForest': RandomForestClassifier(),
        'LGBM': LGBMClassifier(),
        'CatBoost': CatBoostClassifier()
    }
    # Parameters of models for grid search
    KNN_grid = {
        "n_neighbors": list(range(5, 30+1, 5)),
        "weights": ['uniform', 'distance'],
        "metric": ['euclidian', 'manhattan', 'minkowski']
    }
    LogReg_grid = {
        "penalty": ['l1', 'l2'],
        "C": [0.001, 0.1, 1, 5, 20],
        "random_state": [0]
    }
    SVC_grid = {
        "kernel": ['poly', 'rbf', 'sigmoid'],
        "C": [0.001, 0.1, 1, 5, 20],
        "random_state": [0],
        "probability": [True]
    }
    RandomForest_grid = {
        "n_estimators": [50, 100, 500, 1000],
        "max_features": ['sqrt', 'log2'],
        "random_state": [0]
    }
    LGBM_grid = {
        "n_estimators": [50, 100, 500, 1000],
        "max_depth": [3, 6, 9],
        "learning_rate": [0.001, 0.01, 0.1, 1],
        "random_state": [0]
    }
    CatBoost_grid = {
        "n_estimators": [50, 100, 500],
        "max_depth": [3, 6, 9],
        "learning_rate": [0.01, 0.1, 1],
        "random_state": [0],
        "verbose": [False]
    }

    grid = {
        'KNN': KNN_grid,
        'LogReg': LogReg_grid,
        'SVC': SVC_grid,
        'RandomForest': RandomForest_grid,
        'LGBM': LGBM_grid,
        'CatBoost': CatBoost_grid
    }
    # Models fitting without cross-validation
    y.reset_index(drop=True, inplace=True)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)
    score = classifiers.copy()
    best_params = classifiers.copy()
    print("MODEL SELECTION")
    for i, (key, classifier) in enumerate(classifiers.items()):
        start = time()
        clf = GridSearchCV(classifier, param_grid=grid[key], n_jobs=-1, cv=None)
        clf.fit(X_train, y_train)
        stop = time()
        score[key] = clf.score(X_valid, y_valid)
        best_params[key] = clf.best_params_
        print("Model:", key)
        print("Score:", score[key])
        print("Training time (mins):", np.round((stop-start)/60, decimals=2))

    key_max_score = max(score, key=score.get)
    # Choice of best models
    best_classifiers = {
        key:classifiers[key].set_params(**best_params[key]) for key, value in score.items() if score[key_max_score] - value < 0.015
    }
    print("Best models:", end=' ')
    print(*best_classifiers.keys(), sep=', ')
    return best_classifiers

In [91]:
def predictions_tuning(y_test, predictions):
    # The Geometric Mean to find optimal threshold
    fpr, tpr, thresholds = roc_curve(y_test, predictions)
    gmeans = np.sqrt(tpr * (1 - fpr))
    ix = np.argmax(gmeans)
    best_threshold = thresholds[ix]
    predictions[np.where(predictions >= best_threshold)] = 1
    predictions[np.where(predictions < best_threshold)] = 0
    print("Accuracy after tuning:", accuracy_score(y_test, predictions))
    return predictions

In [99]:
def prediction(X, y, X_test, y_test, best_classifiers, save_model, filename):
    # Soft voting for predictions (Forming an ensemble)
    print('*' * 30)
    print("PREDICITON")
    models = [(key, model) for key, model in best_classifiers.items()]
    ensemble = VotingClassifier(estimators=models, voting='soft')
    ensemble.fit(X, y)
    # To save model
    if save_model:
        pkl.dump(ensemble, open(filename, 'wb'))
    predictions = ensemble.predict_proba(X_test)[:, 1]
    print("Accuracy before tuning:", accuracy_score(y_test, predictions.astype(int)))
    return predictions_tuning(y_test, predictions)

In [93]:
def make_prediction(X_train, X_test, y_train, y_test, save_model, filename=''):
    # The main function consisting of the other functions above
    X_train, X_test = preprocessing(*eda(X_train, X_test))
    best_models = model_selection(X_train, y_train)
    return prediction(X_train, y_train, X_test, y_test, best_models, save_model, filename)

# Make a prediction

In [100]:
# Parameter setting
path_to_train = "./data/train.csv"
path_to_test = "./data/test.csv"
filename = "./models/model.pkl"
save_model = True

train = pd.read_csv(path_to_train)
test = pd.read_csv(path_to_test)
print("Train size:", train.shape[0], "\nTest size:", test.shape[0])

X_train, y_train = train.drop('Approved', axis=1), train['Approved']
X_test, y_test = test.drop('Approved', axis=1), test['Approved']

Train size: 590 
Test size: 100


In [101]:
# To do predictions
# And get progress information
predictions = make_prediction(X_train, X_test, y_train, y_test, save_model, filename)
print("Proportion of True predictions:", predictions.sum() / len(predictions))

MODEL SELECTION
Model: KNN
Score: 0.8108108108108109
Training time (mins): 0.26
Model: LogReg
Score: 0.831081081081081
Training time (mins): 0.0
Model: SVC
Score: 0.8378378378378378
Training time (mins): 0.02
Model: RandomForest
Score: 0.8378378378378378
Training time (mins): 0.19
Model: LGBM
Score: 0.8513513513513513
Training time (mins): 0.17
Model: CatBoost
Score: 0.831081081081081
Training time (mins): 1.34
Best models: SVC, RandomForest, LGBM
******************************
PREDICITON
Accuracy before tuning: 0.84
Accuracy after tuning: 0.9
Proportion of True predictions: 0.14


In [102]:
# Demonstration of the use of the saved model
with open(filename, 'rb') as model:
    loaded_model = pkl.load(model)
    X_train, X_test = preprocessing(*eda(X_train, X_test))
    print("Score from loaded model:", loaded_model.score(X_test, y_test))

Score from loaded model: 0.89
