In [1]:
# Initial configuration
# import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split 




In [2]:
# Load data
df = pd.read_csv('./data/UCI_Credit_Card.csv')

df.set_index('ID' , inplace = True)
df.rename(columns={'PAY_0':'PAY_1'}, inplace=True)

In [3]:
# Separate features to target, train to test
features = df.drop('default.payment.next.month' ,axis = 1)
target = df['default.payment.next.month']
x_train,x_test,y_train,y_test = train_test_split(features,target,test_size = 0.2)

In [4]:
# Find the best algorithm to the problem (in a very brute and careless way)

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator

from dataclasses import dataclass,field
import typing

@dataclass
class Modelshowcaser:
    model : BaseEstimator
    acc: float
    log_loss: float
    @classmethod
    def train_and_evaluate(cls, model: BaseEstimator, x_train, y_train, x_test, y_test):
        model.fit(x_train, y_train)
        train_predictions = model.predict(x_test)
        acc = accuracy_score(y_test, train_predictions)
        train_predictions = model.predict_proba(x_test)
        ll = log_loss(y_test, train_predictions)
        return cls(model, acc, ll)

@dataclass
class ShowCase:
    models : typing.List[Modelshowcaser] = field(default_factory=list)
    @classmethod
    def train_and_evaluate(cls, models: typing.List[BaseEstimator], x_train, y_train, x_test, y_test):
        _models = list()
        for m in models:
            print(f"Training {m.__class__.__name__}")
            m_c = Modelshowcaser.train_and_evaluate(m, x_train, y_train, x_test, y_test)
            _models.append(m_c)
        return cls(models=_models)
    def sort_by_acc(self):
        self.models = sorted([m for m in self.models], key=lambda x: x.acc, reverse=True)
        return self
    def sort_by_log_loss(self) -> None:
        self.models = sorted([m for m in self.models], key=lambda x: x.log_loss, reverse=False)
        return self


#

In [5]:
classifiers = [
    ExtraTreesClassifier(n_estimators=10),
    KNeighborsClassifier(3),
    RandomForestClassifier(),
#     SVC(kernel="rbf", C=0.025, probability=True),
    DecisionTreeClassifier()
]

c = ShowCase.train_and_evaluate(classifiers,  x_train, y_train, x_test, y_test)

Training ExtraTreesClassifier
Training KNeighborsClassifier
Training RandomForestClassifier
Training DecisionTreeClassifier


In [6]:
c.sort_by_log_loss()

showcase(models=[Modelshowcaser(model=RandomForestClassifier(), acc=0.8183333333333334, log_loss=0.45464423030780254), Modelshowcaser(model=ExtraTreesClassifier(n_estimators=10), acc=0.8045, log_loss=1.2203769109276406), Modelshowcaser(model=KNeighborsClassifier(n_neighbors=3), acc=0.7371666666666666, log_loss=3.880229955538274), Modelshowcaser(model=DecisionTreeClassifier(), acc=0.7233333333333334, log_loss=9.556277442069621)])

In [7]:
c.sort_by_acc()

showcase(models=[Modelshowcaser(model=RandomForestClassifier(), acc=0.8183333333333334, log_loss=0.45464423030780254), Modelshowcaser(model=ExtraTreesClassifier(n_estimators=10), acc=0.8045, log_loss=1.2203769109276406), Modelshowcaser(model=KNeighborsClassifier(n_neighbors=3), acc=0.7371666666666666, log_loss=3.880229955538274), Modelshowcaser(model=DecisionTreeClassifier(), acc=0.7233333333333334, log_loss=9.556277442069621)])

In [8]:
winner = c.models[0].model

In [None]:
winner

In [13]:
# save winner
import joblib
import os
model_name = 'model.joblib'
joblib.dump(winner, model_name)


['../model.joblib']