In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
import dill
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,total_credit,sex,education,marrige,age,pay1_sep,bill1_sep,paid1_sep,pay2_aug,bill2_aug,paid2_aug,pay3_jul,bill3_jul,paid3_jul,pay4_jun,bill4_jun,paid4_jun,pay5_may,bill5_may,paid5_may,pay6_apr,bill6_apr,paid6_apr,next_month
0,20000.0,F,2,married,24,2,3913.0,0.0,2,3102.0,689.0,-1,689.0,0.0,-1,0.0,0.0,-2,0.0,0.0,-2,0.0,0.0,1
1,120000.0,F,2,single,26,-1,2682.0,0.0,2,1725.0,1000.0,0,2682.0,1000.0,0,3272.0,1000.0,0,3455.0,0.0,2,3261.0,2000.0,1
2,90000.0,F,2,single,34,0,29239.0,1518.0,0,14027.0,1500.0,0,13559.0,1000.0,0,14331.0,1000.0,0,14948.0,1000.0,0,15549.0,5000.0,0
3,50000.0,F,2,married,37,0,46990.0,2000.0,0,48233.0,2019.0,0,49291.0,1200.0,0,28314.0,1100.0,0,28959.0,1069.0,0,29547.0,1000.0,0
4,50000.0,M,2,married,57,-1,8617.0,2000.0,0,5670.0,36681.0,-1,35835.0,10000.0,0,20940.0,9000.0,0,19146.0,689.0,0,19131.0,679.0,0


### Details of Payment and columns: -
    Pay : Repayment status
    Bill : Amount of bill statement
    Paid : Amount of previous payment
    -2 = Balance paid in full and no transactions this period (we may refer to this credit card account as having been 'inactive' this period)
    -1 = Balance paid in full, but account has a positive balance at end of period due to recent transactions for which payment has not yet come due
    0 = Customer paid the minimum due amount, but not the entire balance. I.e., the customer paid enough for their account to remain in good standing, but did revolve a balance
    1 = payment delay for one month
    2 = payment delay for two months
    . 
    . 
    .
    8 = payment delay for eight months
    9 = payment delay for nine months and above.

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

In [4]:
from sklearn.metrics import accuracy_score

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

In [6]:
models = {
    'LogisticRegression': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier()
}

In [7]:
params = {
    'LogisticRegression': {
        'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky'],
        'max_iter': [100, 250, 500, 750, 1000]
    },
    'KNeighborsClassifier': {
        'n_neighbors' : [5,9,13,15],
        'weights' : ['uniform','distance'],
        'metric' : ['minkowski','euclidean','manhattan']
    },
    'SVC': {
#         'C': [0.1, 1, 10, 100, 1000], 
#         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    },
    'DecisionTreeClassifier': {
#         'criterion': ['gini', 'entropy', 'log_loss'],
#         'splitter': ['best','random'],
        'max_depth': range(5, 15, 3),
#         'min_samples_split': range(8, 16, 2),
#         'min_samples_leaf': range(5, 15, 3),
#         'max_features': ['sqrt','log2']
    },
    'RandomForestClassifier': {
#         'n_estimators': [25, 50, 75, 100],
#         'criterion': ['gini', 'entropy', 'log_loss'],
        'max_depth': range(5, 15, 3),
#         'min_samples_split': range(8, 16, 2),
#         'min_samples_leaf': range(5, 15, 3),
#         'max_features': ['sqrt','log2']
    },
    'GradientBoostingClassifier': {
#         'n_estimators': [25, 50, 75, 100],
#         'loss':['log_loss', 'exponential'],
#         'criterion':['friedman_mse','squared_error'],
        'max_depth': range(5, 15, 3),
#         'min_samples_split': range(8, 16, 2),
#         'min_samples_leaf': range(5, 15, 3),
#         'max_features': ['sqrt','log2']
#         'learning_rate': [1,0.5,.1, .01, .05, .001],
#         'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
    },
    'AdaBoostClassifier': {
        'n_estimators': [25, 50, 75, 100],
#         'learning_rate': [1,0.5,.1, .01, .05, .001]
    }
}

In [8]:
x = data.drop('next_month', axis=1)
y = data.iloc[:,-1]
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, random_state=41)

In [9]:
def transformer_pipe(categorical_features, numerical_features):
    num_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
    cat_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())
    transformer_obj = ColumnTransformer([
        ('numerical', num_pipe, numerical_features),
        ('categorical', cat_pipe, categorical_features)
    ], remainder='passthrough')
    return transformer_obj

In [10]:
categorical_features = ['sex', 'marrige']
numerical_features = ['total_credit', 'bill1_sep', 'paid1_sep', 'bill2_aug', 'paid2_aug', 'bill3_jul', 'paid3_jul', 'bill4_jun',
       'paid4_jun', 'bill5_may', 'paid5_may', 'bill6_apr', 'paid6_apr']
transformer_obj = transformer_pipe(categorical_features, numerical_features)

In [11]:
with open('preprocessor.pkl', 'wb') as file:
    preprocessor = transformer_obj.fit(x_train)
    dill.dump(preprocessor, file)

In [12]:
with open('preprocessor.pkl', 'rb') as file:
    preprocessor = dill.load(file)
    train_array = preprocessor.transform(x_train)
    test_array = preprocessor.transform(x_test)

In [13]:
def test_models(models):
    result = {}
    for i in models:
        model = models[i]
        clf = model.fit(train_array, y_train)

        pred_train = clf.predict(train_array)
        train_score = np.round(accuracy_score(y_train, pred_train)*100, 2)
        pred_test = clf.predict(test_array)
        test_score = np.round(accuracy_score(y_test, pred_test)*100, 2)

        result[str(i)] = [train_score, test_score]
    return result

In [14]:
models_score = test_models(models)

In [15]:
pprint(models_score)

{'AdaBoostClassifier': [71.38, 70.51],
 'DecisionTreeClassifier': [99.96, 86.94],
 'GradientBoostingClassifier': [73.29, 71.54],
 'KNeighborsClassifier': [83.77, 74.67],
 'LogisticRegression': [67.5, 66.95],
 'RandomForestClassifier': [99.96, 92.58],
 'SVC': [69.12, 68.22]}


In [16]:
def test_tuned_models(models, params):
    result = {}
    for i in models:
        model = models[i]
        param = params[i]
        gs = GridSearchCV(model, param, scoring='accuracy', n_jobs=8, verbose=2, cv=5, error_score='raise')
        gs.fit(train_array, y_train)
        model.set_params(**gs.best_params_)
        clf = model.fit(train_array, y_train)

        pred_train = clf.predict(train_array)
        train_score = np.round(accuracy_score(y_train, pred_train)*100, 2)
        pred_test = clf.predict(test_array)
        test_score = np.round(accuracy_score(y_test, pred_test)*100, 2)

        result[str(i)] = [train_score, test_score, gs.best_params_]
    return result

In [None]:
tuned_models_score = test_tuned_models(models, params)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
pprint(tuned_models_score)