In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_columns', None)

In [2]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,total_credit,sex,education,marrige,age,pay1_sep,bill1_sep,paid1_sep,pay2_aug,bill2_aug,paid2_aug,pay3_jul,bill3_jul,paid3_jul,pay4_jun,bill4_jun,paid4_jun,pay5_may,bill5_may,paid5_may,pay6_apr,bill6_apr,paid6_apr,next_month
0,20000.0,F,2,married,24,2,3913.0,0.0,2,3102.0,689.0,-1,689.0,0.0,-1,0.0,0.0,-2,0.0,0.0,-2,0.0,0.0,1
1,120000.0,F,2,single,26,-1,2682.0,0.0,2,1725.0,1000.0,0,2682.0,1000.0,0,3272.0,1000.0,0,3455.0,0.0,2,3261.0,2000.0,1
2,90000.0,F,2,single,34,0,29239.0,1518.0,0,14027.0,1500.0,0,13559.0,1000.0,0,14331.0,1000.0,0,14948.0,1000.0,0,15549.0,5000.0,0
3,50000.0,F,2,married,37,0,46990.0,2000.0,0,48233.0,2019.0,0,49291.0,1200.0,0,28314.0,1100.0,0,28959.0,1069.0,0,29547.0,1000.0,0
4,50000.0,M,2,married,57,-1,8617.0,2000.0,0,5670.0,36681.0,-1,35835.0,10000.0,0,20940.0,9000.0,0,19146.0,689.0,0,19131.0,679.0,0


### Details of Payment and columns: -
    Pay : Repayment status
    Bill : Amount of bill statement
    Paid : Amount of previous payment
    -2 = Balance paid in full and no transactions this period (we may refer to this credit card account as having been 'inactive' this period)
    -1 = Balance paid in full, but account has a positive balance at end of period due to recent transactions for which payment has not yet come due
    0 = Customer paid the minimum due amount, but not the entire balance. I.e., the customer paid enough for their account to remain in good standing, but did revolve a balance
    1 = payment delay for one month
    2 = payment delay for two months
    . 
    . 
    .
    8 = payment delay for eight months
    9 = payment delay for nine months and above.

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [4]:
from sklearn.metrics import accuracy_score

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

In [6]:
models = {
    'LogisticRegression': LogisticRegression(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'SVC': SVC(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'XGBClassifier': XGBClassifier(),
    'CatBoostClassifier': CatBoostClassifier()
}

In [7]:
params = {
    'LogisticRegression': {
        'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky'],
        'max_iter': [100, 250, 500, 750, 1000]
    },
    'KNeighborsClassifier': {
        'n_neighbors' : [5,9,13,15],
        'weights' : ['uniform','distance'],
        'metric' : ['minkowski','euclidean','manhattan']
    },
    'SVC': {
#         'C': [0.1, 1, 10, 100, 1000], 
#         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']
    },
    'DecisionTreeClassifier': {
#         'criterion': ['gini', 'entropy', 'log_loss'],
#         'splitter': ['best','random'],
        'max_depth': range(5, 15, 3),
        'min_samples_split': range(8, 16, 2),
        'min_samples_leaf': range(5, 15, 3),
#         'max_features': ['sqrt','log2']
    },
    'RandomForestClassifier': {
#         'n_estimators': [25, 50, 75, 100],
#         'criterion': ['gini', 'entropy', 'log_loss'],
        'max_depth': range(5, 15, 3),
        'min_samples_split': range(8, 16, 2),
        'min_samples_leaf': range(5, 15, 3),
#         'max_features': ['sqrt','log2']
    },
    'GradientBoostingClassifier': {
#         'n_estimators': [25, 50, 75, 100],
#         'loss':['log_loss', 'exponential'],
#         'criterion':['friedman_mse','squared_error'],
        'max_depth': range(5, 15, 3),
        'min_samples_split': range(8, 16, 2),
        'min_samples_leaf': range(5, 15, 3),
#         'max_features': ['sqrt','log2']
#         'learning_rate': [1,0.5,.1, .01, .05, .001],
#         'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
    },
    'AdaBoostClassifier': {},
    'XGBClassifier': {},
    'CatBoostClassifier': {}
}

In [8]:
x = data.drop('next_month', axis=1)
y = data.iloc[:,-1]
categorical_features = [i for i in data.columns if data[i].dtype=='O']
numerical_features = ['total_credit', 'bill1_sep', 'paid1_sep', 'bill2_aug', 'paid2_aug', 'bill3_jul', 'paid3_jul', 'bill4_jun',
       'paid4_jun', 'bill5_may', 'paid5_may', 'bill6_apr', 'paid6_apr']
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, random_state=41)

In [9]:
def transformer_pipe(categorical_features, numerical_features):
    num_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
    cat_pipe = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())
    transformer_obj = ColumnTransformer([
        ('numerical', num_pipe, numerical_features),
        ('categorical', cat_pipe, categorical_features)
    ], remainder='passthrough')
    return transformer_obj

In [10]:
transformer_obj = transformer_pipe(categorical_features, numerical_features)

In [11]:
train_array = transformer_obj.fit_transform(x_train)
test_array = transformer_obj.fit_transform(x_test)

In [12]:
def test_models(models, train_array, y_train, test_array, y_test):
    result = {}
    for i in models:
        model = models[i]
        clf = model.fit(train_array, y_train)
        pred_train = clf.predict(train_array)
        train_score = np.round(accuracy_score(y_train, pred_train)*100, 2)
        pred_test = clf.predict(test_array)
        test_score = np.round(accuracy_score(y_test, pred_test)*100, 2)
        result[str(i)] = [train_score, test_score]
    return result

In [13]:
result = test_models(models, train_array, y_train, test_array, y_test)

Learning rate set to 0.047042
0:	learn: 0.6817789	total: 138ms	remaining: 2m 17s
1:	learn: 0.6706926	total: 162ms	remaining: 1m 20s
2:	learn: 0.6602681	total: 173ms	remaining: 57.6s
3:	learn: 0.6510113	total: 184ms	remaining: 45.9s
4:	learn: 0.6433532	total: 195ms	remaining: 38.8s
5:	learn: 0.6367603	total: 207ms	remaining: 34.3s
6:	learn: 0.6299205	total: 218ms	remaining: 31s
7:	learn: 0.6234460	total: 229ms	remaining: 28.4s
8:	learn: 0.6179501	total: 241ms	remaining: 26.5s
9:	learn: 0.6128247	total: 252ms	remaining: 24.9s
10:	learn: 0.6086958	total: 265ms	remaining: 23.8s
11:	learn: 0.6044711	total: 277ms	remaining: 22.8s
12:	learn: 0.6009301	total: 289ms	remaining: 21.9s
13:	learn: 0.5978368	total: 300ms	remaining: 21.1s
14:	learn: 0.5949795	total: 313ms	remaining: 20.6s
15:	learn: 0.5922677	total: 325ms	remaining: 20s
16:	learn: 0.5894435	total: 338ms	remaining: 19.5s
17:	learn: 0.5871771	total: 351ms	remaining: 19.1s
18:	learn: 0.5847786	total: 363ms	remaining: 18.7s
19:	learn: 0.

165:	learn: 0.5248382	total: 2.11s	remaining: 10.6s
166:	learn: 0.5246148	total: 2.12s	remaining: 10.6s
167:	learn: 0.5245145	total: 2.13s	remaining: 10.5s
168:	learn: 0.5243680	total: 2.14s	remaining: 10.5s
169:	learn: 0.5241822	total: 2.15s	remaining: 10.5s
170:	learn: 0.5240388	total: 2.16s	remaining: 10.5s
171:	learn: 0.5237883	total: 2.17s	remaining: 10.4s
172:	learn: 0.5235292	total: 2.18s	remaining: 10.4s
173:	learn: 0.5234278	total: 2.19s	remaining: 10.4s
174:	learn: 0.5232056	total: 2.2s	remaining: 10.4s
175:	learn: 0.5230406	total: 2.21s	remaining: 10.4s
176:	learn: 0.5227662	total: 2.23s	remaining: 10.3s
177:	learn: 0.5224790	total: 2.24s	remaining: 10.3s
178:	learn: 0.5223676	total: 2.25s	remaining: 10.3s
179:	learn: 0.5221761	total: 2.26s	remaining: 10.3s
180:	learn: 0.5220667	total: 2.27s	remaining: 10.3s
181:	learn: 0.5219354	total: 2.28s	remaining: 10.2s
182:	learn: 0.5217485	total: 2.29s	remaining: 10.2s
183:	learn: 0.5216197	total: 2.3s	remaining: 10.2s
184:	learn: 0.

326:	learn: 0.4946566	total: 3.98s	remaining: 8.19s
327:	learn: 0.4944289	total: 3.99s	remaining: 8.17s
328:	learn: 0.4941552	total: 4s	remaining: 8.16s
329:	learn: 0.4939200	total: 4.01s	remaining: 8.14s
330:	learn: 0.4937388	total: 4.02s	remaining: 8.13s
331:	learn: 0.4935988	total: 4.03s	remaining: 8.12s
332:	learn: 0.4934243	total: 4.04s	remaining: 8.1s
333:	learn: 0.4931935	total: 4.06s	remaining: 8.09s
334:	learn: 0.4929865	total: 4.07s	remaining: 8.08s
335:	learn: 0.4927916	total: 4.08s	remaining: 8.06s
336:	learn: 0.4926273	total: 4.09s	remaining: 8.05s
337:	learn: 0.4924129	total: 4.1s	remaining: 8.03s
338:	learn: 0.4921536	total: 4.11s	remaining: 8.02s
339:	learn: 0.4919787	total: 4.12s	remaining: 8s
340:	learn: 0.4918056	total: 4.13s	remaining: 7.99s
341:	learn: 0.4916658	total: 4.14s	remaining: 7.97s
342:	learn: 0.4915103	total: 4.16s	remaining: 7.96s
343:	learn: 0.4913157	total: 4.17s	remaining: 7.95s
344:	learn: 0.4911960	total: 4.18s	remaining: 7.93s
345:	learn: 0.491119

499:	learn: 0.4677111	total: 5.89s	remaining: 5.89s
500:	learn: 0.4675973	total: 5.91s	remaining: 5.88s
501:	learn: 0.4674185	total: 5.92s	remaining: 5.87s
502:	learn: 0.4672764	total: 5.93s	remaining: 5.86s
503:	learn: 0.4671390	total: 5.94s	remaining: 5.84s
504:	learn: 0.4670605	total: 5.96s	remaining: 5.84s
505:	learn: 0.4669557	total: 5.97s	remaining: 5.83s
506:	learn: 0.4668497	total: 5.99s	remaining: 5.82s
507:	learn: 0.4666933	total: 6s	remaining: 5.81s
508:	learn: 0.4665186	total: 6.02s	remaining: 5.8s
509:	learn: 0.4663639	total: 6.04s	remaining: 5.8s
510:	learn: 0.4661999	total: 6.05s	remaining: 5.79s
511:	learn: 0.4659992	total: 6.07s	remaining: 5.79s
512:	learn: 0.4658942	total: 6.09s	remaining: 5.78s
513:	learn: 0.4657962	total: 6.11s	remaining: 5.77s
514:	learn: 0.4656552	total: 6.12s	remaining: 5.76s
515:	learn: 0.4654803	total: 6.13s	remaining: 5.75s
516:	learn: 0.4653731	total: 6.14s	remaining: 5.74s
517:	learn: 0.4652554	total: 6.15s	remaining: 5.73s
518:	learn: 0.465

661:	learn: 0.4456437	total: 7.8s	remaining: 3.98s
662:	learn: 0.4454650	total: 7.81s	remaining: 3.97s
663:	learn: 0.4453476	total: 7.82s	remaining: 3.96s
664:	learn: 0.4451722	total: 7.83s	remaining: 3.95s
665:	learn: 0.4451321	total: 7.84s	remaining: 3.93s
666:	learn: 0.4450234	total: 7.85s	remaining: 3.92s
667:	learn: 0.4448631	total: 7.86s	remaining: 3.91s
668:	learn: 0.4447517	total: 7.88s	remaining: 3.9s
669:	learn: 0.4446623	total: 7.89s	remaining: 3.88s
670:	learn: 0.4445123	total: 7.9s	remaining: 3.87s
671:	learn: 0.4443809	total: 7.91s	remaining: 3.86s
672:	learn: 0.4441959	total: 7.92s	remaining: 3.85s
673:	learn: 0.4440729	total: 7.93s	remaining: 3.83s
674:	learn: 0.4439037	total: 7.94s	remaining: 3.82s
675:	learn: 0.4438329	total: 7.95s	remaining: 3.81s
676:	learn: 0.4437723	total: 7.96s	remaining: 3.8s
677:	learn: 0.4435773	total: 7.97s	remaining: 3.78s
678:	learn: 0.4434360	total: 7.98s	remaining: 3.77s
679:	learn: 0.4433138	total: 7.99s	remaining: 3.76s
680:	learn: 0.44

822:	learn: 0.4264025	total: 9.7s	remaining: 2.09s
823:	learn: 0.4262832	total: 9.71s	remaining: 2.07s
824:	learn: 0.4261632	total: 9.72s	remaining: 2.06s
825:	learn: 0.4260166	total: 9.73s	remaining: 2.05s
826:	learn: 0.4258429	total: 9.74s	remaining: 2.04s
827:	learn: 0.4256887	total: 9.76s	remaining: 2.03s
828:	learn: 0.4255739	total: 9.77s	remaining: 2.01s
829:	learn: 0.4254440	total: 9.78s	remaining: 2s
830:	learn: 0.4253751	total: 9.79s	remaining: 1.99s
831:	learn: 0.4252549	total: 9.79s	remaining: 1.98s
832:	learn: 0.4251476	total: 9.81s	remaining: 1.97s
833:	learn: 0.4250004	total: 9.83s	remaining: 1.96s
834:	learn: 0.4249213	total: 9.84s	remaining: 1.94s
835:	learn: 0.4247372	total: 9.85s	remaining: 1.93s
836:	learn: 0.4246127	total: 9.86s	remaining: 1.92s
837:	learn: 0.4244938	total: 9.87s	remaining: 1.91s
838:	learn: 0.4243171	total: 9.88s	remaining: 1.9s
839:	learn: 0.4242408	total: 9.89s	remaining: 1.88s
840:	learn: 0.4240700	total: 9.9s	remaining: 1.87s
841:	learn: 0.4239

In [15]:
pprint(result)

{'AdaBoostClassifier': [71.38, 70.14],
 'CatBoostClassifier': [82.86, 73.73],
 'DecisionTreeClassifier': [99.96, 73.27],
 'GradientBoostingClassifier': [73.29, 70.45],
 'KNeighborsClassifier': [83.77, 74.6],
 'LogisticRegression': [67.5, 66.89],
 'RandomForestClassifier': [99.96, 83.68],
 'SVC': [69.12, 68.19],
 'XGBClassifier': [87.52, 72.77]}
