In [56]:
import pandas as pd
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier

In [57]:
df = pd.read_csv('UCI_Credit_Card.csv')

In [58]:
X = df.drop('default.payment.next.month', axis=1)
y = df['default.payment.next.month']

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [60]:
# RF

In [61]:
classifier = RandomForestClassifier(n_estimators=100, random_state=101)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [62]:
accuracy_random_forest = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy_random_forest * 100}%')

Accuracy: 82.08333333333333%


In [63]:
# LGBM

In [64]:
model = LGBMClassifier(
    objective='binary',
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    random_state=101
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

[LightGBM] [Info] Number of positive: 5326, number of negative: 18674
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000967 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3521
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221917 -> initscore=-1.254532
[LightGBM] [Info] Start training from score -1.254532


In [65]:
accuracy_lgbm = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy_lgbm * 100}%')

Accuracy: 82.3%


In [66]:
# GBM

In [67]:
gbm = GradientBoostingClassifier(
    n_estimators=300,      
    learning_rate=0.05,     
    max_depth=3,           
    subsample=0.8,         
    random_state=101
)
gbm.fit(X_train, y_train)
y_pred = gbm.predict(X_test)

In [68]:
accuracy_gbm = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy_gbm * 100}%')

Accuracy: 82.33333333333334%


In [69]:
# ADA

In [70]:
base_tree = DecisionTreeClassifier(
    max_depth=1,      
    random_state=101
)
ada = AdaBoostClassifier(
    estimator=base_tree,
    n_estimators=200,     
    learning_rate=0.05,     
    random_state=101
)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)

In [71]:
accuracy_ada = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy_ada * 100}%')

Accuracy: 82.11666666666667%


In [72]:
# CAT

In [73]:
cat = CatBoostClassifier(
    iterations=500,          
    learning_rate=0.05,      
    depth=6,                 
    loss_function='Logloss', 
    eval_metric='Accuracy',
    random_seed=101,
    verbose=False           
)
cat.fit(X_train, y_train)
y_pred = cat.predict(X_test)

In [74]:
accuracy_cat = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy_cat * 100}%')

Accuracy: 82.15%


In [83]:
series = pd.Series([accuracy_random_forest, accuracy_lgbm, accuracy_gbm, accuracy_ada, accuracy_cat], index = ['RF', 'LGBM', 'GBM', 'ADA', 'CAT'])
series = (series * 100)
series_sorted = series.sort_values(ascending=False)
print(series_sorted)

GBM     82.333333
LGBM    82.300000
CAT     82.150000
ADA     82.116667
RF      82.083333
dtype: float64
