In [1]:
import pandas as pd
import numpy as np

from models.logistic_regression import *
from models.knn import *
from models.decision_tree import *
from models.random_forest import *


from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# also passed as `random_state` to the KFold splitter inside custom_CV.
seed = 10
np.random.seed(seed)

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides any warning a model emits while fitting —
# confirm that silencing them is intended.
import warnings
warnings.filterwarnings("ignore")

In [2]:
def custom_CV(model, X, y, k_folds=5):
    """Score ``model`` with shuffled k-fold cross-validation.

    The same ``model`` object is re-fitted on every fold's training rows and
    scored (accuracy) on that fold's held-out rows.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)`` that accept
            NumPy arrays.
        X: pandas DataFrame of features (one row per sample).
        y: pandas Series of targets, aligned row-for-row with ``X``.
        k_folds: Number of folds.

    Returns:
        NumPy array of length ``k_folds`` holding the accuracy of each fold.

    Note:
        The split is shuffled with the module-level ``seed`` so that every
        model evaluated in this notebook sees the same folds.
    """
    kf = KFold(n_splits=k_folds, random_state=seed, shuffle=True)
    scores = np.zeros(k_folds)
    for i, (train_index, val_index) in enumerate(kf.split(X, y)):
        # KFold yields *positional* row numbers, so rows must be selected with
        # .iloc. Label-based .loc only works on a default RangeIndex and
        # fails or picks the wrong rows on any other index (e.g. a frame that
        # was filtered or shuffled without reset_index).
        X_train, y_train = X.iloc[train_index].to_numpy(), y.iloc[train_index].to_numpy()
        X_val, y_val = X.iloc[val_index].to_numpy(), y.iloc[val_index].to_numpy()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        scores[i] = accuracy_score(y_val, y_pred)
    return scores

# Credit risk dataset

In [3]:
# Load the credit risk data: the 'label' column is the target (y) and every
# remaining column is used as a feature (X).
df = pd.read_csv('../data/modified_credit_risk.csv')
X, y = df.drop(['label'], axis=1), df['label']

### Logistic regression

In [4]:
%%time
# Reference run: scikit-learn's LogisticRegression with default arguments,
# scored with custom_CV (5 folds by default) on the current X, y.
lr = LogisticRegression()
scores = custom_CV(lr, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.80 +/- 0.03
CPU times: user 110 ms, sys: 67.5 ms, total: 178 ms
Wall time: 62.5 ms


In [5]:
%%time
# Project implementation: LogRegression is not imported by name, so it
# presumably comes from the `models.logistic_regression` star import — confirm.
# Scored with the same custom_CV call on the current X, y.
lr = LogRegression()
scores = custom_CV(lr, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.80 +/- 0.03
CPU times: user 590 ms, sys: 360 ms, total: 949 ms
Wall time: 268 ms


### KNN

In [6]:
%%time
# Reference run: scikit-learn's KNeighborsClassifier with default arguments,
# scored with custom_CV (5 folds by default) on the current X, y.
knn = KNeighborsClassifier()
scores = custom_CV(knn, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.77 +/- 0.04
CPU times: user 167 ms, sys: 85.1 ms, total: 252 ms
Wall time: 138 ms


In [7]:
%%time
# Project implementation: KNNClassifier is not imported by name, so it
# presumably comes from the `models.knn` star import — confirm.
# Scored with the same custom_CV call on the current X, y.
knn = KNNClassifier()
scores = custom_CV(knn, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.77 +/- 0.04
CPU times: user 344 ms, sys: 232 ms, total: 576 ms
Wall time: 168 ms


### Decision tree

In [8]:
%%time
# Reference run: scikit-learn's DecisionTreeClassifier with default arguments,
# scored with custom_CV (5 folds by default) on the current X, y.
dtc = DecisionTreeClassifier()
scores = custom_CV(dtc, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.67 +/- 0.07
CPU times: user 94.6 ms, sys: 22.8 ms, total: 117 ms
Wall time: 53.4 ms


In [9]:
%%time
# Project implementation: DTClassifier is not imported by name, so it
# presumably comes from the `models.decision_tree` star import — confirm.
# Scored with the same custom_CV call on the current X, y.
dtc = DTClassifier()
scores = custom_CV(dtc, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.80 +/- 0.03
CPU times: user 890 ms, sys: 4.14 ms, total: 894 ms
Wall time: 895 ms


### Random Forest

In [10]:
%%time
# Reference run: scikit-learn's RandomForestClassifier with default arguments
# (no random_state is passed), scored with custom_CV (5 folds by default)
# on the current X, y.
rfc = RandomForestClassifier()
scores = custom_CV(rfc, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.78 +/- 0.04
CPU times: user 104 ms, sys: 32 µs, total: 104 ms
Wall time: 104 ms


In [11]:
%%time
# Project implementation: RFClassifier is not imported by name, so it
# presumably comes from the `models.random_forest` star import — confirm.
# Scored with the same custom_CV call on the current X, y.
rfc = RFClassifier()
scores = custom_CV(rfc, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.80 +/- 0.03
CPU times: user 4.24 s, sys: 0 ns, total: 4.24 s
Wall time: 4.24 s


# Churn modelling dataset

In [12]:
# Load the churn modelling data: the 'Exited' column is the target (y) and
# every remaining column is used as a feature (X). This rebinds df, X and y,
# so the cells that follow are evaluated on this dataset.
df = pd.read_csv('../data/modified_churn_modelling.csv')
X, y = df.drop(['Exited'], axis=1), df['Exited']

### Logistic regression

In [13]:
%%time
# Reference run: scikit-learn's LogisticRegression with default arguments,
# scored with custom_CV (5 folds by default) on the current X, y.
lr = LogisticRegression()
scores = custom_CV(lr, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.83 +/- 0.01
CPU times: user 419 ms, sys: 235 ms, total: 654 ms
Wall time: 220 ms


In [14]:
%%time
# Project implementation: LogRegression is not imported by name, so it
# presumably comes from the `models.logistic_regression` star import — confirm.
# Scored with the same custom_CV call on the current X, y.
lr = LogRegression()
scores = custom_CV(lr, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.80 +/- 0.01
CPU times: user 2.29 s, sys: 1.57 s, total: 3.86 s
Wall time: 1.28 s


### KNN

In [15]:
%%time
# Reference run: scikit-learn's KNeighborsClassifier with default arguments,
# scored with custom_CV (5 folds by default) on the current X, y.
knn = KNeighborsClassifier()
scores = custom_CV(knn, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.82 +/- 0.01
CPU times: user 828 ms, sys: 65.4 ms, total: 893 ms
Wall time: 783 ms


In [16]:
%%time
# Project implementation: KNNClassifier is not imported by name, so it
# presumably comes from the `models.knn` star import — confirm.
# Scored with the same custom_CV call on the current X, y.
knn = KNNClassifier()
scores = custom_CV(knn, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.82 +/- 0.01
CPU times: user 6.21 s, sys: 842 ms, total: 7.05 s
Wall time: 5.68 s


### Decision tree

In [17]:
%%time
# Reference run: scikit-learn's DecisionTreeClassifier with default arguments,
# scored with custom_CV (5 folds by default) on the current X, y.
dtc = DecisionTreeClassifier()
scores = custom_CV(dtc, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.79 +/- 0.02
CPU times: user 165 ms, sys: 282 µs, total: 165 ms
Wall time: 164 ms


In [18]:
%%time
# Project implementation: DTClassifier is not imported by name, so it
# presumably comes from the `models.decision_tree` star import — confirm.
# Scored with the same custom_CV call on the current X, y.
dtc = DTClassifier()
scores = custom_CV(dtc, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.82 +/- 0.01
CPU times: user 3.59 s, sys: 10.8 ms, total: 3.6 s
Wall time: 3.6 s


### Random Forest

In [19]:
%%time
# Reference run: scikit-learn's RandomForestClassifier with default arguments
# (no random_state is passed), scored with custom_CV (5 folds by default)
# on the current X, y.
rfc = RandomForestClassifier()
scores = custom_CV(rfc, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.84 +/- 0.01
CPU times: user 369 ms, sys: 0 ns, total: 369 ms
Wall time: 368 ms


In [20]:
%%time
# Project implementation: RFClassifier is not imported by name, so it
# presumably comes from the `models.random_forest` star import — confirm.
# Scored with the same custom_CV call on the current X, y.
rfc = RFClassifier()
scores = custom_CV(rfc, X, y)
# Prints the mean fold accuracy and twice the standard deviation across folds.
print("Acc: %0.2f +/- %0.2f" % (scores.mean(), scores.std() * 2))

Acc: 0.80 +/- 0.01
CPU times: user 50 s, sys: 126 ms, total: 50.1 s
Wall time: 50.3 s
