In [143]:
# import libraries 

# utility/data wrangling
import pandas as pd
import numpy as np
from warnings import filterwarnings
import pickle 


# chart creations
import matplotlib.pyplot as plt
import seaborn as sns

# pre processing
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import TomekLinks # down sampling
from imblearn.over_sampling import SMOTE # up sampling


# model selection
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier


# Model Validation 
from sklearn.metrics import f1_score, accuracy_score, recall_score, roc_auc_score, roc_curve

# statistical testing
from scipy.stats import f_oneway
from scipy import stats

# # py file
# import src

filterwarnings('ignore')
%matplotlib inline

In [4]:
# import data
df = pd.read_csv('../data/holdout_compatible_train.csv')
df.head()

Unnamed: 0,limit_bal,male,age,default,other,post_grad,university,married,deli,credit_utility,...,pay_4_-1,pay_4_0,pay_4_5,pay_5_-1,pay_5_0,pay_5_5,pay_6_-2,pay_6_-1,pay_6_2,pay_6_7
0,220000,0,36,1,0,1,0,0,0,4.8246,...,0,1,0,0,1,0,0,0,0,0
1,200000,0,29,0,0,0,0,0,-6,0.0,...,1,0,0,1,0,0,0,1,0,0
2,180000,0,27,0,0,1,0,0,-12,0.0,...,0,0,0,0,0,0,1,0,0,0
3,80000,1,32,0,0,0,1,0,0,3.373638,...,0,1,0,0,1,0,0,0,0,0
4,10000,1,27,1,0,0,1,0,0,2.661,...,0,1,0,0,1,0,0,0,0,0


In [None]:
poly_2 = PolynomialFeatures()

In [12]:
to_scale = [
    'limit_bal',
    'age',
]

In [14]:
log_scaled = np.log(df[to_scale])

In [190]:
log_scaled.columns = ['log_'+x for x in log_scaled.columns]
log_scaled

Unnamed: 0,log_log_limit_bal,log_log_age
0,12.301383,3.583519
1,12.206073,3.367296
2,12.100712,3.295837
3,11.289782,3.465736
4,9.210340,3.295837
...,...,...
22494,10.596635,3.637586
22495,12.765688,3.737670
22496,11.512925,3.828641
22497,9.903488,3.912023


In [191]:
df_log = pd.concat([df, log_scaled], axis=1)
df_log

Unnamed: 0,limit_bal,male,age,default,other,post_grad,university,married,deli,credit_utility,...,pay_4_5,pay_5_-1,pay_5_0,pay_5_5,pay_6_-2,pay_6_-1,pay_6_2,pay_6_7,log_log_limit_bal,log_log_age
0,220000,0,36,1,0,1,0,0,0,4.824600,...,0,0,1,0,0,0,0,0,12.301383,3.583519
1,200000,0,29,0,0,0,0,0,-6,0.000000,...,0,1,0,0,0,1,0,0,12.206073,3.367296
2,180000,0,27,0,0,1,0,0,-12,0.000000,...,0,0,0,0,1,0,0,0,12.100712,3.295837
3,80000,1,32,0,0,0,1,0,0,3.373638,...,0,0,1,0,0,0,0,0,11.289782,3.465736
4,10000,1,27,1,0,0,1,0,0,2.661000,...,0,0,1,0,0,0,0,0,9.210340,3.295837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22494,40000,0,38,1,0,0,1,1,9,5.599450,...,0,0,0,0,0,0,1,0,10.596635,3.637586
22495,350000,1,42,0,0,1,0,1,-6,-0.019574,...,0,1,0,0,0,1,0,0,12.765688,3.737670
22496,100000,0,46,0,0,0,0,0,3,0.058820,...,0,1,0,0,0,0,0,0,11.512925,3.828641
22497,20000,0,50,1,0,0,0,1,-8,0.227600,...,0,0,0,0,1,0,0,0,9.903488,3.912023


In [192]:
# split the data

X = df_log.drop(columns=['default'])
y = df_log[['default']]

# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state = 42)

# normalize the data
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16874, 44), (5625, 44), (16874, 1), (5625, 1))

In [193]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.01,
                            solver='saga', 
                            penalty='elasticnet',
                            max_iter=100, 
                            l1_ratio=0.5
                            )

logReg.fit(X_train_scaled, y_train)
y_hat = logReg.predict(X_test_scaled)
f1_score(y_test, y_hat)
arr = cross_val_score(logReg, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
delinquency_CV = np.mean(arr)    
print(f"the log scaled cross validation is {delinquency_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the log scaled cross validation is 0.5301275820443356


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    7.2s finished


# Tomek links

In [194]:
tl = TomekLinks()

In [205]:
# Instantiate the object
tl = TomekLinks()
# resample the data
X_res_train, y_res_train = tl.fit_resample(X_train.as_matrix(),y_train.as_matrix())

# conform back to data frames
df_X_train_res = pd.DataFrame(np.matrix(X_res_train))
df_y_train_res = pd.DataFrame(np.matrix(y_res_train))

# rename columns
df_X_train_res.columns = X_train.columns
df_y_train_res = df_y_train_res.T
df_y_train_res.columns = ['default']

# resample test data
X_res_test, y_res_test = tl.fit_sample(X_test.as_matrix(),y_test.as_matrix())

# conform back to data frames
df_X_res_test = pd.DataFrame(np.matrix(X_res_test))
df_y_res_test = pd.DataFrame(np.matrix(y_res_test))

# rename columns
df_X_res_test.columns = X_test.columns
df_y_res_test = df_y_res_test.T
df_y_res_test.columns = ['default']




In [98]:
# Instantiate the object
tl = TomekLinks()
# resample the data
X_res, y_res = tl.fit_resample(X.as_matrix(),y.as_matrix())

# conform back to data frames
df_X_res = pd.DataFrame(np.matrix(X_res))
df_y_res = pd.DataFrame(np.matrix(y_res))

# rename the columns
df_X_res.columns = X.columns
df_y_res = df_y_res.T
df_y_res.columns = ['default']

# join them back together
df_res=pd.concat([df_X_res, df_y_res],axis=1)

In [95]:
df_res

Unnamed: 0,default
0,1
1,0
2,0
3,0
4,1
...,...
20404,1
20405,0
20406,0
20407,1


In [146]:
X = df_res.drop(columns=['default'])
y = df_res[['default']]

# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state =55)

# normalize the data
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((15306, 44), (5103, 44), (15306, 1), (5103, 1))

In [120]:
logReg.fit(X_train_scaled, y_train)
y_hat = logReg.predict(X_test_scaled)
print(f1_score(y_test, y_hat))
arr = cross_val_score(logReg, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
delinquency_CV = np.mean(arr)    
print(f"the log scaled cross validation is {delinquency_CV}")

0.5593984962406015


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the log scaled cross validation is 0.5637507768202952


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.6s finished


# Grid search Log Reg

In [121]:
logReg_grid = LogisticRegression(penalty='elasticnet',
                                 solver='saga',
                                 verbose = 1, 
                                 n_jobs=-1,
                                 random_state=42 )
param_grid = {
    'C' : np.linspace(0.01, 1, 20),
    'l1_ratio' : np.linspace(0.01, 1, 20)
}

In [123]:
grid_clv = GridSearchCV(estimator=logReg_grid,
                        param_grid=param_grid,
                        scoring='f1',
                        n_jobs=-1, 
                        verbose=1, cv=10)
grid_clv.fit(X_train_scaled, y_train);

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 19.6min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 26.8min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 34.5min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed: 43.5min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


max_iter reached after 1 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.6s finished


In [126]:
# old params = {'C': 0.21842105263157896, 'l1_ratio': 0.8957894736842106}
grid_clv.best_params_

{'C': 0.6873684210526316, 'l1_ratio': 0.21842105263157896}

In [128]:
best_log_model = grid_clv.best_estimator_
best_log_model.fit(X_train_scaled, y_train)
y_hat = best_log_model.predict(X_test_scaled)
f1_score(y_test, y_hat)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


max_iter reached after 1 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.1s finished


0.49170437405731526

In [147]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.01,
                            solver='saga', 
                            penalty='elasticnet',
                            max_iter=1000, 
                            l1_ratio=0.5
                            )
arr = cross_val_score(logReg, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
np.mean(arr)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.9s finished


0.5698911747001935

In [148]:
logReg.fit(X_train_scaled, y_train)
y_hat = logReg.predict(X_test_scaled)
f1_score(y_test, y_hat)

0.5363565285379204

In [142]:
# Save the logistic regression model
scalar_pickle_out = open("../logReg_tomek.pickle","wb")
pickle.dump(logReg, scalar_pickle_out)
scalar_pickle_out.close()

scalar_pickle_out = open("../tomek_link_logReg.pickle","wb")
pickle.dump(tl, scalar_pickle_out)
scalar_pickle_out.close()

# Grid search KNN

In [173]:
knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=5, n_jobs=-1, p=5)

knn_clf.fit(X_train_scaled, y_train)
y_hat = knn_clf.predict(X_test_scaled)
f1_score(y_test, y_hat)

0.46311858076563966

In [181]:
vote_clf = VotingClassifier(
    estimators= [
        ('lr',logReg),
        ('knn', knn_clf)
    ],
    voting='soft'
    )
vote_clf1 = vote_clf.fit(X_train_scaled, y_train)

In [182]:
y_hat = vote_clf1.predict(X_test_scaled)

In [183]:
f1_score(y_test, y_hat)

0.5171645118145342

# Grid search a Random Forest

In [157]:
knn_clf = KNeighborsClassifier()

kNN_param_grid = {
    'weights': ['uniform', 'distance'],
    'n_neighbors': range(1, 10, 1),
    'n_jobs': [-1],
    'p' : [2,3,4]
}

knn_grid = GridSearchCV(knn_clf,
                        param_grid=kNN_param_grid,
                        scoring='f1',
                        n_jobs=-1,
                        verbose=1,
                        cv=10)

knn_grid.fit(X_train_scaled, y_train.default)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [186]:
df_res

Unnamed: 0,limit_bal,male,age,other,post_grad,university,married,deli,credit_utility,bills,...,pay_5_-1,pay_5_0,pay_5_5,pay_6_-2,pay_6_-1,pay_6_2,pay_6_7,limit_bal.1,age.1,default
0,220000.0,0.0,36.0,0.0,1.0,0.0,0.0,0.0,4.824600,1250323.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12.301383,3.583519,1
1,200000.0,0.0,29.0,0.0,0.0,0.0,0.0,-6.0,0.000000,1956.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,12.206073,3.367296,0
2,180000.0,0.0,27.0,0.0,1.0,0.0,0.0,-12.0,0.000000,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,12.100712,3.295837,0
3,80000.0,1.0,32.0,0.0,0.0,1.0,0.0,0.0,3.373638,279502.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,11.289782,3.465736,0
4,10000.0,1.0,27.0,0.0,0.0,1.0,0.0,0.0,2.661000,31910.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,9.210340,3.295837,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20404,40000.0,0.0,38.0,0.0,0.0,1.0,1.0,9.0,5.599450,234247.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.596635,3.637586,1
20405,350000.0,1.0,42.0,0.0,1.0,0.0,1.0,-6.0,-0.019574,24410.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,12.765688,3.737670,0
20406,100000.0,0.0,46.0,0.0,0.0,0.0,0.0,3.0,0.058820,24806.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,11.512925,3.828641,0
20407,20000.0,0.0,50.0,0.0,0.0,0.0,1.0,-8.0,0.227600,15502.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,9.903488,3.912023,1
