In [40]:
# import libraries 

# utility/data wrangling
import pandas as pd
import numpy as np
from warnings import filterwarnings
import pickle 


# chart creations
import matplotlib.pyplot as plt
import seaborn as sns

# pre processing
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import TomekLinks # down sampling
from imblearn.over_sampling import SMOTE # up sampling


# model selection
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier


# Model Validation 
from sklearn.metrics import f1_score, accuracy_score, recall_score, roc_auc_score, roc_curve

# statistical testing
from scipy.stats import f_oneway
from scipy import stats

# # py file
# import src

filterwarnings('ignore')
%matplotlib inline

In [10]:
# import data
df_train = pd.read_csv('../data/holdout_compatible_train.csv')
df_holdout = pd.read_csv('../data/holdout_final.csv')
df_holdout.head()

Unnamed: 0,limit_bal,male,age,other,post_grad,university,married,deli,credit_utility,bills,...,pay_4_-1,pay_4_0,pay_4_5,pay_5_-1,pay_5_0,pay_5_5,pay_6_-2,pay_6_-1,pay_6_2,pay_6_7
0,0.102188,-0.809241,0.92754,-0.126515,-0.737375,1.068571,1.09753,0.182284,1.610877,1.917156,...,-0.48624,0.909141,-0.033722,-0.475185,0.878918,-0.027227,-0.440771,-0.487792,-0.317849,-0.041611
1,-0.283096,-0.809241,1.360372,-0.126515,-0.737375,1.068571,1.09753,-1.847078,-0.979926,-0.692912,...,-0.48624,-1.099939,-0.033722,-0.475185,-1.137763,-0.027227,2.268752,-0.487792,-0.317849,-0.041611
2,-0.822493,-0.809241,0.819332,-0.126515,1.356162,-0.935829,1.09753,0.689625,-0.964677,-0.700861,...,-0.48624,0.909141,-0.033722,-0.475185,0.878918,-0.027227,-0.440771,2.050055,-0.317849,-0.041611
3,0.564529,1.235726,0.711124,-0.126515,1.356162,-0.935829,1.09753,0.182284,-0.336708,0.227059,...,-0.48624,0.909141,-0.033722,-0.475185,0.878918,-0.027227,-0.440771,-0.487792,-0.317849,-0.041611
4,-0.514266,-0.809241,-0.803787,-0.126515,-0.737375,1.068571,1.09753,0.858738,0.902294,0.338219,...,-0.48624,0.909141,-0.033722,-0.475185,0.878918,-0.027227,-0.440771,-0.487792,3.146153,-0.041611


In [11]:
poly_2 = PolynomialFeatures()

# Log scale continuous features

In [12]:
to_scale = [
    'limit_bal',
    'age',
]

In [13]:
# Train
log_scaled_train = np.log(df_train[to_scale])
# Holdout
log_scaled_holdout = np.log(df_holdout[to_scale])

In [14]:
# rename the columns
log_scaled_train.columns = ['log_'+x for x in log_scaled_train.columns]

# rename columns
log_scaled_holdout.columns = ['log_'+x for x in log_scaled_holdout.columns]

In [15]:
# Include the log scaled features in my train data set
df_log_train = pd.concat([df_train, log_scaled_train], axis=1)

# Include the log sclaed features in my holdout data set
df_log_holdout = pd.concat([df_holdout, log_scaled_holdout], axis=1)

In [16]:
df_log_holdout.columns == df_log_train.drop(columns=['default']).columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True])

# Holdup complete up to here aka no scaling

In [17]:
# split the data

X = df_log_train.drop(columns=['default'])
y = df_log_train[['default']]

# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state = 42)

# normalize the data
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

# same scal

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16874, 44), (5625, 44), (16874, 1), (5625, 1))

In [18]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.01,
                            solver='saga', 
                            penalty='elasticnet',
                            max_iter=100, 
                            l1_ratio=0.5
                            )

logReg.fit(X_train_scaled, y_train)
y_hat = logReg.predict(X_test_scaled)
f1_score(y_test, y_hat)
arr = cross_val_score(logReg, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
delinquency_CV = np.mean(arr)    
print(f"the log scaled cross validation is {delinquency_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the log scaled cross validation is 0.5301275820443356


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   10.2s finished


# Tomek links

## preventing leakage

In [194]:
tl = TomekLinks()

In [23]:
# # Instantiate the object
# tl = TomekLinks()
# # resample the data
# X_res_train, y_res_train = tl.fit_resample(X_train.as_matrix(),y_train.as_matrix())

# # conform back to data frames
# df_X_train_res = pd.DataFrame(np.matrix(X_res_train))
# df_y_train_res = pd.DataFrame(np.matrix(y_res_train))

# # rename columns
# df_X_train_res.columns = X_train.columns
# df_y_train_res = df_y_train_res.T
# df_y_train_res.columns = ['default']

# # Make the res X


# # resample test data
# X_res_test, y_res_test = tl.fit_sample(X_test.as_matrix(),y_test.as_matrix())

# # conform back to data frames
# df_X_res_test = pd.DataFrame(np.matrix(X_res_test))
# df_y_res_test = pd.DataFrame(np.matrix(y_res_test))

# # rename columns
# df_X_res_test.columns = X_test.columns
# df_y_res_test = df_y_res_test.T
# df_y_res_test.columns = ['default']




In [25]:
# logReg = LogisticRegression(class_weight = 'balanced',
#                             C=0.01,
#                             solver='saga', 
#                             penalty='elasticnet',
#                             max_iter=100, 
#                             l1_ratio=0.5
#                             )
# logReg.fit(df_X_train_res, df_y_train_res)

# y_hat = logReg.predict(df_X_res_test)

# print(f1_score(df_y_res_test, y_hat))

# arr = cross_val_score(logReg, df_X_train_res, df_y_train_res, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
# delinquency_CV = np.mean(arr)    
# print(f"the log scaled cross validation is {delinquency_CV}")
# np.std(arr)

# Tomek links standard.

### do you have to upsample or downsample your test?

# Remember this code block. It's important


In [26]:
X = df_log_train.drop(columns=['default'])
y = df_log_train[['default']]

# test trin split
X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state = 42)

#Instantiate the object
tl = TomekLinks()
# resample the data
X_res, y_res = tl.fit_resample(X_train.as_matrix(),y_train.as_matrix())

# conform back to data frames
df_X_res = pd.DataFrame(np.matrix(X_res))
df_y_res = pd.DataFrame(np.matrix(y_res))

# rename the columns
df_X_res.columns = X.columns
df_y_res = df_y_res.T
df_y_res.columns = ['default']

# join them back together
df_res=pd.concat([df_X_res, df_y_res],axis=1)

In [27]:
# normalize the data
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(df_X_res)
X_test_scaled = scalar.transform(X_test)

In [31]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.01,
                            solver='saga', 
                            penalty='elasticnet',
                            max_iter=1000, 
                            l1_ratio=0.3
                            )

# fit to the downsampled, scaled data
logReg.fit(X_train_scaled, df_y_res)

# predict on the none sampled, scaled test data
y_hat = logReg.predict(X_test_scaled)

# f1 score
print(f1_score(y_test, y_hat))

# cross validation
arr = cross_val_score(logReg, X_train_scaled, df_y_res, cv=15, scoring='f1', verbose = 1, n_jobs=-1)
tomek_log_CV = np.mean(arr)    
print(f"log scaled, standard scaled cross validation with tomek links is {tomek_log_CV}")
np.std(arr)
np.mean(arr)

0.5570079883805374


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


log scaled, standard scaled cross validation with tomek links is 0.5546837465842007


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    8.0s finished


0.5546837465842007

# The current best model is log scaled, standard scaled, with tomek links and balanced

# Grid search  Random forest

In [32]:
forest = RandomForestClassifier(random_state=42)
forest_grid_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2,7),
    'min_samples_leaf': range(2,5,1),
    'max_features' : [5,'sqrt', 8,9,10]
}

In [33]:
forest_clf = GridSearchCV(forest, forest_grid_params, n_jobs=-1, cv=10, verbose = 1 )
forest_clf.fit(X_train_scaled, df_y_res)

Fitting 10 folds for each of 150 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 13.1min finished


GridSearchCV(cv=10,
             estimator=RandomForestClassifier(criterion='entropy', max_depth=4,
                                              max_features='sqrt',
                                              min_samples_leaf=2),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 7),
                         'max_features': [5, 'sqrt', 8, 9, 10],
                         'min_samples_leaf': range(2, 5)},
             verbose=1)

In [39]:
forest_clf.best_score_

0.8079501318731112

In [37]:
best_forest= forest_clf.best_estimator_
y_hat=best_forest.predict(X_test_scaled)

# f1 score
print(f1_score(y_test, y_hat))
arr = cross_val_score(best_forest, X_train_scaled, df_y_res, cv=15, scoring='f1', verbose = 1, n_jobs=-1)
np.mean(arr)

0.5047193243914555


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   14.8s finished


0.48514605414392337

### Random Forest performs worse than just logistic regression

# Let's try bagging using Logistic Regression

In [44]:
bagger = BaggingClassifier(base_estimator=logReg,
                           n_estimators = 30,
                           verbose=1, 
                           max_samples=0.66,
                           max_feature = 0.66)
bagger.fit(X_train_scaled, df_y_res)
y_hat = bagger.predict(X_test_scaled)
print(f1_score(y_test, y_hat))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.5595325054784513


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [45]:
arr = cross_val_score(bagger, X_train_scaled, df_y_res,  cv=15, scoring='f1', verbose = 1, n_jobs=-1 )

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 13.3min finished


In [46]:
np.mean(arr)

0.5553649296974794

In [None]:

bagger = BaggingClassifier(base_estimator=logReg,
                           n_estimators = 30,
                           verbose=1, 
                           max_samples=0.9,
                           max_features = 0.8, oob_score=True)
bagger.fit(X_train_scaled, df_y_res)
y_hat_1 = bagger.predict(X_test_scaled)
print(f1_score(y_test, y_hat_1))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


# Grid search Log Reg

In [121]:
logReg_grid = LogisticRegression(penalty='elasticnet',
                                 solver='saga',
                                 verbose = 1, 
                                 n_jobs=-1,
                                 random_state=42 )
param_grid = {
    'C' : np.linspace(0.01, 1, 20),
    'l1_ratio' : np.linspace(0.01, 1, 20)
}

In [123]:
grid_clv = GridSearchCV(estimator=logReg_grid,
                        param_grid=param_grid,
                        scoring='f1',
                        n_jobs=-1, 
                        verbose=1, cv=10)
grid_clv.fit(X_train_scaled, y_train);

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 19.6min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 26.8min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 34.5min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed: 43.5min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


max_iter reached after 1 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.6s finished


In [126]:
# old params = {'C': 0.21842105263157896, 'l1_ratio': 0.8957894736842106}
grid_clv.best_params_

{'C': 0.6873684210526316, 'l1_ratio': 0.21842105263157896}

In [128]:
best_log_model = grid_clv.best_estimator_
best_log_model.fit(X_train_scaled, y_train)
y_hat = best_log_model.predict(X_test_scaled)
f1_score(y_test, y_hat)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


max_iter reached after 1 seconds


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.1s finished


0.49170437405731526

In [147]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.01,
                            solver='saga', 
                            penalty='elasticnet',
                            max_iter=1000, 
                            l1_ratio=0.5
                            )
arr = cross_val_score(logReg, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
np.mean(arr)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.9s finished


0.5698911747001935

In [148]:
logReg.fit(X_train_scaled, y_train)
y_hat = logReg.predict(X_test_scaled)
f1_score(y_test, y_hat)

0.5363565285379204

In [142]:
# Save the logistic regression model
scalar_pickle_out = open("../logReg_tomek.pickle","wb")
pickle.dump(logReg, scalar_pickle_out)
scalar_pickle_out.close()

scalar_pickle_out = open("../tomek_link_logReg.pickle","wb")
pickle.dump(tl, scalar_pickle_out)
scalar_pickle_out.close()

# Grid search KNN

In [173]:
knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=5, n_jobs=-1, p=5)

knn_clf.fit(X_train_scaled, y_train)
y_hat = knn_clf.predict(X_test_scaled)
f1_score(y_test, y_hat)

0.46311858076563966

In [181]:
vote_clf = VotingClassifier(
    estimators= [
        ('lr',logReg),
        ('knn', knn_clf)
    ],
    voting='soft'
    )
vote_clf1 = vote_clf.fit(X_train_scaled, y_train)

In [182]:
y_hat = vote_clf1.predict(X_test_scaled)

In [183]:
f1_score(y_test, y_hat)

0.5171645118145342

# Grid search a Random Forest

In [3]:
# instantiate a forest object
forest = RandomForestClassifier(criterion = 'entropy', max_depth = 4, max_features = 'sqrt', min_samples_leaf= 2)
# generate cross val predictions
arr = cross_val_score(forest, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)

forest_1_CV = np.mean(arr)    

print(f"the random forest cross validation is {forest_1_CV}")

# creating our parameters to test
param_dict={
    'max_depth': range(1,10,1),
    'min_samples_leaf': range(1,6,1),
    'max_leaf_nodes': [None, 17,20,15],
    'min_samples_split': range(2,10, 1)
}


grid_tree=GridSearchCV(decision_tree,
                       param_grid = param_dict,
                       cv=10, 
                       scoring='f1',
                       verbose=1,
                       n_jobs=-1
                      )

# grid_tree.fit(X_train,y_train);

NameError: name 'X_train_scaled' is not defined

In [157]:
knn_clf = KNeighborsClassifier()

kNN_param_grid = {
    'weights': ['uniform', 'distance'],
    'n_neighbors': range(1, 10, 1),
    'n_jobs': [-1],
    'p' : [2,3,4]
}

knn_grid = GridSearchCV(knn_clf,
                        param_grid=kNN_param_grid,
                        scoring='f1',
                        n_jobs=-1,
                        verbose=1,
                        cv=10)

knn_grid.fit(X_train_scaled, y_train.default)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [186]:
df_res

Unnamed: 0,limit_bal,male,age,other,post_grad,university,married,deli,credit_utility,bills,...,pay_5_-1,pay_5_0,pay_5_5,pay_6_-2,pay_6_-1,pay_6_2,pay_6_7,limit_bal.1,age.1,default
0,220000.0,0.0,36.0,0.0,1.0,0.0,0.0,0.0,4.824600,1250323.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12.301383,3.583519,1
1,200000.0,0.0,29.0,0.0,0.0,0.0,0.0,-6.0,0.000000,1956.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,12.206073,3.367296,0
2,180000.0,0.0,27.0,0.0,1.0,0.0,0.0,-12.0,0.000000,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,12.100712,3.295837,0
3,80000.0,1.0,32.0,0.0,0.0,1.0,0.0,0.0,3.373638,279502.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,11.289782,3.465736,0
4,10000.0,1.0,27.0,0.0,0.0,1.0,0.0,0.0,2.661000,31910.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,9.210340,3.295837,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20404,40000.0,0.0,38.0,0.0,0.0,1.0,1.0,9.0,5.599450,234247.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,10.596635,3.637586,1
20405,350000.0,1.0,42.0,0.0,1.0,0.0,1.0,-6.0,-0.019574,24410.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,12.765688,3.737670,0
20406,100000.0,0.0,46.0,0.0,0.0,0.0,0.0,3.0,0.058820,24806.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,11.512925,3.828641,0
20407,20000.0,0.0,50.0,0.0,0.0,0.0,1.0,-8.0,0.227600,15502.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,9.903488,3.912023,1
