In [2]:
# import libraries 

# utility/data wrangling
import pandas as pd
import numpy as np
from warnings import filterwarnings

# chart creations
import matplotlib.pyplot as plt
import seaborn as sns

# pre processing
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import TomekLinks # down sampling
from imblearn.over_sampling import SMOTE # up sampling


# model selection
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


# Model Validation 
from sklearn.metrics import f1_score, accuracy_score, recall_score, roc_auc_score, roc_curve

# statistical testing
from scipy.stats import f_oneway
from scipy import stats

# # py file
# import src

filterwarnings('ignore')
%matplotlib inline

In [3]:
df = pd.read_csv('./data/training_cleaning.csv')
df.drop(columns=['id'], inplace=True)

# The plan here is to dummy everthing that can be dummied

- pay 
- education
- marriage


In [4]:
df.head()

Unnamed: 0,limit_bal,male,education,marriage,age,pay_0,pay_2,pay_3,pay_4,pay_5,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default
0,220000,0,1,2,36,0,0,0,0,0,...,221193,181859,184605,10000,8018,10121,6006,10987,143779,1
1,200000,0,3,2,29,-1,-1,-1,-1,-1,...,326,326,326,326,326,326,326,326,326,0
2,180000,0,1,2,27,-2,-2,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
3,80000,1,2,2,32,0,0,0,0,0,...,43882,42256,42527,1853,1700,1522,1548,1488,1500,0
4,10000,1,2,2,27,0,0,0,0,0,...,5444,2639,2697,2000,1100,600,300,300,1000,1


# Education

In [5]:
df.education.value_counts()

2    10516
1     7919
3     3713
5      208
4       90
6       42
0       11
Name: education, dtype: int64

In [6]:
def education_categories(n):
    if n > 3 or n == 0:
        return 'other'
    elif n == 3:
        return 'high_school'
    elif n == 2:
        return 'university'
    elif n == 1:
        return 'post_grad'

    
edu_cat = df.education.apply(education_categories)
edu_cat_dummies = pd.get_dummies(edu_cat, drop_first=True)
edu_cat_dummies
df_1 = df.drop(columns = 'education')
df_1 =pd.concat([df_1, edu_cat_dummies], axis=1)

# Marriage

In [7]:
def marriage_categorization(n):
    if n == 1:
        return 'married'
    else: 
        return 'not_married'

mar_cat = df_1.marriage.apply(marriage_categorization)
mar_cat_dummies = pd.get_dummies(mar_cat)
df_2= df_1.drop(columns=['marriage'])
df_2 = pd.concat([df_2, mar_cat_dummies], axis=1)
df_2.drop(columns=['not_married'], inplace=True)
df_2.head()

Unnamed: 0,limit_bal,male,age,pay_0,pay_2,pay_3,pay_4,pay_5,pay_6,bill_amt1,...,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default,other,post_grad,university,married
0,220000,0,36,0,0,0,0,0,0,222598,...,8018,10121,6006,10987,143779,1,0,1,0,0
1,200000,0,29,-1,-1,-1,-1,-1,-1,326,...,326,326,326,326,326,0,0,0,0,0
2,180000,0,27,-2,-2,-2,-2,-2,-2,0,...,0,0,0,0,0,0,0,1,0,0
3,80000,1,32,0,0,0,0,0,0,51372,...,1700,1522,1548,1488,1500,0,0,0,1,0
4,10000,1,27,0,0,0,0,0,0,8257,...,1100,600,300,300,1000,1,0,0,1,0


# pay

In [8]:
pay = [
 'pay_0',
 'pay_2',
 'pay_3',
 'pay_4',
 'pay_5',
 'pay_6',
]

pay_list = [pd.get_dummies(df[column], prefix=(column)) for column in pay]
pay_list_dummies = pd.concat(pay_list, axis=1)
df_3 = pd.concat([df_2, pay_list_dummies], axis=1)
df_3.drop(columns=pay, inplace=True)
df_3.head()

Unnamed: 0,limit_bal,male,age,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,pay_amt1,...,pay_6_-2,pay_6_-1,pay_6_0,pay_6_2,pay_6_3,pay_6_4,pay_6_5,pay_6_6,pay_6_7,pay_6_8
0,220000,0,36,222598,222168,217900,221193,181859,184605,10000,...,0,0,1,0,0,0,0,0,0,0
1,200000,0,29,326,326,326,326,326,326,326,...,0,1,0,0,0,0,0,0,0,0
2,180000,0,27,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,80000,1,32,51372,51872,47593,43882,42256,42527,1853,...,0,0,1,0,0,0,0,0,0,0
4,10000,1,27,8257,7995,4878,5444,2639,2697,2000,...,0,0,1,0,0,0,0,0,0,0


In [10]:
pay_list_dummies.shape

(22499, 64)

# Fit a model

In [8]:
# split the data

X = df_3.drop(columns=['default'])
y = df_3.default



# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state = 42)

# normalize the data
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16874, 83), (5625, 83), (16874,), (5625,))

In [9]:
logReg = LogisticRegression(class_weight = 'balanced')

logReg.fit(X_train_scaled, y_train)
y_hat = logReg.predict(X_test_scaled)
f1_score(y_test, y_hat)
arr = cross_val_score(logReg,X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
baseline_CV = np.mean(arr)    
print(f"the baseline cross validation is {baseline_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the baseline cross validation is 0.5253272780856306


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    7.4s finished


# Add a delinquency column

In [10]:
delinquency = df[pay].sum(axis=1)
df_4 = df_3.copy()
df_4['deli'] = delinquency

In [11]:
# split the data

X = df_4.drop(columns=['default'])
y = df_4.default

# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state = 42)

# normalize the data
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16874, 84), (5625, 84), (16874,), (5625,))

In [12]:
logReg = LogisticRegression(class_weight = 'balanced', C=0.1, solver='saga', penalty='l1', max_iter=100)

logReg.fit(X_train_scaled, y_train)
y_hat = logReg.predict(X_test_scaled)
f1_score(y_test, y_hat)
arr = cross_val_score(logReg, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
delinquency_CV = np.mean(arr)    
print(f"the delinquency cross validation is {delinquency_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the delinquency cross validation is 0.5279737843523791


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   12.7s finished


# Credit utilization score

In [13]:
bills = [
    'bill_amt1',
    'bill_amt2',
    'bill_amt3',
    'bill_amt4',
    'bill_amt5',
    'bill_amt6'
]

payments = [
     'pay_amt1',
     'pay_amt2',
     'pay_amt3',
     'pay_amt4',
     'pay_amt5',
     'pay_amt6',
]

billing = df[bills].sum(axis=1)
paying = df[payments].sum(axis=1)
account_balance = billing - paying
credit_utility = account_balance/df.limit_bal
df_5 = df_4.copy()
df_5['credit_utility'] = credit_utility
df_5['bills'] = billing
df_5['payment'] = paying
df_5.drop(columns=payments+bills, inplace=True)

In [14]:
X = df_5.drop(columns=['default'])
y = df_5.default

# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state = 42)

# normalize the data
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16874, 75), (5625, 75), (16874,), (5625,))

In [15]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.1, solver='saga',
                            penalty='elasticnet',
                            max_iter=100,l1_ratio=1)

logReg.fit(X_train_scaled, y_train)

arr = cross_val_score(logReg, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
credit_CV = np.mean(arr)    
print(f"the credit cross validation is {credit_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the credit cross validation is 0.5273731221460495


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.3s finished


In [16]:
droppable = [x[0] for x in (zip(df_5.columns.to_list(), logReg.coef_[0])) if x[1] == 0.0 ]

In [17]:
df_6 = df_5.drop(columns=droppable)

In [18]:
X = df_6.drop(columns=['default'])
y = df_6.default

# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state = 42)

# normalize the data
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16874, 67), (5625, 67), (16874,), (5625,))

In [19]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.1, solver='saga',
                            penalty='elasticnet',
                            max_iter=100,l1_ratio=1)

logReg.fit(X_train_scaled, y_train)

arr = cross_val_score(logReg, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
regularized_CV = np.mean(arr)    
print(f"the regularized cross validation is {regularized_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the regularized cross validation is 0.5274531207818756


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.7s finished


In [20]:
logReg.coef_

array([[-0.20514925,  0.0726961 ,  0.01025971, -0.08881623, -0.01628793,
         0.0103487 ,  0.08027158, -0.11853044,  0.02264333, -0.29711195,
         0.065846  ,  0.42895222,  0.14325178,  0.04578842,  0.0356991 ,
        -0.02303592,  0.        ,  0.04223354, -0.02618457, -0.00097304,
        -0.05234305,  0.03002817,  0.01501038, -0.0524472 ,  0.01565564,
         0.05179493,  0.00498681,  0.        , -0.04565155, -0.02444971,
        -0.00199874,  0.09853537, -0.01067634, -0.02226729,  0.        ,
         0.        , -0.0262301 , -0.00817023, -0.06957857, -0.11665863,
         0.03221561, -0.00615438,  0.01064837, -0.03527543, -0.01553757,
        -0.03509629,  0.        , -0.04774308, -0.07468749,  0.        ,
        -0.00439452,  0.00194106, -0.00420842,  0.02293234,  0.00980277,
        -0.00434436, -0.16721126,  0.00959031, -0.00974594, -0.00781664,
         0.        ,  0.        ,  0.02529837,  0.1998704 ,  0.15005424,
         0.07279146, -0.18703371]])

In [21]:
droppable_2 = [x[0] for x in (zip(df_5.columns.to_list(), logReg.coef_[0])) if x[1] == 0.0 ]

In [22]:
droppable_2

['pay_0_6',
 'pay_2_6',
 'pay_3_2',
 'pay_3_3',
 'pay_4_3',
 'pay_4_6',
 'pay_5_7',
 'pay_5_8']

In [23]:
droppable_2 = ['pay_0_6',
 'pay_2_6',
 'pay_3_2',
 'pay_4_3',
 'pay_5_7',
 'pay_5_8']
df_7 = df_6.drop(columns=droppable_2)

In [24]:
X = df_7.drop(columns=['default'])
y = df_7.default

# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state = 42)

# normalize the data
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16874, 61), (5625, 61), (16874,), (5625,))

In [25]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.1, solver='saga',
                            penalty='elasticnet',
                            max_iter=100,l1_ratio=1)

logReg.fit(X_train_scaled, y_train)

arr = cross_val_score(logReg, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
regularized_CV = np.mean(arr)    
print(f"the df_7 cross validation is {regularized_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the df_7 cross validation is 0.5283509129201117


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.5s finished


In [26]:
droppable_3 = [x[0] for x in (zip(df_7.columns.to_list(), logReg.coef_[0])) if x[1] == 0.0 ]

In [27]:
df_8 = df_7.drop(columns=droppable_3)

In [28]:
X = df_8.drop(columns=['default'])
y = df_8.default

# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state = 42)

# normalize the data
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16874, 52), (5625, 52), (16874,), (5625,))

In [29]:
df_8.head()

Unnamed: 0,limit_bal,male,age,default,other,post_grad,university,married,pay_0_-2,pay_0_-1,...,pay_6_-1,pay_6_0,pay_6_2,pay_6_4,pay_6_7,pay_6_8,deli,credit_utility,bills,payment
0,220000,0,36,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,4.8246,1250323,188911
1,200000,0,29,0,0,0,0,0,0,1,...,1,0,0,0,0,0,-6,0.0,1956,1956
2,180000,0,27,0,0,1,0,0,1,0,...,0,0,0,0,0,0,-12,0.0,0,0
3,80000,1,32,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,3.373638,279502,9611
4,10000,1,27,1,0,0,1,0,0,0,...,0,1,0,0,0,0,0,2.661,31910,5300


In [30]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.1, solver='saga',
                            penalty='elasticnet',
                            max_iter=100,l1_ratio=1)

logReg.fit(X_train_scaled, y_train)

arr = cross_val_score(logReg, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
df_8_CV = np.mean(arr)    
print(f"the df_7 cross validation is {df_8_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the df_7 cross validation is 0.529223520871714


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.0s finished


In [31]:
droppable_4 = [x[0] for x in (zip(df_8.columns.to_list(), logReg.coef_[0])) if x[1] == 0.0 ]

In [32]:
droppable_4

['pay_0_4', 'pay_4_7', 'pay_5_3', 'pay_6_0', 'pay_6_4']

In [33]:
df_9 = df_8.drop(columns=droppable_4)

In [34]:
X = df_9.drop(columns=['default'])
y = df_9.default

# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state = 42)

# normalize the data
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16874, 47), (5625, 47), (16874,), (5625,))

In [35]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=0.1, solver='saga',
                            penalty='elasticnet',
                            max_iter=100,l1_ratio=1)

logReg.fit(X_train_scaled, y_train)

arr = cross_val_score(logReg, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
df_8_CV = np.mean(arr)    
print(f"the df_9 cross validation is {df_8_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the df_9 cross validation is 0.5294175044823638


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.2s finished


In [36]:
logReg.coef_

array([[-2.08805110e-01,  7.20741748e-02,  1.02511326e-02,
        -8.84288857e-02, -1.65946066e-02,  9.90119543e-03,
         8.02296042e-02, -1.36857606e-01,  0.00000000e+00,
        -3.19842206e-01,  3.19819693e-02,  4.03172270e-01,
         1.25408003e-01, -4.87410349e-04,  6.34945118e-02,
         1.07591552e-07, -5.10518236e-02,  4.43607134e-02,
         3.57425474e-02,  2.73251905e-03, -5.77914586e-02,
        -1.33398702e-01, -1.40938003e-01, -5.26365169e-03,
        -3.33099581e-02, -6.85932384e-03,  0.00000000e+00,
        -1.29087550e-02,  0.00000000e+00, -5.89987834e-02,
        -1.01807861e-01,  3.24592874e-02,  1.50009810e-02,
        -4.23256810e-02,  0.00000000e+00, -4.14595374e-02,
        -8.13574288e-02,  0.00000000e+00,  1.29219052e-01,
         1.08701022e-01,  9.23382450e-02, -5.82647717e-03,
         1.11948324e-02,  2.54769482e-01,  1.42397335e-01,
         7.81238752e-02, -1.89909593e-01]])

In [37]:
droppable_5 = [x[0] for x in (zip(df_8.columns.to_list(), logReg.coef_[0])) if x[1] == 0.0 ]

In [38]:
droppable_5

['pay_0_-2', 'pay_3_4', 'pay_3_6', 'pay_4_4', 'pay_5_-2']

In [39]:
df_10 = df_9.drop(columns=droppable_5)

In [40]:
X = df_10.drop(columns=['default'])
y = df_10.default

# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state = 42)

# normalize the data
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16874, 42), (5625, 42), (16874,), (5625,))

In [41]:
logReg = LogisticRegression(class_weight = 'balanced',
                            C=1, solver='saga',
                            penalty='elasticnet',
                            max_iter=100,l1_ratio=0.5)

logReg.fit(X_train_scaled, y_train)

arr = cross_val_score(logReg, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
df_10_CV = np.mean(arr)    
print(f"the df_9 cross validation is {df_10_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the df_9 cross validation is 0.5281629075597395


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.2s finished


# I have removed all the features that I can without lowering my f1. 

## Now I will remove outliers. Outlier being 5x std away from the mean

In [42]:
scalar_10 = StandardScaler()
df_10_scaled = scalar_10.fit_transform(df_10)

In [43]:
x = df_10.credit_utility

conditions = [
    -5 < x,
    x < 5,
    x<-5,
    x>5
]

choices = [
    x,
    x,
    np.nan,
    np.nan
]

len([x for x in list(np.select(conditions,choices)) if x ==0])

1063

In [44]:
df_10[df_10.credit_utility > 5].groupby(['default']).male.count()

default
0    1600
1     777
Name: male, dtype: int64

In [45]:
credit_u = df_10.credit_utility
payment = df_10.payment
bill = df_10.bills

arr_payment = [1 if x > 5 or x < -5 else 0 for x in payment]
arr_credit_u = [1 if x > 5 or x < -5 else 0 for x in credit_u]
arr_bill = [1 if x > 5 or x < -5 else 0 for x in bill]

# print(len(arr_payment), len(arr_credit_u), len(arr_bill))
# outliers=[]
# for i in range(len(df_10)):
#     outliers.append(arr_payment[i] + arr_credit_u[i] + arr_bill[i])

# outliers = [1 if x > 0  else 0 for x in outliers]
# len(outliers)  - sum(outliers)
df_11 = df_10.copy()
df_11['pay_outlier'] = arr_payment
df_11['bill_outlier']= arr_bill
df_11['credit_outlier'] = arr_credit_u

In [46]:
df_11.groupby(['credit_outlier']).male.count()

credit_outlier
0    20121
1     2378
Name: male, dtype: int64

In [47]:
X = df_11.drop(columns=['default'])
y = df_11.default

# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state = 42)

# normalize the data
scalar = StandardScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

logReg = LogisticRegression(class_weight = 'balanced',
                            C=1, solver='saga',
                            penalty='elasticnet',
                            max_iter=100,l1_ratio=0.5)

logReg.fit(X_train_scaled, y_train)

arr = cross_val_score(logReg, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
df_11_CV = np.mean(arr)    
print(f"the df_11 cross validation is {df_11_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the df_11 cross validation is 0.5313213653734211


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.8s finished


In [48]:
droppable_6 = [x[0] for x in (zip(df_11.columns.to_list(), logReg.coef_[0])) if x[1] == 0.0 ]
droppable_6

[]

In [49]:
len(logReg.coef_[0])

45

Bill = 0.5311
all_3 outlier = 0.5314

# All three outliers seems to help the model. Let's run it on the whole data set

In [50]:
X = df_11.drop(columns=['default'])
y = df_11.default

# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.25,
                                                    random_state = 42)

# normalize the data
scalar = StandardScaler()

X_train_scaled = pd.DataFrame(scalar.fit_transform(X_train))
X_test_scaled = pd.DataFrame(scalar.transform(X_test))

X_trial = pd.concat([X_train_scaled, X_test_scaled])
y_trial = pd.concat([y_train, y_test])

logReg = LogisticRegression(class_weight = 'balanced',
                            C=1, solver='saga',
                            penalty='elasticnet',
                            max_iter=100,l1_ratio=0.5)

logReg.fit(X_trial,y_trial)
arr = cross_val_score(logReg, X_trial, y_trial, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
df_11_CV = np.mean(arr)    
print(f"the df_11 cross validation is {df_11_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the df_11 cross validation is 0.5387257793592406


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.1s finished


In [51]:
sorted(list(zip(df_11.columns, logReg.coef_[0])), reverse=True, key=lambda x:x[1])

[('pay_0_1', 0.43691749787429013),
 ('pay_6_8', 0.28095489571661586),
 ('deli', 0.1503876000984254),
 ('pay_6_-2', 0.1428049382345453),
 ('pay_0_2', 0.1397165331595493),
 ('pay_5_5', 0.11982455182399866),
 ('credit_utility', 0.1017033150727708),
 ('pay_6_-1', 0.08657350711793174),
 ('pay_0_0', 0.08109651166318059),
 ('university', 0.07604654012378693),
 ('male', 0.07264710265803312),
 ('pay_2_1', 0.06728257352277682),
 ('pay_2_-2', 0.06292320127767516),
 ('married', 0.054909952356374296),
 ('pay_2_2', 0.045517488824417376),
 ('pay_2_3', 0.04070874204273857),
 ('pay_6_7', 0.03798266301989302),
 ('pay_4_0', 0.03523656258272687),
 ('age', 0.02603554176451057),
 ('post_grad', 0.023132039354851103),
 ('pay_0_8', 0.0007971392250206325),
 ('other', -0.00011425250207022126),
 ('pay_5_0', -0.002916000380264796),
 ('pay_3_1', -0.0030918918739384314),
 ('pay_3_5', -0.01797326914591321),
 ('pay_3_0', -0.02739332883515202),
 ('pay_6_2', -0.02837535170040876),
 ('pay_0_3', -0.03116764463280971),
 ('

In [52]:
forest = RandomForestClassifier(max_depth=3, verbose=1, class_weight='balanced', min_samples_leaf=3)
arr = cross_val_score(forest, X_trial, y_trial, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
forest_1_CV = np.mean(arr)    
print(f"the random forest cross validation is {forest_1_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the random forest cross validation is 0.5321758880813094


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.8s finished


# Next step is to fit a KNN

In [53]:
KNN=KNeighborsClassifier( n_jobs=-1)
arr = cross_val_score(KNN, X_trial, y_trial, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
forest_1_CV = np.mean(arr)    
print(f"the random forest cross validation is {forest_1_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the random forest cross validation is 0.45341210526550635


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   32.0s finished


0        0
1        0
2        0
3        0
4        0
        ..
22494    0
22495    0
22496    0
22497    0
22498    0
Name: other, Length: 22499, dtype: uint8

# Let's grid search df_11 in random forest and logistic regression and knn

In [57]:
df_11

Unnamed: 0,limit_bal,male,age,default,other,post_grad,university,married,pay_0_-1,pay_0_0,...,pay_6_2,pay_6_7,pay_6_8,deli,credit_utility,bills,payment,pay_outlier,bill_outlier,credit_outlier
0,220000,0,36,1,0,1,0,0,0,1,...,0,0,0,0,4.824600,1250323,188911,1,1,0
1,200000,0,29,0,0,0,0,0,1,0,...,0,0,0,-6,0.000000,1956,1956,1,1,0
2,180000,0,27,0,0,1,0,0,0,0,...,0,0,0,-12,0.000000,0,0,0,0,0
3,80000,1,32,0,0,0,1,0,0,1,...,0,0,0,0,3.373638,279502,9611,1,1,0
4,10000,1,27,1,0,0,1,0,0,1,...,0,0,0,0,2.661000,31910,5300,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22494,40000,0,38,1,0,0,1,1,0,1,...,1,0,0,9,5.599450,234247,10269,1,1,1
22495,350000,1,42,0,0,1,0,1,1,0,...,0,0,0,-6,-0.019574,24410,31261,1,1,0
22496,100000,0,46,0,0,0,0,0,0,0,...,0,0,0,3,0.058820,24806,18924,1,1,0
22497,20000,0,50,1,0,0,0,1,1,0,...,0,0,0,-8,0.227600,15502,10950,1,1,0


In [59]:
46**0.5

6.782329983125268

In [81]:
forest = RandomForestClassifier(n_estimators = 100, class_weight='balanced', random_state=42)

In [82]:
forest_grid_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2,5),
    'min_samples_leaf': range(2,10,2),
    'max_features' : [5, 'sqrt', 10, 15, 20, 30]
}

In [83]:
forest_clf = GridSearchCV(forest, forest_grid_params, n_jobs=-1, cv=10, verbose = 1 )

In [84]:
forest_clf.fit(X_train, y_train)

Fitting 10 folds for each of 144 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed: 18.0min finished


GridSearchCV(cv=10,
             estimator=RandomForestClassifier(class_weight='balanced',
                                              random_state=42),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 5),
                         'max_features': [5, 'sqrt', 10, 15, 20, 30],
                         'min_samples_leaf': range(2, 10, 2)},
             verbose=1)

In [100]:
forest_clf.best_params_

{'criterion': 'entropy',
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 2}

In [90]:
best_model = forest_clf.best_estimator_

In [91]:
arr = cross_val_score(best_model, X_trial, y_trial, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
forest_2_CV = np.mean(arr)    
forest_2_CV

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    8.5s finished


0.5365410174467551

In [97]:
best_model.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=4, max_features='sqrt', min_samples_leaf=2,
                       random_state=42)

In [98]:
y_hat = best_model.predict(X_test)

In [99]:
f1_score(y_test, y_hat)

0.560966678872208

In [101]:
import pickle


In [102]:
pickle_out = open("./forest_model.pickle","wb")
pickle.dump(forest_clf.best_estimator_, pickle_out)
pickle_out.close()

In [103]:
df_11.to_csv('./data/forest_model_best.csv')

In [104]:
df_11.columns

Index(['limit_bal', 'male', 'age', 'default', 'other', 'post_grad',
       'university', 'married', 'pay_0_-1', 'pay_0_0', 'pay_0_1', 'pay_0_2',
       'pay_0_3', 'pay_0_8', 'pay_2_-2', 'pay_2_0', 'pay_2_1', 'pay_2_2',
       'pay_2_3', 'pay_2_7', 'pay_3_-2', 'pay_3_-1', 'pay_3_0', 'pay_3_1',
       'pay_3_5', 'pay_3_7', 'pay_4_-2', 'pay_4_-1', 'pay_4_0', 'pay_4_1',
       'pay_4_5', 'pay_5_-1', 'pay_5_0', 'pay_5_5', 'pay_6_-2', 'pay_6_-1',
       'pay_6_2', 'pay_6_7', 'pay_6_8', 'deli', 'credit_utility', 'bills',
       'payment', 'pay_outlier', 'bill_outlier', 'credit_outlier'],
      dtype='object')

In [None]:
for i in souplist
    