In [118]:
# import libraries 

# utility/data wrangling
import pandas as pd
import numpy as np
from warnings import filterwarnings
import pickle 


# chart creations
import matplotlib.pyplot as plt
import seaborn as sns

# pre processing
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import TomekLinks # down sampling
from imblearn.over_sampling import SMOTE # up sampling


# model selection
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


# Model Validation 
from sklearn.metrics import f1_score, accuracy_score, recall_score, roc_auc_score, roc_curve

# statistical testing
from scipy.stats import f_oneway
from scipy import stats

# # py file
# import src

filterwarnings('ignore')
%matplotlib inline

In [119]:
# import model
df_train = pd.read_csv('./data/forest_model_best.csv', index_col=0)

In [120]:
df_train.shape

(22499, 46)

# The goal is to get the holdout to this shape

In [149]:
holdout = pd.read_csv('./data/holdout_data.csv')

In [150]:
holdout.shape

(7501, 24)

In [151]:
holdout.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7501 entries, 0 to 7500
Data columns (total 24 columns):
Unnamed: 0    7501 non-null int64
X1            7501 non-null int64
X2            7501 non-null int64
X3            7501 non-null int64
X4            7501 non-null int64
X5            7501 non-null int64
X6            7501 non-null int64
X7            7501 non-null int64
X8            7501 non-null int64
X9            7501 non-null int64
X10           7501 non-null int64
X11           7501 non-null int64
X12           7501 non-null int64
X13           7501 non-null int64
X14           7501 non-null int64
X15           7501 non-null int64
X16           7501 non-null int64
X17           7501 non-null int64
X18           7501 non-null int64
X19           7501 non-null int64
X20           7501 non-null int64
X21           7501 non-null int64
X22           7501 non-null int64
X23           7501 non-null int64
dtypes: int64(24)
memory usage: 1.4 MB


In [152]:
holdout.X2.value_counts()

2    4540
1    2961
Name: X2, dtype: int64

# Rename the column heads and drop the useless column

In [153]:
column_names = [
    'id',
    'limit_bal',
    'sex',
    'education',
    'marriage',
    'age',
    'pay_0',
    'pay_2',
    'pay_3',
    'pay_4',
    'pay_5',
    'pay_6',
    'bill_amt1',
    'bill_amt2',
    'bill_amt3',
    'bill_amt4',
    'bill_amt5',
    'bill_amt6',
    'pay_amt1',
    'pay_amt2',
    'pay_amt3',
    'pay_amt4',
    'pay_amt5',
    'pay_amt6',
]
holdout.columns = column_names

# First thing is to dummy the marriages and education columns

In [154]:
def marriage_categorization(n):
    if n == 1:
        return 'married'
    else: 
        return 'not_married'

mar_cat = holdout.marriage.apply(marriage_categorization)
mar_cat_dummies = pd.get_dummies(mar_cat)
holdout_1= holdout.drop(columns=['marriage'])
holdout_1 = pd.concat([holdout_1, mar_cat_dummies], axis=1)
holdout_1.drop(columns=['not_married'], inplace=True)
holdout_1.head()

Unnamed: 0,id,limit_bal,sex,education,age,pay_0,pay_2,pay_3,pay_4,pay_5,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,married
0,5501,180000,2,2,44,0,0,0,0,0,...,174764,162667,166953,10000,8000,7000,6000,7000,10000,1
1,28857,130000,2,2,48,-2,-2,-2,-2,-2,...,1279,749,440,1240,1487,1279,749,440,849,1
2,11272,60000,2,1,43,-1,3,2,0,0,...,330,165,340,0,330,0,0,340,0,1
3,8206,240000,1,1,42,0,0,0,0,0,...,51508,51127,0,20000,2213,1030,1023,6790,10893,1
4,6362,100000,2,2,28,2,0,0,0,0,...,63924,57326,59654,3500,3003,1910,2400,3300,0,1


In [159]:
def education_categories(n):
    if n > 3 or n == 0:
        return 'other'
    elif n == 3:
        return 'high_school'
    elif n == 2:
        return 'university'
    elif n == 1:
        return 'post_grad'

    
edu_cat = holdout_1.education.apply(education_categories)
edu_cat_dummies = pd.get_dummies(edu_cat, drop_first=True)

holdout_2 = holdout_1.drop(columns = 'education')
holdout_2 =pd.concat([holdout_2, edu_cat_dummies], axis=1)

In [160]:
holdout_2

Unnamed: 0,id,limit_bal,sex,age,pay_0,pay_2,pay_3,pay_4,pay_5,pay_6,...,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,married,other,post_grad,university
0,5501,180000,2,44,0,0,0,0,0,0,...,10000,8000,7000,6000,7000,10000,1,0,0,1
1,28857,130000,2,48,-2,-2,-2,-2,-2,-2,...,1240,1487,1279,749,440,849,1,0,0,1
2,11272,60000,2,43,-1,3,2,0,0,-1,...,0,330,0,0,340,0,1,0,1,0
3,8206,240000,1,42,0,0,0,0,0,0,...,20000,2213,1030,1023,6790,10893,1,0,1,0
4,6362,100000,2,28,2,0,0,0,0,2,...,3500,3003,1910,2400,3300,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,14600,90000,2,34,-2,-2,-2,-2,-2,-2,...,1924,11855,10655,0,665,0,1,0,0,1
7497,12687,180000,2,28,0,0,0,0,0,0,...,6500,5000,5000,5000,5000,5000,0,0,0,1
7498,7374,360000,1,37,1,-2,-2,-2,-2,-2,...,0,0,0,0,0,0,1,0,0,1
7499,27661,50000,2,23,-1,0,0,2,0,0,...,1502,2651,500,500,500,500,0,0,0,1


# Dummy pay columns

In [161]:
pay = [
 'pay_0',
 'pay_2',
 'pay_3',
 'pay_4',
 'pay_5',
 'pay_6',
]

pay_list = [pd.get_dummies(holdout_2[column], prefix=(column)) for column in pay]
pay_list_dummies = pd.concat(pay_list, axis=1)
holdout_3 = pd.concat([holdout_2, pay_list_dummies], axis=1)
holdout_3.drop(columns=pay, inplace=True)
holdout_3.head()

Unnamed: 0,id,limit_bal,sex,age,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,...,pay_5_7,pay_6_-2,pay_6_-1,pay_6_0,pay_6_2,pay_6_3,pay_6_4,pay_6_5,pay_6_6,pay_6_7
0,5501,180000,2,44,161186,167080,170788,174764,162667,166953,...,0,0,0,1,0,0,0,0,0,0
1,28857,130000,2,48,0,1240,1487,1279,749,440,...,0,1,0,0,0,0,0,0,0,0
2,11272,60000,2,43,495,330,495,330,165,340,...,0,0,1,0,0,0,0,0,0,0
3,8206,240000,1,42,72339,91045,91027,51508,51127,0,...,0,0,0,1,0,0,0,0,0,0
4,6362,100000,2,28,73073,74739,70844,63924,57326,59654,...,0,0,0,0,1,0,0,0,0,0


In [162]:
holdout_3.columns.to_list()

['id',
 'limit_bal',
 'sex',
 'age',
 'bill_amt1',
 'bill_amt2',
 'bill_amt3',
 'bill_amt4',
 'bill_amt5',
 'bill_amt6',
 'pay_amt1',
 'pay_amt2',
 'pay_amt3',
 'pay_amt4',
 'pay_amt5',
 'pay_amt6',
 'married',
 'other',
 'post_grad',
 'university',
 'pay_0_-2',
 'pay_0_-1',
 'pay_0_0',
 'pay_0_1',
 'pay_0_2',
 'pay_0_3',
 'pay_0_4',
 'pay_0_5',
 'pay_0_6',
 'pay_0_7',
 'pay_0_8',
 'pay_2_-2',
 'pay_2_-1',
 'pay_2_0',
 'pay_2_1',
 'pay_2_2',
 'pay_2_3',
 'pay_2_4',
 'pay_2_5',
 'pay_2_6',
 'pay_2_7',
 'pay_3_-2',
 'pay_3_-1',
 'pay_3_0',
 'pay_3_2',
 'pay_3_3',
 'pay_3_4',
 'pay_3_5',
 'pay_3_6',
 'pay_3_7',
 'pay_3_8',
 'pay_4_-2',
 'pay_4_-1',
 'pay_4_0',
 'pay_4_2',
 'pay_4_3',
 'pay_4_4',
 'pay_4_5',
 'pay_4_7',
 'pay_5_-2',
 'pay_5_-1',
 'pay_5_0',
 'pay_5_2',
 'pay_5_3',
 'pay_5_4',
 'pay_5_5',
 'pay_5_6',
 'pay_5_7',
 'pay_6_-2',
 'pay_6_-1',
 'pay_6_0',
 'pay_6_2',
 'pay_6_3',
 'pay_6_4',
 'pay_6_5',
 'pay_6_6',
 'pay_6_7']

In [131]:
df_train.columns.to_list()

['limit_bal',
 'male',
 'age',
 'default',
 'other',
 'post_grad',
 'university',
 'married',
 'pay_0_-1',
 'pay_0_0',
 'pay_0_1',
 'pay_0_2',
 'pay_0_3',
 'pay_0_8',
 'pay_2_-2',
 'pay_2_0',
 'pay_2_1',
 'pay_2_2',
 'pay_2_3',
 'pay_2_7',
 'pay_3_-2',
 'pay_3_-1',
 'pay_3_0',
 'pay_3_1',
 'pay_3_5',
 'pay_3_7',
 'pay_4_-2',
 'pay_4_-1',
 'pay_4_0',
 'pay_4_1',
 'pay_4_5',
 'pay_5_-1',
 'pay_5_0',
 'pay_5_5',
 'pay_6_-2',
 'pay_6_-1',
 'pay_6_2',
 'pay_6_7',
 'pay_6_8',
 'deli',
 'credit_utility',
 'bills',
 'payment',
 'pay_outlier',
 'bill_outlier',
 'credit_outlier']

In [132]:
df_train_dummies = ['pay_0_-1',
 'pay_0_0',
 'pay_0_1',
 'pay_0_2',
 'pay_0_3',
 'pay_0_8',
 'pay_2_-2',
 'pay_2_0',
 'pay_2_1',
 'pay_2_2',
 'pay_2_3',
 'pay_2_7',
 'pay_3_-2',
 'pay_3_-1',
 'pay_3_0',
 'pay_3_1',
 'pay_3_5',
 'pay_3_7',
 'pay_4_-2',
 'pay_4_-1',
 'pay_4_0',
 'pay_4_1',
 'pay_4_5',
 'pay_5_-1',
 'pay_5_0',
 'pay_5_5',
 'pay_6_-2',
 'pay_6_-1',
 'pay_6_2',
 'pay_6_7',
 'pay_6_8',]

over_lap = [x for x in df_train_dummies if x in pay_list_dummies]

In [133]:
len(over_lap) , len(df_train_dummies)

(28, 31)

In [134]:
over_lap

['pay_0_-1',
 'pay_0_0',
 'pay_0_1',
 'pay_0_2',
 'pay_0_3',
 'pay_0_8',
 'pay_2_-2',
 'pay_2_0',
 'pay_2_1',
 'pay_2_2',
 'pay_2_3',
 'pay_2_7',
 'pay_3_-2',
 'pay_3_-1',
 'pay_3_0',
 'pay_3_5',
 'pay_3_7',
 'pay_4_-2',
 'pay_4_-1',
 'pay_4_0',
 'pay_4_5',
 'pay_5_-1',
 'pay_5_0',
 'pay_5_5',
 'pay_6_-2',
 'pay_6_-1',
 'pay_6_2',
 'pay_6_7']

In [135]:
df_train_pay_overlap = df_train[over_lap]

In [136]:
df_train = df_train.drop(columns=df_train_dummies)

In [137]:
df_train = pd.concat([df_train, df_train_pay_overlap],  axis =1)

In [138]:
df_train

Unnamed: 0,limit_bal,male,age,default,other,post_grad,university,married,deli,credit_utility,...,pay_4_-1,pay_4_0,pay_4_5,pay_5_-1,pay_5_0,pay_5_5,pay_6_-2,pay_6_-1,pay_6_2,pay_6_7
0,220000,0,36,1,0,1,0,0,0,4.824600,...,0,1,0,0,1,0,0,0,0,0
1,200000,0,29,0,0,0,0,0,-6,0.000000,...,1,0,0,1,0,0,0,1,0,0
2,180000,0,27,0,0,1,0,0,-12,0.000000,...,0,0,0,0,0,0,1,0,0,0
3,80000,1,32,0,0,0,1,0,0,3.373638,...,0,1,0,0,1,0,0,0,0,0
4,10000,1,27,1,0,0,1,0,0,2.661000,...,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22494,40000,0,38,1,0,0,1,1,9,5.599450,...,0,0,0,0,0,0,0,0,1,0
22495,350000,1,42,0,0,1,0,1,-6,-0.019574,...,1,0,0,1,0,0,0,1,0,0
22496,100000,0,46,0,0,0,0,0,3,0.058820,...,0,0,0,1,0,0,0,0,0,0
22497,20000,0,50,1,0,0,0,1,-8,0.227600,...,1,0,0,0,0,0,1,0,0,0


# Since we have lost 3 columns from our model. We need to refit our grid search for random forest

{'criterion': 'entropy',
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 2}

In [66]:
forest_grid_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2,7),
    'min_samples_leaf': range(2,5,1),
    'max_features' : [5,'sqrt', 8,9,10]
}

In [73]:
forest = RandomForestClassifier(n_estimators = 100, class_weight='balanced', random_state=42, oob_score=True)

In [70]:
X = df_train.drop(columns=['default'])
y = df_train.default

# Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size = 0.1,
                                                    random_state = 42)

# normalize the data
scalar = StandardScaler()

X_train_scaled = pd.DataFrame(scalar.fit_transform(X_train))
X_test_scaled = pd.DataFrame(scalar.transform(X_test))

X_trial = pd.concat([X_train_scaled, X_test_scaled])
y_trial = pd.concat([y_train, y_test])

logReg = LogisticRegression(class_weight = 'balanced',
                            C=1, solver='saga',
                            penalty='elasticnet',
                            max_iter=100,l1_ratio=0.5)

logReg.fit(X_trial,y_trial)
arr = cross_val_score(logReg, X_trial, y_trial, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
df_11_CV = np.mean(arr)    
print(f"the df_11 cross validation is {df_11_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the df_11 cross validation is 0.5375042400546681


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.7s finished


In [80]:
forest = RandomForestClassifier(criterion = 'entropy', max_depth = 4, max_features = 'sqrt', min_samples_leaf= 2)
arr = cross_val_score(forest, X_trial, y_trial, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
forest_1_CV = np.mean(arr)    
print(f"the random forest cross validation is {forest_1_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the random forest cross validation is 0.411933505620674


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.1s finished


In [75]:
forest_grid_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(2,7),
    'min_samples_leaf': range(2,5,1),
    'max_features' : [5,'sqrt', 8,9,10]
}

In [77]:
forest_clf = GridSearchCV(forest, forest_grid_params, n_jobs=-1, cv=10, verbose = 1 )
forest_clf.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 150 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   27.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 19.3min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 24.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished


GridSearchCV(cv=10,
             estimator=RandomForestClassifier(class_weight='balanced',
                                              max_depth=3, min_samples_leaf=3,
                                              verbose=1),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 7),
                         'max_features': [5, 'sqrt', 8, 9, 10],
                         'min_samples_leaf': range(2, 5)},
             verbose=1)

In [86]:
best_model = forest_clf.best_estimator_
best_model.fit(X_train_scaled, y_train)
y_hat = best_model.predict(X_train)
f1_score(y_train, y_hat)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


0.4803076515051054

In [221]:
arr = cross_val_score(best_model, X_train_scaled, y_train, cv=10, scoring='f1', verbose = 1, n_jobs=-1)
forest_1_CV = np.mean(arr)    
print(f"the random forest cross validation is {forest_1_CV}")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


the random forest cross validation is 0.5282443797584213


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.2s finished


In [88]:
arr

array([0.53507014, 0.51844844, 0.54882812, 0.55533981, 0.50671441,
       0.51668255, 0.53839205, 0.51650485, 0.53463415, 0.53445065])

In [90]:
pickle_out = open("./forest_model.pickle","wb")
pickle.dump(forest_clf.best_estimator_, pickle_out)
pickle_out.close()

# Ok, back to making the columns for our holdout set

In [164]:
holdout_3

Unnamed: 0,id,limit_bal,sex,age,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,...,pay_5_7,pay_6_-2,pay_6_-1,pay_6_0,pay_6_2,pay_6_3,pay_6_4,pay_6_5,pay_6_6,pay_6_7
0,5501,180000,2,44,161186,167080,170788,174764,162667,166953,...,0,0,0,1,0,0,0,0,0,0
1,28857,130000,2,48,0,1240,1487,1279,749,440,...,0,1,0,0,0,0,0,0,0,0
2,11272,60000,2,43,495,330,495,330,165,340,...,0,0,1,0,0,0,0,0,0,0
3,8206,240000,1,42,72339,91045,91027,51508,51127,0,...,0,0,0,1,0,0,0,0,0,0
4,6362,100000,2,28,73073,74739,70844,63924,57326,59654,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,14600,90000,2,34,1905,1924,11855,665,0,665,...,0,1,0,0,0,0,0,0,0,0
7497,12687,180000,2,28,103203,108032,109741,112907,115924,118832,...,0,0,0,1,0,0,0,0,0,0
7498,7374,360000,1,37,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7499,27661,50000,2,23,9101,10446,12595,11449,9914,9875,...,0,0,0,1,0,0,0,0,0,0


# Bill and payment column on the holdout

In [168]:
bills = [
    'bill_amt1',
    'bill_amt2',
    'bill_amt3',
    'bill_amt4',
    'bill_amt5',
    'bill_amt6'
]

payments = [
     'pay_amt1',
     'pay_amt2',
     'pay_amt3',
     'pay_amt4',
     'pay_amt5',
     'pay_amt6',
]

billing = holdout_3[bills].sum(axis=1)
paying = holdout_3[payments].sum(axis=1)
account_balance = billing - paying
credit_utility = account_balance/holdout_3.limit_bal

holdout_4 = holdout_3.copy()

holdout_4['credit_utility'] = credit_utility
holdout_4['bills'] = billing
holdout_4['payment'] = paying
holdout_4.drop(columns=payments+bills, inplace=True)
holdout_4

Unnamed: 0,id,limit_bal,sex,age,married,other,post_grad,university,pay_0_-2,pay_0_-1,...,pay_6_0,pay_6_2,pay_6_3,pay_6_4,pay_6_5,pay_6_6,pay_6_7,credit_utility,bills,payment
0,5501,180000,2,44,1,0,0,1,0,0,...,1,0,0,0,0,0,0,5.307989,1003438,48000
1,28857,130000,2,48,1,0,0,1,1,0,...,0,0,0,0,0,0,0,-0.006531,5195,6044
2,11272,60000,2,43,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0.024750,2155,670
3,8206,240000,1,42,1,0,1,0,0,0,...,1,0,0,0,0,0,0,1.312904,357046,41949
4,6362,100000,2,28,1,0,0,1,0,0,...,0,1,0,0,0,0,0,3.854470,399560,14113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7496,14600,90000,2,34,1,0,0,1,1,0,...,0,0,0,0,0,0,0,-0.089833,17014,25099
7497,12687,180000,2,28,0,0,0,1,0,0,...,1,0,0,0,0,0,0,3.539661,668639,31500
7498,7374,360000,1,37,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0.000000,0,0
7499,27661,50000,2,23,0,0,0,1,0,1,...,1,0,0,0,0,0,0,1.144540,63380,6153


In [170]:
credit_u = holdout_4.credit_utility
payment = holdout_4.payment
bill = holdout_4.bills

arr_payment = [1 if x > 5 or x < -5 else 0 for x in payment]
arr_credit_u = [1 if x > 5 or x < -5 else 0 for x in credit_u]
arr_bill = [1 if x > 5 or x < -5 else 0 for x in bill]

# print(len(arr_payment), len(arr_credit_u), len(arr_bill))
# outliers=[]
# for i in range(len(df_10)):
#     outliers.append(arr_payment[i] + arr_credit_u[i] + arr_bill[i])

# outliers = [1 if x > 0  else 0 for x in outliers]
# len(outliers)  - sum(outliers)
holdout_5 = holdout_4.copy()
holdout_5['pay_outlier'] = arr_payment
holdout_5['bill_outlier']= arr_bill
holdout_5['credit_outlier'] = arr_credit_u

# Need to dummy the male column and make a delinquency column

In [181]:
# dummy the males
def male_dummy(n):
    if n == 2:
        return 0
    else:
        return n
holdout_6 = holdout_5.copy()
holdout_6.sex = holdout_5.sex.apply(male_dummy)

# holdout_6.columns = holdout_5.columns[]
new_columns = list(holdout_5.columns)
new_columns[2] = 'male'
new_columns[2]

holdout_6.columns = new_columns

# Make a delinquency column
holdout



In [186]:
delinquency = holdout[pay].sum(axis=1)

In [190]:
holdout_7 = holdout_6.copy()
holdout_7['deli'] = delinquency

In [193]:
holdout_final = holdout_7[X.columns]

NameError: name 'holdout_final' is not defined

In [196]:
holdout_final_scaled = scalar.transform(holdout_final)

In [209]:
predictions = pd.DataFrame(best_model.predict(holdout_final_scaled), columns=['y_hat'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


In [211]:
predictions.y_hat.value_counts()

0    5697
1    1804
Name: y_hat, dtype: int64

In [212]:
5697/(5697+1804)

0.7594987335021997

In [213]:
df_train.default.value_counts()

0    17471
1     5028
Name: default, dtype: int64

In [214]:
17471/(17471+5028)

0.7765234010400462

In [218]:
predictions.to_csv('./data/tim_hintz.csv')

In [219]:
predictions

Unnamed: 0,y_hat
0,0
1,0
2,1
3,0
4,1
...,...
7496,0
7497,0
7498,0
7499,1


In [217]:
df_train

Unnamed: 0,limit_bal,male,age,default,other,post_grad,university,married,deli,credit_utility,...,pay_4_-1,pay_4_0,pay_4_5,pay_5_-1,pay_5_0,pay_5_5,pay_6_-2,pay_6_-1,pay_6_2,pay_6_7
0,220000,0,36,1,0,1,0,0,0,4.824600,...,0,1,0,0,1,0,0,0,0,0
1,200000,0,29,0,0,0,0,0,-6,0.000000,...,1,0,0,1,0,0,0,1,0,0
2,180000,0,27,0,0,1,0,0,-12,0.000000,...,0,0,0,0,0,0,1,0,0,0
3,80000,1,32,0,0,0,1,0,0,3.373638,...,0,1,0,0,1,0,0,0,0,0
4,10000,1,27,1,0,0,1,0,0,2.661000,...,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22494,40000,0,38,1,0,0,1,1,9,5.599450,...,0,0,0,0,0,0,0,0,1,0
22495,350000,1,42,0,0,1,0,1,-6,-0.019574,...,1,0,0,1,0,0,0,1,0,0
22496,100000,0,46,0,0,0,0,0,3,0.058820,...,0,0,0,1,0,0,0,0,0,0
22497,20000,0,50,1,0,0,0,1,-8,0.227600,...,1,0,0,0,0,0,1,0,0,0


In [225]:
df_train.to_csv('./data/holdout_compatible_train.csv', index=False)
pd.DataFrame(holdout_final_scaled, columns = holdout_final.columns).to_csv('./data/holdout_final.csv', index=False)

