In [66]:
import pandas as pd
import numpy as np
import patsy
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# statsmodels issue: https://github.com/statsmodels/statsmodels/issues/3931
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

from ISLP import load_data

In [171]:
Default = load_data("Default")

In [172]:
Default['default'] = (Default['default'] == "Yes").astype(int)
Default

Unnamed: 0,default,student,balance,income
0,0,No,729.526495,44361.625074
1,0,Yes,817.180407,12106.134700
2,0,No,1073.549164,31767.138947
3,0,No,529.250605,35704.493935
4,0,No,785.655883,38463.495879
...,...,...,...,...
9995,0,No,711.555020,52992.378914
9996,0,No,757.962918,19660.721768
9997,0,No,845.411989,58636.156984
9998,0,No,1569.009053,36669.112365


In [124]:
# Part A, fitting a logistic regression
lr = LogisticRegression(C=10**5) # High C removes the regularization
X = Default[['income', 'balance']]
y = Default['default']
mod = lr.fit(X, y)
mod.coef_

array([[2.08089921e-05, 5.64710291e-03]])

In [125]:
# Part B (i) 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state=0)

In [126]:
# Part B (ii)
mod = lr.fit(X_train, y_train)
mod.coef_

array([[1.57956935e-05, 5.56457807e-03]])

In [127]:
# Part B (iii)
# Computing the posterior for each entry in the test set
pred = mod.predict_proba(X_test)
lr_labels = np.where(pred[:,1] > 0.5, 1, 0)


In [128]:
# Part B (iv)
incorr_pred = (lr_labels != y_test)
val_set_error = incorr_pred.sum() / len(y_test)
val_set_error

0.029

In [153]:
# Writing a function to do part B
def cross_val(test_size, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = test_size, random_state=0)
    mod = lr.fit(X_train, y_train)
    pred = mod.predict_proba(X_test)
    lr_labels = np.where(pred[:,1] > 0.5, 1, 0)
    val_set_error = (lr_labels != y_test).mean()
    return val_set_error

In [154]:
print(cross_val(0.5, X, y))

0.029


In [155]:
print(cross_val(0.2, X, y))

0.037


In [132]:
print(cross_val(0.1,X,y))

0.03


In [133]:
print(cross_val(0.6,X,y))

0.03816666666666667


In [156]:
print(cross_val(0.8,X,y))

0.027375


In [150]:
# Part D, fitting a new model using student predictor
Default['student'] = (Default['student'] == "Yes").astype(int)
Default

Unnamed: 0,default,student,balance,income
0,No,0,729.526495,44361.625074
1,No,1,817.180407,12106.134700
2,No,0,1073.549164,31767.138947
3,No,0,529.250605,35704.493935
4,No,0,785.655883,38463.495879
...,...,...,...,...
9995,No,0,711.555020,52992.378914
9996,No,0,757.962918,19660.721768
9997,No,0,845.411989,58636.156984
9998,No,0,1569.009053,36669.112365


In [157]:
X_stud = Default[['income','balance','student']]
print(cross_val(0.5, X_stud, y))

0.0356


In [158]:
print(cross_val(0.6, X_stud,y))

0.03816666666666667


In [159]:
print(cross_val(0.8, X_stud, y))

0.034375


In [None]:
# Clearly, using the student predictor did not help the test error rate

In [174]:
# Problem 6
# Part A
# Getting SE for coefficients

design = MS(['income','balance'])
X = design.fit_transform(Default)
y = Default['default']
glm = sm.GLM(y, X, family=sm.families.Binomial())
results = glm.fit()
summarize(results)

Unnamed: 0,coef,std err,z,P>|z|
intercept,-11.5405,0.435,-26.544,0.0
income,2.1e-05,5e-06,4.174,0.0
balance,0.0056,0.0,24.835,0.0


In [180]:
# Part B
def boot_fn(Default):
    mod1 = smf.glm('default ~ income + balance',data=Default, family=sm.families.Binomial()).fit()
    return mod1.params

In [181]:
boot_fn(Default)

Intercept   -11.540468
income        0.000021
balance       0.005647
dtype: float64

In [194]:
#bootstrap function
def boot(X, bootSample_size=None):
    #create random integers to use as indices for bootstrap sample based on original data
    bootSample_i = (np.random.rand(bootSample_size)*len(X)).astype(int)
    bootSample_i = np.array(bootSample_i)
    bootSample_X = X.iloc[bootSample_i]

    return bootSample_X

In [195]:

#running model for bootstrapped samples
coefficients = [] #variable initialization
n = 100 #number of bootstrapped samples

for i in range(0,n):
    coef_i = boot_fn(boot(Default)) #determining coefficients for specific bootstrapped sample
    coefficients.append(coef_i) #saving coefficients value

print(pd.DataFrame(coefficients).mean()) #print average of coefficients


TypeError: 'NoneType' object cannot be interpreted as an integer