In [2]:
import numpy as np
import statsmodels.api as sm
import random

from ISLP import load_data
from ISLP.models import (ModelSpec as MS, summarize, poly)
from sklearn.model_selection import train_test_split

from functools import partial
from sklearn.model_selection import (cross_validate, KFold, ShuffleSplit, cross_val_score)
from sklearn.base import clone
from sklearn.base import BaseEstimator, ClassifierMixin
from ISLP.models import sklearn_sm

### Problem 5:

(a) Fit a logistic regression using 'income' and 'balance' to predict 'default'

(b) Use validation set approach for 1 iteration

(c) Repeat 10 times: the average accuracy is 97.47%

(d) Using additional 'student': the average accuracy is 97.42%

In [3]:
Default = load_data('Default')
Default.head()

Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


In [4]:
X = Default[['balance', 'income']]
X = sm.add_constant(X)
y = Default['default'].map({'No': 0, 'Yes': 1})
y.value_counts()

default
0    9667
1     333
Name: count, dtype: int64

In [5]:
model = sm.GLM(y, X, family=sm.families.Binomial())
result = model.fit()
summarize(result)

Unnamed: 0,coef,std err,z,P>|z|
const,-11.5405,0.435,-26.544,0.0
balance,0.0056,0.0,24.835,0.0
income,2.1e-05,5e-06,4.174,0.0


In [6]:
probs = result.predict()
pred = (probs >= 0.5).astype(int)
print(f'Average accuracy of 1 split: {(pred == y).mean()}')

Average accuracy of 1 split: 0.9737


In [7]:
score = []
for rand in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5000, random_state=rand)
    model = sm.GLM(y_train, X_train, family=sm.families.Binomial())
    result = model.fit()

    probs = result.predict(X_test)
    pred = (probs >= 0.5).astype(int)
    score.append((pred == y_test).mean())
print(f'Average accuracy of 10 different splits: {np.mean(score)}')

Average accuracy of 10 different splits: 0.9747


In [8]:
X = Default[['balance', 'income', 'student']]
X = sm.add_constant(X)
X['student'] = X['student'].map({'No': 0, 'Yes': 1})

score = []
for rand in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=5000, random_state=rand)
    model = sm.GLM(y_train, X_train, family=sm.families.Binomial())
    result = model.fit()

    probs = result.predict(X_test)
    pred = (probs >= 0.5).astype(int)
    score.append((pred == y_test).mean())

print(f'Average accuracy of 10 different splits: {np.mean(score)}')


Average accuracy of 10 different splits: 0.9742799999999999


In [9]:
X = Default[['balance', 'income', 'student']]
X = sm.add_constant(X)
X['student'] = X['student'].map({'No': 0, 'Yes': 1})
y = Default['default'].map({'No': 0, 'Yes': 1})

class StatsModelsGLM(BaseEstimator, ClassifierMixin):
    def __init__(self, family=sm.families.Binomial()):
        self.family = family

    def fit(self, X, y):
        self.model = sm.GLM(y, X, family=self.family).fit()
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        probs = self.model.predict(X)
        return (probs >= 0.5).astype(int)

    def predict_proba(self, X):
        probs = self.model.predict(X)
        return np.vstack([1 - probs, probs]).T

glm_clf = StatsModelsGLM()
kf = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(glm_clf, X, y, cv=kf, scoring='accuracy')

print(f'Average accuracy of 10-fold cross-validation: {np.mean(scores)}')

Average accuracy of 10-fold cross-validation: 0.9730000000000001


### Problem 6:

(a) Using summary of GLM, standard error is negligible < (10^-6)

(b) Write boot_fn and boot_se function for bootstrap standard error

(c) Using bootstrap, standard error is 2*(10^-4) and 5*(10^-6) respectively

(d) Standard error by bootstrap is more reliable, as no requirement on model assumptions

In [10]:
X = Default[['balance', 'income']]
X = sm.add_constant(X)
y = Default['default'].map({'No': 0, 'Yes': 1})

model = sm.GLM(y, X, family=sm.families.Binomial())
result = model.fit()
summarize(result)

Unnamed: 0,coef,std err,z,P>|z|
const,-11.5405,0.435,-26.544,0.0
balance,0.0056,0.0,24.835,0.0
income,2.1e-05,5e-06,4.174,0.0


In [14]:
def boot_fn(y, X, idx):
    y_train = y.iloc[idx]
    X_train = X.iloc[idx]

    model = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit()

    return model.params

def boot_se(y, X, n_bootstrap=1000):
    n = len(y)
    boot_estimates = np.zeros((n_bootstrap, X.shape[1]))

    for i in range(n_bootstrap):
        idx = np.random.choice(np.arange(n), size=n, replace=True)
        boot_estimates[i, :] = boot_fn(y, X, idx)
    
    # Compute standard errors from bootstrap estimates
    boot_std_errors = np.std(boot_estimates, axis=0)
    return boot_std_errors
    
print(f'Standard error of coef: {boot_se(y, X, 1000)}')


Standard error of coef: [4.23988422e-01 2.25115248e-04 4.90176729e-06]
