In [1]:
import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ISLP import load_data
import statsmodels.api as sm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import \
(cross_validate,
ShuffleSplit, 
KFold)

In [14]:
# Seed NumPy's global random number generator
np.random.seed(42)
# Fetch the Default data set and wrap it in a DataFrame in a single step
default = pd.DataFrame(load_data('Default'))
default.head()


Unnamed: 0,default,student,balance,income
0,No,No,729.526495,44361.625074
1,No,Yes,817.180407,12106.1347
2,No,No,1073.549164,31767.138947
3,No,No,529.250605,35704.493935
4,No,No,785.655883,38463.495879


### **(A)** Fit a logistic regression model (predictors: income, balance; response: default)

In [20]:
# Response: recode the Yes/No default flag as 1/0 and store it as a new column
default['default01'] = default['default'].map({'Yes':1, 'No': 0})
y_train = default['default01']

# Design matrix: the two predictors plus an intercept column
x = default[['balance','income']]
X_train = sm.add_constant(x)

# Binomial-family GLM fitted on the whole data set (no hold-out here)
model_logReg = sm.GLM(
    endog=y_train,
    exog=X_train,
    family=sm.families.Binomial(),
).fit()
model_logReg.summary()

0,1,2,3
Dep. Variable:,default01,No. Observations:,10000.0
Model:,GLM,Df Residuals:,9997.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-789.48
Date:,"Tue, 25 Feb 2025",Deviance:,1579.0
Time:,13:08:33,Pearson chi2:,6950.0
No. Iterations:,9,Pseudo R-squ. (CS):,0.1256
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-11.5405,0.435,-26.544,0.000,-12.393,-10.688
balance,0.0056,0.000,24.835,0.000,0.005,0.006
income,2.081e-05,4.99e-06,4.174,0.000,1.1e-05,3.06e-05


### **(B)** Estimate the test error of the model

In [30]:
# Step (i): random partition of the rows into equal training and validation halves
default_train, default_test = train_test_split(
    default,
    test_size=0.5,
    random_state=0,
)

# Step (ii): the model is fitted on the training half only
y_train = default_train['default01']
X_train = sm.add_constant(default_train[['balance', 'income']])
model_train = sm.GLM(
    endog=y_train,
    exog=X_train,
    family=sm.families.Binomial(),
).fit()

# Step (iii): predicted default probabilities for the validation half,
# turned into 0/1 labels with a 0.5 cutoff
X_test = sm.add_constant(default_test[['balance', 'income']])
y_prob = model_train.predict(X_test)
y_pred = (y_prob > 0.5).astype(int)

# Share of validation rows classified correctly, and its complement
accuracy = accuracy_score(y_true=default_test['default01'], y_pred=y_pred)
test_error = 1 - accuracy

print(f"Accuracy: {accuracy:.4f}")
print(f"Test Error Rate: {test_error:.4f}")


Accuracy: 0.9710
Test Error Rate: 0.0290


### **(C)** 
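Repeat the process in (b) three times, using three different splits of the observations into a training set and a validation set. Here the splits differ in the validation fraction (0.3, 0.5 and 0.8) while the random seed stays at 0, so the 0.5 split is the same one used in (B).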


In [44]:
# Write a function of step B
def fit_logisticregression(data, predictors, response_var, split_ratio, random_state=0):
    """Estimate the validation-set error of a logistic regression.

    The rows of ``data`` are split at random into a training part and a
    held-out part. A binomial-family GLM is fitted on the training part and
    the held-out rows are classified with a 0.5 probability cutoff.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the predictor columns and the response column.
    predictors : list of str
        Names of the columns used as predictors.
    response_var : str
        Name of the response column. Predictions are produced as 0/1
        labels, so the response is expected to be coded 0/1.
    split_ratio : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : optional
        Forwarded to ``train_test_split`` as ``random_state``. Defaults to
        0, the value that was previously hard-coded, so existing calls
        produce the same split as before. Pass different seeds to obtain
        different random partitions of the same size.

    Returns
    -------
    tuple
        ``(test_error, accuracy)`` measured on the held-out part, where
        ``test_error = 1 - accuracy``.
    """
    # (i) split the observations into a training and a held-out part
    train_df, test_df = train_test_split(
        data, test_size=split_ratio, random_state=random_state
    )

    # (ii) fit the model using only the training part.
    # has_constant='add' forces the intercept column to be added even when a
    # predictor happens to be constant within this part of the split; the
    # statsmodels default ('skip') would silently omit it in that case and
    # leave the training and held-out design matrices with different columns.
    X_train = sm.add_constant(train_df[predictors], has_constant='add')
    y_train = train_df[response_var]
    model_train = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit()

    # (iii) predict on the held-out part and convert probabilities to labels
    X_test = sm.add_constant(test_df[predictors], has_constant='add')
    y_prob = model_train.predict(X_test)
    y_pred = (y_prob > 0.5).astype(int)

    # (iv) fraction of held-out rows classified correctly, and its complement
    accuracy = accuracy_score(test_df[response_var], y_pred)
    test_error = 1 - accuracy
    return test_error, accuracy



# Validation fractions to try; one (fraction, accuracy, error) record is kept per run
split_ratio = [0.3 ,0.5, 0.8]
results_list = []
for test_fraction in split_ratio:
    test_error, accuracy = fit_logisticregression(
        data=default,
        predictors=['balance', 'income'],
        response_var='default01',
        split_ratio=test_fraction,
    )
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Test Error Rate: {test_error:.4f}")
    results_list.append((test_fraction, accuracy, test_error))


Accuracy: 0.9727
Test Error Rate: 0.0273
Accuracy: 0.9710
Test Error Rate: 0.0290
Accuracy: 0.9726
Test Error Rate: 0.0274


### **(D)** Logistic regression with three predictors: income, balance, student

In [48]:
# Dummy-code student status (Yes -> 1, No -> 0) so it can be used as a predictor
default['student01'] = default['student'].map({'Yes':1, 'No':0 })
# Same validation-set procedure as before, now with the student dummy included
test_error, accuracy = fit_logisticregression(
    data=default,
    predictors=['balance', 'income', 'student01'],
    response_var='default01',
    split_ratio=0.5,
)
print(f"Test Error Rate : {test_error: 0.4f} ")

Test Error Rate :  0.0292 
