In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the data set from a CSV in the working directory.
# NOTE(review): the recorded output of df.columns lists an 'Unnamed: 0' column,
# which looks like a saved row counter -- confirm whether it should be read as the index.
df = pd.read_csv('Default.csv')

In [3]:
# Show the column labels so the names used in later cells can be checked.
df.columns

Index(['Unnamed: 0', 'default', 'student', 'balance', 'income'], dtype='object')

In [4]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0.1,Unnamed: 0,default,student,balance,income
0,1,No,No,729.526495,44361.62507
1,2,No,Yes,817.180407,12106.1347
2,3,No,No,1073.549164,31767.13895
3,4,No,No,529.250605,35704.49394
4,5,No,No,785.655883,38463.49588


In [None]:
import statsmodels.api as sm

# Response: 1 where the 'default' column reads "yes" (compared case-insensitively), else 0.
is_default = df['default'].astype(str).str.lower() == 'yes'
y = is_default.astype(int).to_numpy()

# Design matrix: income and balance as a plain array, with a constant column added
# by add_constant so the model has an intercept.
X = sm.add_constant(df[['income', 'balance']].to_numpy())

# Row labels for coefficient tables, in the same order as the columns of X.
X_names = ['const', 'income', 'balance']

In [7]:
# Fit a binomial-family GLM of y on X and print the full regression summary.
model = sm.GLM(y, X, family=sm.families.Binomial())
res = model.fit()
print(res.summary())

# Keep the point estimates and their model-based standard errors for later comparison.
coef_glm = res.params
se_glm = res.bse

# Label the estimates with X_names, then display only the two slope rows.
se_table = pd.DataFrame(dict(coef=coef_glm, se_glm=se_glm), index=X_names)
se_table.loc[['income', 'balance']]

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                10000
Model:                            GLM   Df Residuals:                     9997
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -789.48
Date:                Fri, 24 Oct 2025   Deviance:                       1579.0
Time:                        17:51:42   Pearson chi2:                 6.95e+03
No. Iterations:                     9   Pseudo R-squ. (CS):             0.1256
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -11.5405      0.435    -26.544      0.0

Unnamed: 0,coef,se_glm
income,2.1e-05,5e-06
balance,0.005647,0.000227


In [8]:
def boot_fn(data: pd.DataFrame, index: np.ndarray) -> np.ndarray:
    """Refit the binomial GLM on selected rows and return the two slope estimates.

    Parameters
    ----------
    data : pd.DataFrame
        Frame providing the 'default', 'income' and 'balance' columns.
    index : np.ndarray
        Positional row indices (applied with ``.iloc``) that pick the rows to fit on.

    Returns
    -------
    np.ndarray
        Two-element array holding the fitted parameters at positions 1 and 2,
        i.e. the income and balance coefficients in that order.
    """
    resample = data.iloc[index]

    # Encode the response as 1 for "yes" (case-insensitive) and 0 otherwise.
    is_default = resample['default'].astype(str).str.lower() == 'yes'
    response = is_default.astype(int).to_numpy()

    # Predictors in the order income, balance, plus the constant column from add_constant.
    design = sm.add_constant(resample[['income', 'balance']].to_numpy())
    fitted = sm.GLM(response, design, family=sm.families.Binomial()).fit()

    # Skip position 0 and return only the income and balance entries.
    return np.array([fitted.params[1], fitted.params[2]])

In [9]:
rng = np.random.default_rng(2024)  # fixed seed so the resamples are repeatable
n = len(df)
B = 10_000  # number of bootstrap resamples

# Row b stores the (income, balance) coefficients returned for the b-th resample.
boot_coefs = np.empty((B, 2))
for b in range(B):
    # Draw n row positions from [0, n); positions may repeat.
    idx = rng.integers(low=0, high=n, size=n)
    boot_coefs[b, :] = boot_fn(df, idx)

# Bootstrap standard error = sample standard deviation of the coefficients
se_boot = np.std(boot_coefs, axis=0, ddof=1)
coef_boot_mean = np.mean(boot_coefs, axis=0)

# Side-by-side table: GLM estimates and standard errors (positions 1 and 2 of the
# fitted arrays) next to the bootstrap mean and bootstrap standard error.
glm_positions = (1, 2)
se_compare = pd.DataFrame(
    {
        'coef_glm': [coef_glm[i] for i in glm_positions],
        'se_glm': [se_glm[i] for i in glm_positions],
        'coef_boot_avg': list(coef_boot_mean),
        'se_boot': list(se_boot),
    },
    index=['income', 'balance'],
)

se_compare

Unnamed: 0,coef_glm,se_glm,coef_boot_avg,se_boot
income,2.1e-05,5e-06,2.1e-05,5e-06
balance,0.005647,0.000227,0.00566,0.000227
