In [3]:
import pandas as pd
import statsmodels.api as sm

# Q3 workflow: fit a logistic regression of y on x1..x10, then report the
# estimate, standard error and a 95% confidence interval for the x9 coefficient.

# Column names are declared once so the cleaning step and the design matrix
# are guaranteed to use the same set of columns.
feature_cols = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
target_col = 'y'

# Read the "Q3" worksheet, then discard any row with a missing value in a
# model column so that model fitting does not error out.
# Reference: https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
# Reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html
df = pd.read_excel(
    "SMAA8411 A1 Submission - ST10055763 (K Maharajh).xlsx",
    sheet_name="Q3",
).dropna(subset=feature_cols + [target_col])

# Target vector, and feature matrix with an added constant column so that the
# fitted model includes an intercept term.
# Reference: https://www.statsmodels.org/stable/generated/statsmodels.tools.tools.add_constant.html
y = df[target_col]
X = sm.add_constant(df[feature_cols])

# Estimate the logistic regression by Maximum Likelihood Estimation and print
# the results table (coefficients, standard errors, z-statistics, ...).
# Reference: https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.Logit.html
# Reference: https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.BinaryResults.html
model = sm.Logit(y, X).fit()
print(model.summary())

# Point estimate (β9) and standard error for the x9 coefficient.
beta_9, se_9 = model.params['x9'], model.bse['x9']

# 95% confidence interval via the normal approximation: estimate ± z * SE,
# with the conventional two-sided critical value z = 1.96.
# Reference: https://en.wikipedia.org/wiki/Confidence_interval#Basic_Steps
z = 1.96
margin = z * se_9
ci_lower, ci_upper = beta_9 - margin, beta_9 + margin

print(f"\nCoefficient for β₉: {beta_9}")
print(f"Standard Error for β₉: {se_9}")
print(f"95% Confidence Interval for β₉: ({ci_lower}, {ci_upper})")


Optimization terminated successfully.
         Current function value: 0.204414
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  466
Model:                          Logit   Df Residuals:                      455
Method:                           MLE   Df Model:                           10
Date:                Thu, 10 Apr 2025   Pseudo R-squ.:                  0.7050
Time:                        22:21:38   Log-Likelihood:                -95.257
converged:                       True   LL-Null:                       -322.94
Covariance Type:            nonrobust   LLR p-value:                 1.500e-91
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        -34.8311      6.586     -5.289      0.000     -47.739     -21.923
x1            -0.0704      0