Drawn from https://www.statsmodels.org/dev/examples/notebooks/generated/glm_formula.html.

In [96]:
# Summon libraries.
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import chi2

Call data.

In [97]:
# Read the exercise data set into a pandas DataFrame.
# The path is relative to the notebook's working directory.
cardiac = pd.read_csv('../../data/Exercise3.5Data.csv')

# Every model below uses the same formula, so the covariate values to
# predict at (A = 2, W = 4) are defined once, as a one-row frame.
predict_val = pd.DataFrame(data={"A": 2, "W": 4}, index=[0])

Logit models.

In [98]:
# Logistic regression of group on A and W through the formula interface.
formula = 'group ~ A + W'
fitted_logit = smf.logit(
    formula=formula,
    data=cardiac,
).fit()
print(fitted_logit.summary())

# Maximized log likelihood of the fitted model; reused by the deviance
# test and the AICC computation in later cells.
fitloglike_logit = fitted_logit.llf

Optimization terminated successfully.
         Current function value: 0.605315
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                  group   No. Observations:                   48
Model:                          Logit   Df Residuals:                       45
Method:                           MLE   Df Model:                            2
Date:                Fri, 27 Oct 2023   Pseudo R-squ.:                  0.1267
Time:                        15:40:09   Log-Likelihood:                -29.055
converged:                       True   LL-Null:                       -33.271
Covariance Type:            nonrobust   LLR p-value:                   0.01476
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.1160      0.890     -1.255      0.210      -2.860       0.628
A              0.4378      0.

In [99]:
# The logit, probit and cloglog models all use the same formula, so these
# two counts are taken once from the logit fit and reused for every model.
# df_model is the number of regressors; it is kept as the degrees of
# freedom for the deviance tests.
deg_free = fitted_logit.df_model
# One more than the number of regressors gives the number of parameters.
model_param = deg_free + 1

Logit AIC, BIC, AICC.

In [100]:
# AIC and BIC are available directly on the fitted results.
aic_logit = fitted_logit.aic
bic_logit = fitted_logit.bic

# AICC is not an attribute of the results, so compute it from the
# log likelihood, the number of observations and the parameter count.
aicc_logit = sm.tools.eval_measures.aicc(
    llf=fitloglike_logit,
    nobs=fitted_logit.nobs,
    df_modelwc=model_param,
)

In [101]:
# Intercept-only (null) logit model, the baseline for the deviance test.
null = smf.logit(formula='group ~ 1', data=cardiac).fit()
# Log likelihood of the null model.
nullloglike_logit = null.llf

Optimization terminated successfully.
         Current function value: 0.693147
         Iterations 1


Logit deviance (likelihood-ratio) test, built from the log likelihoods. The attribute names (e.g. `llf`) were found through `dir()`: https://stackoverflow.com/questions/2675028/list-attributes-of-an-object.

In [102]:
# Deviance test of the fitted logit model against the intercept-only model,
# using the two log likelihoods: statistic = -2 * (llf_null - llf_fitted).
deviance = -2 * (nullloglike_logit - fitloglike_logit)
print(f"Deviance statistic is {deviance}.")
# The statistic is compared against a chi-square distribution with deg_free
# degrees of freedom. chi2 (scipy.stats) is imported in the import cell.
# chi2.sf gives the upper-tail probability directly, which avoids the
# round-off of computing 1 - chi2.cdf(...) when the p-value is very small.
pvalue = chi2.sf(deviance, deg_free)
print(f"p-value is {pvalue}.")

Deviance statistic is 8.431899815840197.
p-value is 0.014758296084250944.


Logit prediction.

In [103]:
# Prediction.
# The model was fitted through the formula interface, so predict() builds
# the design matrix (intercept included) from the formula. The covariate
# frame is therefore passed as-is: no constant column is added by hand and
# predict_val is not rebound, so this cell can be re-run safely.
predicted_logit = fitted_logit.predict(predict_val)

# Simpler: the whole Series (index label and value).
print(f'Predicted: {predicted_logit}')

# Isolated: only the scalar predicted probability of the single row.
print(f'Predicted: {predicted_logit.iloc[0]}')

Predicted: 0    0.46758
dtype: float64
Predicted: 0.46757964801353874


Probit models.

In [104]:
# Probit regression of group on A and W through the formula interface.
formula = 'group ~ A + W'
fitted_probit = smf.probit(
    formula=formula,
    data=cardiac,
).fit()
print(fitted_probit.summary())

# Maximized log likelihood of the fitted model; reused by the deviance
# test and the AICC computation in later cells.
fitloglike_probit = fitted_probit.llf

Optimization terminated successfully.
         Current function value: 0.607311
         Iterations 5
                          Probit Regression Results                           
Dep. Variable:                  group   No. Observations:                   48
Model:                         Probit   Df Residuals:                       45
Method:                           MLE   Df Model:                            2
Date:                Fri, 27 Oct 2023   Pseudo R-squ.:                  0.1238
Time:                        15:40:09   Log-Likelihood:                -29.151
converged:                       True   LL-Null:                       -33.271
Covariance Type:            nonrobust   LLR p-value:                   0.01624
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.6300      0.519     -1.214      0.225      -1.647       0.387
A              0.2490      0.

In [105]:
# Intercept-only (null) probit model, the baseline for the deviance test.
null_probit = smf.probit(formula='group ~ 1', data=cardiac).fit()
# Log likelihood of the null model.
nullloglike_probit = null_probit.llf

Optimization terminated successfully.
         Current function value: 0.693147
         Iterations 1


Probit deviance (likelihood-ratio) test, built from the log likelihoods.

In [106]:
# Deviance test of the fitted probit model against the intercept-only model,
# using the two log likelihoods: statistic = -2 * (llf_null - llf_fitted).
deviance = -2 * (nullloglike_probit - fitloglike_probit)
print(f"Deviance statistic is {deviance}.")
# The statistic is compared against a chi-square distribution with deg_free
# degrees of freedom. chi2 (scipy.stats) is imported in the import cell.
# chi2.sf gives the upper-tail probability directly, which avoids the
# round-off of computing 1 - chi2.cdf(...) when the p-value is very small.
pvalue = chi2.sf(deviance, deg_free)
print(f"p-value is {pvalue}.")

Deviance statistic is 8.24031743326293.
p-value is 0.016241936371934607.


Probit AIC, BIC, AICC.

In [107]:
# AIC and BIC are available directly on the fitted results.
aic_probit = fitted_probit.aic
bic_probit = fitted_probit.bic

# AICC is not an attribute of the results, so compute it from the
# log likelihood, the number of observations and the parameter count.
aicc_probit = sm.tools.eval_measures.aicc(
    llf=fitloglike_probit,
    nobs=fitted_probit.nobs,
    df_modelwc=model_param,
)

Probit prediction.

In [108]:
# Prediction.
# The model was fitted through the formula interface, so predict() builds
# the design matrix (intercept included) from the formula. The covariate
# frame is therefore passed as-is: no constant column is added by hand and
# predict_val is not rebound, so this cell can be re-run safely.
predicted_probit = fitted_probit.predict(predict_val)

# Simpler: the whole Series (index label and value).
print(f'Predicted: {predicted_probit}')

# Isolated: only the scalar predicted probability of the single row.
print(f'Predicted: {predicted_probit.iloc[0]}')

Predicted: 0    0.464077
dtype: float64
Predicted: 0.4640766442933432


Cloglog models. Found format through https://github.com/statsmodels/statsmodels/issues/827. 

In [109]:
# Complementary log-log (cloglog) model.
# There is no smf.cloglog, so the model is fitted as a binomial GLM with a
# CLogLog link, akin to R. The same pattern would also cover probit and
# logit by swapping the link function.
formula = 'group ~ A + W'
fitted_cloglog = smf.glm(
    formula=formula,
    data=cardiac,
    family=sm.families.Binomial(link=sm.families.links.CLogLog()),
).fit()
print(fitted_cloglog.summary())
# Maximized log likelihood of the fitted model.
fitloglike_cloglog = fitted_cloglog.llf

# Intercept-only (null) model with the same family and link.
null_cloglog = smf.glm(
    formula='group ~ 1',
    data=cardiac,
    family=sm.families.Binomial(link=sm.families.links.CLogLog()),
).fit()
# Log likelihood of the null model.
nullloglike_cloglog = null_cloglog.llf

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  group   No. Observations:                   48
Model:                            GLM   Df Residuals:                       45
Model Family:                Binomial   Df Model:                            2
Link Function:                CLogLog   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -29.524
Date:                Fri, 27 Oct 2023   Deviance:                       59.047
Time:                        15:40:09   Pearson chi2:                     49.8
No. Iterations:                     9   Pseudo R-squ. (CS):             0.1446
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.0392      0.629     -1.653      0.0

Cloglog deviance (likelihood-ratio) test, built from the log likelihoods.

In [110]:
# Deviance test of the fitted cloglog model against the intercept-only model,
# using the two log likelihoods: statistic = -2 * (llf_null - llf_fitted).
deviance = -2 * (nullloglike_cloglog - fitloglike_cloglog)
print(f"Deviance statistic is {deviance}.")
# The statistic is compared against a chi-square distribution with deg_free
# degrees of freedom. chi2 (scipy.stats) is imported in the import cell.
# chi2.sf gives the upper-tail probability directly, which avoids the
# round-off of computing 1 - chi2.cdf(...) when the p-value is very small.
pvalue = chi2.sf(deviance, deg_free)
print(f"p-value is {pvalue}.")

Deviance statistic is 7.49502521126535.
p-value is 0.023576316578627576.


Cloglog AIC, BIC, AICC.

In [111]:
# AIC is available directly on the fitted GLM results.
aic_cloglog = fitted_cloglog.aic

# statsmodels alludes to the GLM bic being deviance-based; bic_llf is the
# log-likelihood-based variant, used here instead.
bic_cloglog = fitted_cloglog.bic_llf

# AICC is not an attribute of the results, so compute it from the
# log likelihood, the number of observations and the parameter count.
aicc_cloglog = sm.tools.eval_measures.aicc(
    llf=fitloglike_cloglog,
    nobs=fitted_cloglog.nobs,
    df_modelwc=model_param,
)

Cloglog prediction. https://www.statology.org/statsmodels-predict/

In [112]:
# Prediction.
# The model was fitted through the formula interface, so predict() builds
# the design matrix (intercept included) from the formula. The covariate
# frame is therefore passed as-is: no constant column is added by hand and
# predict_val is not rebound, so this cell can be re-run safely.
predicted_cloglog = fitted_cloglog.predict(predict_val)

# Simpler: the whole Series (index label and value).
print(f'Predicted: {predicted_cloglog}')

# Isolated: only the scalar predicted probability of the single row.
print(f'Predicted: {predicted_cloglog.iloc[0]}')

Predicted: 0    0.45612
dtype: float64
Predicted: 0.4561195320960084


Model comparisons. From https://www.geeksforgeeks.org/different-ways-to-create-pandas-dataframe/

In [113]:
# One row per model, one column per information criterion.
data = [
    [aic_logit, aicc_logit, bic_logit],
    [aic_probit, aicc_probit, bic_probit],
    [aic_cloglog, aicc_cloglog, bic_cloglog],
]

comparison = pd.DataFrame(
    data,
    index=['logit', 'probit', 'cloglog'],
    columns=['AIC', 'AICC', 'BIC'],
)

# Display the table as the cell's output.
comparison

Unnamed: 0,AIC,AICC,BIC
logit,64.11023,64.655684,69.723833
probit,64.301812,64.847266,69.915415
cloglog,65.047104,65.592559,70.660707


Automated comparison. 
https://www.geeksforgeeks.org/get-minimum-values-in-rows-or-columns-with-their-index-position-in-pandas-dataframe/

In [114]:
# Score the models: for each criterion column of `comparison`, the model
# with the lowest value earns one point. The model with the most points is
# reported at the end.

# Initialize.
logit = 0
probit = 0
cloglog = 0


def points(cur_min, logit, probit, cloglog):
    """Add one point to the model named by ``cur_min``.

    Parameters
    ----------
    cur_min : str
        Name of the model with the lowest value for the current criterion;
        one of "logit", "probit" or "cloglog".
    logit, probit, cloglog : int
        Current point totals of the three models.

    Returns
    -------
    tuple
        Updated ``(logit, probit, cloglog)`` totals.

    Raises
    ------
    ValueError
        If ``cur_min`` is not one of the three model names. Previously any
        unrecognized label was silently credited to cloglog.
    """
    if cur_min == "logit":
        logit += 1
    elif cur_min == "probit":
        probit += 1
    elif cur_min == "cloglog":
        cloglog += 1
    else:
        raise ValueError(f"Unknown model name: {cur_min!r}")
    return (logit, probit, cloglog)


# Runs through all 3 criterion columns.
for criterion in ['AIC', 'AICC', 'BIC']:
    cur_min = comparison[criterion].idxmin()
    logit, probit, cloglog = points(cur_min, logit, probit, cloglog)

# Key the totals by model name. Keying by point total (as before) makes
# models with equal totals overwrite one another, so a tie could report
# the wrong model.
final = {"logit": logit, "probit": probit, "cloglog": cloglog}

top_score = max(final.values())
winners = [name for name, score in final.items() if score == top_score]

if len(winners) == 1:
    print(f"The model with the best fit is the {winners[0]} model.")
else:
    # Several models won the same number of criteria; report all of them
    # rather than picking one arbitrarily.
    print(f"The models tied for the best fit are: {', '.join(winners)}.")

The model with the best fit is the logit model.
