Summon libraries.

In [231]:
# Summon libraries.
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel as OrdModel
import pandas as pd
from pandas import Series

Call data.

In [232]:
# Call data. Uses PANDAS.
satisfaction = pd.read_csv('../../data/Exercise4.2Data.csv')

# Since each model has identical formula, develop it once.
formula = 'satisf ~ C(magazine,Treatment("yes")) + C(resolved,Treatment("yes")) + subscribed'

## Logit

Logit models.

In [233]:
# Create formula, then create the model and output the model.
fitted_logit_f = OrdModel.from_formula(formula, satisfaction, distr = "logit")
fitted_logit = fitted_logit_f.fit(method='bfgs', disp=False)
print(fitted_logit.summary())

# Untransform models.
num_of_thresholds = 4
fitted_logit_f.transform_threshold_params(fitted_logit.params[-num_of_thresholds:])

                             OrderedModel Results                             
Dep. Variable:                 satisf   Log-Likelihood:                -47.049
Model:                   OrderedModel   AIC:                             108.1
Method:            Maximum Likelihood   BIC:                             119.2
Date:                Mon, 13 Nov 2023                                         
Time:                        15:01:56                                         
No. Observations:                  36                                         
Df Residuals:                      29                                         
Df Model:                           3                                         
                                          coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(magazine, Treatment("yes"))[T.no]    -1.9175      0.677     -2.832      0.005  

array([       -inf, -4.23594035, -2.82984236, -1.57402272,  0.33501565,
               inf])

In [234]:
# The three models don't vary in how many parameters they produce, 
# Thus, we can set model_param to count the number of parameters by fitted_logit.params.
# Pull df_model for degrees of freedom.
model_param = (fitted_logit.params).count()
deg_free = fitted_logit.df_model

Logit AIC, BIC, AICC.

In [235]:
# AIC, BIC are built into the regression. 
# AICC needs to be developed.
aic_logit = fitted_logit.aic
aicc_logit = sm.tools.eval_measures.aicc(
# Value of log likelihood function.
    fitted_logit.llf,
# Number of observations.
    fitted_logit.nobs,
    model_param)
bic_logit = fitted_logit.bic

Logit log likelihood.

In [236]:
# Pull null log likelihood function from initial model.
nullloglike_logit = (fitted_logit.llnull)

In [237]:
# Uses null and fitted log likelihoods to perform the deviance test.
deviance= -2 * (fitted_logit.llnull-(fitted_logit.llf))
print(f"Deviance statistic is {deviance}.")
# Chi2.cdf is from scipy.stats.
from scipy.stats import chi2
pvalue = 1 - chi2.cdf(deviance,deg_free)
print(f"p-value is {pvalue}.")

Deviance statistic is 14.799395808783018.
p-value is 0.0019963569168444106.


Logit prediction.

In [238]:
# Prediction.
predict_val = pd.DataFrame(
    {"subscribed" : 3, "magazine" : "no", "resolved" :  "yes"}, index=[0])
predict_val = sm.add_constant(predict_val)
# Simpler.
print(f'Predicted: {fitted_logit.predict(predict_val)}')

# Isolated. This one grabs the item "values" from the array.
print(f'Predicted: {fitted_logit.predict(predict_val).values[0]}')

Predicted:           0        1         2         3         4
0  0.087067  0.19305  0.297241  0.324759  0.097882
Predicted: [0.08706747 0.19304966 0.29724134 0.32475906 0.09788247]


## Probit

Probit models.

In [239]:
# Create formula, create the model, and output the model.
fitted_probit_f = OrdModel.from_formula(formula, satisfaction, distr = "probit")
fitted_probit = fitted_probit_f.fit(method='bfgs', disp=False)
print(fitted_probit.summary())

# Untransform models. Needs formula and model.
num_of_thresholds = 4
fitted_probit_f.transform_threshold_params(fitted_probit.params[-num_of_thresholds:])

                             OrderedModel Results                             
Dep. Variable:                 satisf   Log-Likelihood:                -46.693
Model:                   OrderedModel   AIC:                             107.4
Method:            Maximum Likelihood   BIC:                             118.5
Date:                Mon, 13 Nov 2023                                         
Time:                        15:01:57                                         
No. Observations:                  36                                         
Df Residuals:                      29                                         
Df Model:                           3                                         
                                          coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
C(magazine, Treatment("yes"))[T.no]    -1.2027      0.388     -3.096      0.002  

array([       -inf, -2.60051758, -1.78780386, -1.02949075,  0.11135801,
               inf])

In [240]:
# The three models don't vary in how many parameters they produce, 
# Thus, we can set model_param to count the number of parameters by fitted_probit.params.
# Pull df_model for degrees of freedom.
model_param = (fitted_probit.params).count()
deg_free = fitted_probit.df_model

Probit AIC, BIC, AICC.

In [241]:
# AIC, BIC are built into the regression. 
# AICC needs to be developed.
aic_probit = fitted_probit.aic
aicc_probit = sm.tools.eval_measures.aicc(
# Value of log likelihood function.
    fitted_probit.llf,
# Number of observations.
    fitted_probit.nobs,
    model_param)
bic_probit = fitted_probit.bic

Probit log likelihood.

In [242]:
# Pull null log likelihood function from initial model.
nullloglike_probit = (fitted_probit.llnull)

In [243]:
# Uses null and fitted log likelihoods to perform the deviance test.
deviance= -2 * (fitted_probit.llnull-(fitted_probit.llf))
print(f"Deviance statistic is {deviance}.")
# Chi2.cdf is from scipy.stats.
from scipy.stats import chi2
pvalue = 1 - chi2.cdf(deviance,deg_free)
print(f"p-value is {pvalue}.")

Deviance statistic is 15.51138156208296.
p-value is 0.001427906255025424.


Probit prediction.

In [244]:
# Prediction.
predict_val = pd.DataFrame(
    {"subscribed" : 3, "magazine" : "no", "resolved" :  "yes"}, index=[0])
predict_val = sm.add_constant(predict_val)
# Simpler.
print(f'Predicted: {fitted_probit.predict(predict_val)}')

# Isolated. This one grabs the item "values" from the array.
print(f'Predicted: {fitted_probit.predict(predict_val).values[0]}')

Predicted:           0         1        2         3         4
0  0.078385  0.194768  0.28844  0.340895  0.097513
Predicted: [0.07838474 0.19476755 0.28843952 0.34089547 0.09751271]


## Cloglog

Cloglog models.

In [249]:
# Create formula, create the model, and output the model.
fitted_cloglog_f = OrdModel.from_formula(formula, satisfaction, distr = "cloglog")
fitted_cloglog = fitted_cloglog_f.fit(method='bfgs', disp=False)
print(fitted_cloglog.summary())

# Untransform models. Needs formula and model.
num_of_thresholds = 4
fitted_cloglog_f.transform_threshold_params(fitted_cloglog.params[-num_of_thresholds:])

AttributeError: 'str' object has no attribute 'name'

In [None]:
# The three models don't vary in how many parameters they produce, 
# Thus, we can set model_param to count the number of parameters by fitted_cloglog.params.
# Pull df_model for degrees of freedom.
model_param = (fitted_cloglog.params).count()
deg_free = fitted_cloglog.df_model

Cloglog AIC, BIC, AICC.

In [None]:
# AIC, BIC are built into the regression. 
# AICC needs to be developed.
aic_cloglog = fitted_cloglog.aic
aicc_cloglog = sm.tools.eval_measures.aicc(
# Value of log likelihood function.
    fitted_cloglog.llf,
# Number of observations.
    fitted_cloglog.nobs,
    model_param)
bic_cloglog = fitted_cloglog.bic

Cloglog log likelihood.

In [None]:
# Pull null log likelihood function from initial model.
nullloglike_cloglog = (fitted_cloglog.llnull)

In [None]:
# Uses null and fitted log likelihoods to perform the deviance test.
deviance= -2 * (fitted_cloglog.llnull-(fitted_cloglog.llf))
print(f"Deviance statistic is {deviance}.")
# Chi2.cdf is from scipy.stats.
from scipy.stats import chi2
pvalue = 1 - chi2.cdf(deviance,deg_free)
print(f"p-value is {pvalue}.")

Deviance statistic is 15.51138156208296.
p-value is 0.001427906255025424.


Cloglog prediction.

In [None]:
# Prediction.
predict_val = pd.DataFrame(
    {"subscribed" : 3, "magazine" : "no", "resolved" :  "yes"}, index=[0])
predict_val = sm.add_constant(predict_val)
# Simpler.
print(f'Predicted: {fitted_cloglog.predict(predict_val)}')

# Isolated. This one grabs the item "values" from the array.
print(f'Predicted: {fitted_cloglog.predict(predict_val).values[0]}')

Predicted:           0         1        2         3         4
0  0.078385  0.194768  0.28844  0.340895  0.097513
Predicted: [0.07838474 0.19476755 0.28843952 0.34089547 0.09751271]


## Model comparisons.

In [None]:
data = [[aic_logit,aicc_logit,bic_logit],
        [aic_logit,aicc_logit,bic_logit],
        [aic_cloglog,aicc_cloglog,bic_cloglog]]

comparison = pd.DataFrame(data,
                          columns = ['AIC','AICC','BIC'],
                          index = ['logit','logit','cloglog'])

comparison

NameError: name 'aic_cloglog' is not defined

Automated comparison.

In [None]:
# Initialize.
logit = 0
logit = 0
cloglog = 0 

# Runs through all 3 columns.
# If column has lowest value, that model +1 point.
# Model with highest points is declared at end.
def points(cur_min,logit,logit,cloglog):
        if cur_min == "logit":
                logit += 1
        elif cur_min == "logit":
                logit += 1
        else:
                cloglog += 1 
        return(logit,logit,cloglog)

# Runs through the models.
cur_min = comparison['AIC'].idxmin()
logit,logit,cloglog = points(cur_min,logit,logit,cloglog)
cur_min = comparison['AICC'].idxmin()
logit,logit,cloglog = points(cur_min,logit,logit,cloglog)
cur_min = comparison['BIC'].idxmin()
logit,logit,cloglog = points(cur_min,logit,logit,cloglog)

final = {logit:"logit",logit:"logit",cloglog:"cloglog"}

print(f"The model with the best fit is the {final.get(max(final))} model.")