Summon libraries.

In [1]:
# Summon libraries.
import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel as OrdModel
import pandas as pd
from pandas import Series as ser
from pandas import Categorical as cat
from scipy import stats
import numpy as np

Call data.

In [2]:
# Call data. Uses PANDAS.
admissions = pd.read_csv('.\data\math dept.csv')

In [3]:
# Set admission categories.
admissions['Status'] = pd.Categorical(admissions.Status)
admissions['Status'] = admissions.Status.cat.reorder_categories(['Enrolled','Admitted','Applied'], ordered = True)
# Merge two or more races, unknown, and visas.
mapping = {'Two or More Races' : 'Other', 'Unknown' : 'Other', 'Visa Non-U.S.' : 'Other'}
admissions['Ethnicity'] = admissions['Ethnicity'].replace(mapping)
# Discard unknown genders.
admissions = admissions[admissions['Sex'] != 'Unknown']
# Since each model has identical formula, develop it once.
formula = 'Status ~ HS_GPA + C(Major,Treatment("Pre-Applied Statistics")) + C(Ethnicity,Treatment("White")) + C(Sex,Treatment("Male")) + C(Local,Treatment("Non-Local"))'

## Logit

In [4]:
# Set categories.
admissions['Status'] = admissions.Status.cat.reorder_categories(['Enrolled','Admitted','Applied'], ordered = True)
# Merge two or more races, unknown, and visas.
mapping = {'Two or More Races' : 'Other', 'Unknown' : 'Other', 'Visa Non-U.S.' : 'Other'}
admissions['Ethnicity'] = admissions['Ethnicity'].replace(mapping)
# Discard unknown genders.
admissions = admissions[admissions['Sex'] != 'Unknown']
# Since each model has identical formula, develop it once.
formula = 'Status ~ HS_GPA + C(Major,Treatment("Pre-Applied Statistics")) + C(Ethnicity,Treatment("White")) + C(Sex,Treatment("Male")) + C(Local,Treatment("Non-Local"))'

Logit models.

In [5]:
# Create formula, then create the model and output the model.
fitted_logit_f = OrdModel.from_formula(formula, admissions, distr = "logit")
fitted_logit = fitted_logit_f.fit(method='bfgs', disp=False)
print(fitted_logit.summary())

# Untransform models.
num_of_thresholds = 3
fitted_logit_f.transform_threshold_params(fitted_logit.params[-num_of_thresholds:])

                             OrderedModel Results                             
Dep. Variable:                 Status   Log-Likelihood:                -515.71
Model:                   OrderedModel   AIC:                             1055.
Method:            Maximum Likelihood   BIC:                             1110.
Date:                Tue, 28 Nov 2023                                         
Time:                        02:12:28                                         
No. Observations:                 719                                         
Df Residuals:                     707                                         
Df Model:                          10                                         
                                                                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------------------------------
C(Major, Treatmen

array([        -inf, -16.32556048, -12.62717124,          inf])

In [6]:
# The three models don't vary in how many parameters they produce, 
# Thus, we can set model_param to count the number of parameters by fitted_logit.params.
# Pull df_model for degrees of freedom.
model_param = (fitted_logit.params).count()
deg_free = fitted_logit.df_model

Logit AIC, BIC, AICC.

In [7]:
# AIC, BIC are built into the regression. 
# AICC needs to be developed.
aic_logit = fitted_logit.aic
aicc_logit = sm.tools.eval_measures.aicc(
# Value of log likelihood function.
    fitted_logit.llf,
# Number of observations.
    fitted_logit.nobs,
    model_param)
bic_logit = fitted_logit.bic

Logit log likelihood.

In [8]:
# Pull null log likelihood function from initial model.
nullloglike_logit = (fitted_logit.llnull)

In [9]:
# Uses null and fitted log likelihoods to perform the deviance test.
deviance= -2 * (fitted_logit.llnull-(fitted_logit.llf))
print(f"Deviance statistic is {deviance}.")
# Chi2.cdf is from scipy.stats.
from scipy.stats import chi2
pvalue = 1 - chi2.cdf(deviance,deg_free)
print(f"p-value is {pvalue}.")

Deviance statistic is 319.5161182194893.
p-value is 0.0.


Logit prediction.

In [10]:
# Prediction.
predict_val = pd.DataFrame(
    {"HS_GPA" : 3.23, "Major" : "Pre-Math Applied", "Ethnicity" :  "Black or African American", "Sex": "Female", "Local": "Local"}, index=[0])
predict_val = sm.add_constant(predict_val)

# Isolated. This one grabs the item "values" from the array.
print(f'Predicted: {fitted_logit.predict(predict_val).values[0]}')

Predicted: [0.06101709 0.66305394 0.27592897]


## Probit

Probit models.

In [11]:
# Create formula, then create the model and output the model.
fitted_probit_f = OrdModel.from_formula(formula, admissions, distr = "probit")
fitted_probit = fitted_probit_f.fit(method='bfgs', disp=False)
print(fitted_probit.summary())

# Untransform models.
num_of_thresholds = 3
fitted_logit_f.transform_threshold_params(fitted_probit.params[-num_of_thresholds:])

                             OrderedModel Results                             
Dep. Variable:                 Status   Log-Likelihood:                -530.67
Model:                   OrderedModel   AIC:                             1085.
Method:            Maximum Likelihood   BIC:                             1140.
Date:                Tue, 28 Nov 2023                                         
Time:                        02:12:29                                         
No. Observations:                 719                                         
Df Residuals:                     707                                         
Df Model:                          10                                         
                                                                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------------------------------
C(Major, Treatmen

array([       -inf, -8.54294523, -6.50503455,         inf])

Probit AIC, BIC, AICC.

In [12]:
# AIC, BIC are built into the regression. 
# AICC needs to be developed.
aic_probit = fitted_probit.aic
aicc_probit = sm.tools.eval_measures.aicc(
# Value of log likelihood function.
    fitted_probit.llf,
# Number of observations.
    fitted_probit.nobs,
    model_param)
bic_probit = fitted_probit.bic

Probit log likelihood.

In [13]:
# Pull null log likelihood function from initial model.
nullloglike_probit = (fitted_probit.llnull)

In [14]:
# Uses null and fitted log likelihoods to perform the deviance test.
deviance= -2 * (fitted_probit.llnull-(fitted_probit.llf))
print(f"Deviance statistic is {deviance}.")
# Chi2.cdf is from scipy.stats.
from scipy.stats import chi2
pvalue = 1 - chi2.cdf(deviance,deg_free)
print(f"p-value is {pvalue}.")

Deviance statistic is 289.5849011139344.
p-value is 0.0.


Probit prediction.

In [15]:
# Prediction.
predict_val = pd.DataFrame(
    {"HS_GPA" : 3.23, "Major" : "Pre-Math Applied", "Ethnicity" :  "Black or African American", "Sex": "Female", "Local": "Local"}, index=[0])
predict_val = sm.add_constant(predict_val)

# Isolated. This one grabs the item "values" from the array.
print(f'Predicted: {fitted_probit.predict(predict_val).values[0]}')

Predicted: [0.08378053 0.66089501 0.25532446]


## Cloglog

Cloglog models.

In [16]:
# Develop cloglog.
class cloglog(stats.rv_continuous):
    def _ppf(self, q):
        return np.log(-np.log(1 - q))

    def _cdf(self, x):
        return 1 - np.exp(-np.exp(x))

cloglog = cloglog()

In [17]:
# Create formula, create the model, and output the model.
fitted_cloglog_f = OrdModel.from_formula(formula, admissions, distr = cloglog)
fitted_cloglog = fitted_cloglog_f.fit(method='bfgs', disp=False)
print(fitted_cloglog.summary())

# Untransform models. Needs formula and model.
num_of_thresholds = 3
fitted_cloglog_f.transform_threshold_params(fitted_cloglog.params[-num_of_thresholds:])

                             OrderedModel Results                             
Dep. Variable:                 Status   Log-Likelihood:                -486.85
Model:                   OrderedModel   AIC:                             997.7
Method:            Maximum Likelihood   BIC:                             1053.
Date:                Tue, 28 Nov 2023                                         
Time:                        02:12:29                                         
No. Observations:                 719                                         
Df Residuals:                     707                                         
Df Model:                          10                                         
                                                                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------------------------------------------
C(Major, Treatmen

array([        -inf, -13.01635096, -10.2052111 ,          inf])

Cloglog AIC, BIC, AICC.

In [18]:
# AIC, BIC are built into the regression. 
# AICC needs to be developed.
aic_cloglog = fitted_cloglog.aic
aicc_cloglog = sm.tools.eval_measures.aicc(
# Value of log likelihood function.
    fitted_cloglog.llf,
# Number of observations.
    fitted_cloglog.nobs,
    model_param)
bic_cloglog = fitted_cloglog.bic

Cloglog log likelihood.

In [19]:
# Pull null log likelihood function from initial model.
nullloglike_cloglog = (fitted_cloglog.llnull)

In [20]:
# Uses null and fitted log likelihoods to perform the deviance test.
deviance= -2 * (fitted_cloglog.llnull-(fitted_cloglog.llf))
print(f"Deviance statistic is {deviance}.")
# Chi2.cdf is from scipy.stats.
from scipy.stats import chi2
pvalue = 1 - chi2.cdf(deviance,deg_free)
print(f"p-value is {pvalue}.")

Deviance statistic is 377.23186828042356.
p-value is 0.0.


Cloglog prediction.

In [21]:
# Prediction.
predict_val = pd.DataFrame(
    {"HS_GPA" : 3.23, "Major" : "Pre-Math Applied", "Ethnicity" :  "Black or African American", "Sex": "Female", "Local": "Local"}, index=[0])
predict_val = sm.add_constant(predict_val)

# Isolated. This one grabs the item "values" from the array.
print(f'Predicted: {fitted_cloglog.predict(predict_val).values[0]}')

Predicted: [0.06628941 0.61406835 0.31964224]


## Model comparisons.

In [22]:
data = [[aic_logit,aicc_logit,bic_logit],
        [aic_logit,aicc_logit,bic_logit],
        [aic_cloglog,aicc_cloglog,bic_cloglog]]

comparison = pd.DataFrame(data,
                          columns = ['AIC','AICC','BIC'],
                          index = ['logit','logit','cloglog'])

comparison

Unnamed: 0,AIC,AICC,BIC
logit,1055.416336,1055.858262,1110.350672
logit,1055.416336,1055.858262,1110.350672
cloglog,997.700586,998.142512,1052.634922


Automated comparison.

In [23]:
# Initialize.
logit = 0
logit = 0
cloglog = 0 

# Runs through all 3 columns.
# If column has lowest value, that model +1 point.
# Model with highest points is declared at end.
def points(cur_min,logit,probit,cloglog):
        if cur_min == "logit":
                logit += 1
        elif cur_min == "logit":
                logit += 1
        else:
                cloglog += 1 
        return(logit,logit,cloglog)

# Runs through the models.
cur_min = comparison['AIC'].idxmin()
logit,logit,cloglog = points(cur_min,logit,logit,cloglog)
cur_min = comparison['AICC'].idxmin()
logit,logit,cloglog = points(cur_min,logit,logit,cloglog)
cur_min = comparison['BIC'].idxmin()
logit,logit,cloglog = points(cur_min,logit,logit,cloglog)

final = {logit:"logit",logit:"logit",cloglog:"cloglog"}

print(f"The model with the best fit is the {final.get(max(final))} model.")

The model with the best fit is the cloglog model.
