In [153]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

hrsWave = pd.read_csv("../Data/hrsWaveCleaned.csv")

# n = 1000
# hhidpn = np.random.choice(hrsWave["HHIDPN"].unique(), size = n)
# idx = hrsWave["HHIDPN"].isin(hhidpn)
# df = hrsWave.loc[idx, :]

gss = GroupShuffleSplit(n_splits=2, test_size = 0.3)
groupVar = hrsWave["HHIDPN"]
# gss.split(X = hrsWave, groups=groupVar)
shuffle_1, shuffle_2 = gss.split(X = hrsWave, groups=groupVar)
trainIndex, testIndex = shuffle_1
print(trainIndex)
print(testIndex)

## Subset Training and Test Set
df = hrsWave.iloc[trainIndex, :]
testSet = hrsWave.iloc[testIndex, :]

# 1. Define the Binomial family and logit link
# The `Binomial` family in statsmodels assumes endog is proportions (e.g., successes/n_trials)
# or a 2-column array where col 0 is successes and col 1 is failures.
# Since your score is 0-20, you should pass it as a two-column array: [recall_score, 20 - recall_score]


# Create a 2-column array for endog if using the formula API with a non-standard endog
# For GEE with Binomial, endog usually expects a proportion (successes/n_trials) or a (successes, total_trials) tuple/array
# If using `smf.gee`, it's often more straightforward to define `endog` as proportion.
n_trials = 20
df['RwRecProp'] = df['RwTR20'] / n_trials
testSet['RwRecProp'] = testSet['RwTR20'] / n_trials

[     0      1      2 ... 130684 130685 130686]
[     9     10     11 ... 130676 130677 130678]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RwRecProp'] = df['RwTR20'] / n_trials
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testSet['RwRecProp'] = testSet['RwTR20'] / n_trials


In [160]:
print(df.shape)
print(testSet.shape)

(91354, 47)
(39333, 47)


## 1. Fitting the Base Model

In [154]:
def fitBaseModel(formula, groups, df, covStruct, family):
    model = smf.gee(formula, groups=groups, data=df,
                    cov_struct=covStruct,
                    family=family).fit(cov_type = "robust")
    
    print(model.summary())
    

    return model

# 2. Define the exchangeable correlation structure
exchangeable_corr = sm.cov_struct.Exchangeable()
autoregress_corr = sm.cov_struct.Autoregressive()
indep_corr = sm.cov_struct.Independence()


# 3. Specify Full Model Formula
formulaBase = "RwRecProp ~ RwAGEM_B * C(RwJOCCSD, Treatment(reference='Retired'))"

# 4. Fit the model with robust covariance
resultsExch = fitBaseModel(formulaBase, "HHIDPN", df, exchangeable_corr, sm.families.Binomial())
resultsAR = fitBaseModel(formulaBase, "HHIDPN", df, autoregress_corr, sm.families.Binomial())
resultsIndep = fitBaseModel(formulaBase, "HHIDPN", df, indep_corr, sm.families.Binomial())



                               GEE Regression Results                              
Dep. Variable:                   RwRecProp   No. Observations:                81139
Model:                                 GEE   No. clusters:                    18626
Method:                        Generalized   Min. cluster size:                   1
                      Estimating Equations   Max. cluster size:                  11
Family:                           Binomial   Mean cluster size:                 4.4
Dependence structure:         Exchangeable   Num. iterations:                    10
Date:                     Sat, 28 Jun 2025   Scale:                           1.000
Covariance type:                    robust   Time:                         00:22:00
                                                                                               coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------

## 2. Variable Selection by QIC:

In [162]:
# Calculate QIC for each model as a measure of goodness of fit
def calc_qic(model_result):
    # QIC = -2 * quasi-likelihood + 2 * trace(V_hat^-1 * V_model)
    # statsmodels does not provide QIC directly, but we can use qic() if available
    try:
        return model_result.qic()[0]
    except AttributeError:
        return np.nan

qic_exch = calc_qic(resultsExch)
qic_ar = calc_qic(resultsAR)
qic_indep = calc_qic(resultsIndep)

print(f"QIC (Exchangeable): {qic_exch}")
print(f"QIC (Autoregressive): {qic_ar}")
print(f"QIC (Unstructured): {qic_indep}")

# The base model with independence covariance structure yields the lowest QIC, marking the best performing base model.



QIC (Exchangeable): 10281.501623452408
QIC (Autoregressive): 10271.300825021772
QIC (Unstructured): 10108.640708059247


In [156]:
import gc
gc.collect()

369

In [192]:
## Choose the control variables
controlVars = ['RwWORK', 'RwJHOURS', 'RwWGIHR',
       'RwJPHYS', 'RwJLIFT', 'RwJSTRES', 'RwJSTOOP', 'RwJSIGHT', 'RwCENREG',
       'RwMSTAT', 'RwLIVBRO', 'RwHIBP', 'RwDIAB', 'RwCANCR',
       'RwLUNG', 'RwHEART', 'RwSTROK', 'RwPSYCH', 'RwVIGACT', 'RwSMOKEV',
       'RwDRINK', 'RwPhyLim', 'RwCogLim', 'RwAnyCogImp', 
       #'RwLOST', 'RwWANDER', 'RwHALUC', 'RwALONE', 'HwATOTB', 
       # 'HwADEBT', 'HwACHCK','HwAMRTB', 'HwITOT', 
       'RAGENDER', 'RARACEM', 'RAEDYRS', 'RAEVBRN']

baseFormula = "RwRecProp ~ RwAGEM_B * C(RwJOCCSD, Treatment(reference='Retired'))"
fullFormula = baseFormula + ' + ' + ' + '.join(controlVars)

In [169]:
import re
import numpy as np

def stepwise_selection_qic_forward(data, groups, cov_struct, family, start_formula, end_formula, verbose=True, qic_threshold=0):
    """
    Perform forward feature selection based on QIC for GEE models.

    Parameters:
        data: pandas.DataFrame
            The dataset containing all variables used in the formulas.
        groups: array-like
            Grouping variable for GEE (e.g., subject or cluster IDs).
        cov_struct: statsmodels.genmod.cov_struct.CovStruct
            Covariance structure for GEE (e.g., Exchangeable, Autoregressive).
        family: statsmodels.genmod.families.Family
            The family object for GEE (e.g., Gaussian, Binomial).
        start_formula: str
            The starting model formula (patsy syntax).
        end_formula: str
            The full model formula (patsy syntax, includes all candidate variables).
        verbose: bool, optional
            If True, prints progress at each step.
        qic_threshold: float, optional
            Minimum QIC improvement required to continue selection.

    Returns:
        best_formula: str, formula of the best model found
        best_result: fitted GEE result object
        history: list of (formula, QIC)
    """
    import statsmodels.formula.api as smf

    def get_terms(formula):
        rhs = formula.split('~')[1]
        terms = [t.strip() for t in re.split(r'\s*\+\s*', rhs) if t.strip() != '']
        terms = [t for t in terms if t != '1']
        return set(terms)

    def build_formula(lhs, terms):
        if not terms:
            return f"{lhs} ~ 1"
        return f"{lhs} ~ {' + '.join(sorted(terms))}"

    def calc_qic(result):
        try:
            return result.qic(scale=1)[0]
        except Exception:
            return np.nan

    lhs = start_formula.split('~')[0].strip()
    start_terms = get_terms(start_formula)
    end_terms = get_terms(end_formula)
    current_terms = set(start_terms)
    history = []

    # Fit initial model
    current_formula = build_formula(lhs, current_terms)
    model = smf.gee(current_formula, groups=groups, data=data, cov_struct=cov_struct, family=family)
    result = model.fit(cov_type="robust")
    best_qic = calc_qic(result)
    best_formula = current_formula
    best_result = result
    history.append((current_formula, best_qic))

    improved = True
    while improved:
        improved = False
        qic_candidates = []
        formulas = []
        term_changes = []
        for term in sorted(end_terms - current_terms):
            new_terms = current_terms | {term}
            formula = build_formula(lhs, new_terms)
            try:
                model = smf.gee(formula, groups=groups, data=data, cov_struct=cov_struct, family=family)
                result = model.fit(cov_type="robust")
                qic = calc_qic(result)
            except Exception:
                qic = np.nan
            qic_candidates.append(qic)
            formulas.append(formula)
            term_changes.append(('add', term))

        if qic_candidates:
            if np.all(np.isnan(qic_candidates)):
                if verbose:
                    print("All QIC candidates are NaN, stopping.")
                break
            min_idx = np.nanargmin(qic_candidates)
            min_qic = qic_candidates[min_idx]
            if (best_qic - min_qic) > qic_threshold:
                improved = True
                best_qic = min_qic
                best_formula = formulas[min_idx]
                action, term = term_changes[min_idx]
                if action == 'add':
                    current_terms.add(term)
                model = smf.gee(best_formula, groups=groups, data=data, cov_struct=cov_struct, family=family)
                best_result = model.fit(cov_type="robust")
                history.append((best_formula, best_qic))
                if verbose:
                    print(f"Step: {action}, QIC: {best_qic:.2f}, Formula: {best_formula}")
            else:
                if verbose:
                    print("No QIC improvement above threshold, stopping.")
        else:
            if verbose:
                print("No candidates left, stopping.")

    return best_formula, best_result, history

In [None]:
bestExchForm, bestExchResult, ExchHistory =\
    stepwise_selection_qic_forward(df, df["HHIDPN"], exchangeable_corr, 
                        sm.families.Binomial(), 
                        baseFormula, fullFormula, 
                        verbose=True, 
                        qic_threshold=10)

bestARForm, bestARResult, ARHistory =\
    stepwise_selection_qic_forward(df, df["HHIDPN"], autoregress_corr, 
                        sm.families.Binomial(), 
                        baseFormula, fullFormula, 
                        verbose=True, 
                        qic_threshold=10)

bestIndepForm, bestIndepResult, IndepHistory =\
    stepwise_selection_qic_forward(df, df["HHIDPN"], indep_corr, 
                        sm.families.Binomial(), 
                        baseFormula, fullFormula, 
                        verbose=False, 
                        qic_threshold=10)

In [164]:
print("Best formula of Exchangeble Covariance:", bestExchForm)
print("Best formula of AR(1) Covariance:", bestARForm)
print("Best formula of Indepedence Covariance:", bestIndepForm)

print("Best qic of Exchangeble Covariance:", ExchHistory[-1][1])
print("Best qic of AR(1) Covariance:", ARHistory[-1][1])
print("Best qic of Indepedence Covariance:", IndepHistory[-1][1])

Best formula of Exchangeble Covariance: RwRecProp ~ RwAGEM_B * C(RwJOCCSD, Treatment(reference='Retired')) + RwJLIFT + RwSMOKEV
Best formula of AR(1) Covariance: RwRecProp ~ RAEDYRS + RAGENDER + RwAGEM_B * C(RwJOCCSD, Treatment(reference='Retired'))
Best formula of Indepedence Covariance: RwRecProp ~ RwAGEM_B * C(RwJOCCSD, Treatment(reference='Retired')) + RwWGIHR
Best qic of Exchangeble Covariance: -408.78824872787925
Best qic of AR(1) Covariance: 476.7440560651559
Best qic of Indepedence Covariance: 87.33868652831293


In [27]:
print("Best formula of Exchangeble Covariance:", bestExchResult.summary().tables[1])
print("Best formula of AR(1) Covariance:", bestARResult.summary().tables[1])
print("Best formula of Indepedence Covariance:", bestIndepResult.summary().tables[1])

                                                                                               coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------------------------------------------------------------
Intercept                                                                                    0.3182        nan        nan        nan         nan         nan
C(RwJOCCSD, Treatment(reference='Retired'))[T.Farming/Forestry/Fishing]                     -0.5736        nan        nan        nan         nan         nan
C(RwJOCCSD, Treatment(reference='Retired'))[T.Food/Personal/Service]                         1.0898        nan        nan        nan         nan         nan
C(RwJOCCSD, Treatment(reference='Retired'))[T.Healthcare]                                    0.7448        nan        nan        nan         nan         nan
C(RwJOCCSD, Treatment(reference='Retired'))[T.High Risk Oc

### 2.2 Manual Variable Selection

In [193]:
# resultsExchFull = fitBaseModel(fullFormula, "HHIDPN", df, exchangeable_corr, sm.families.Binomial())
# print("Exchangeable complete!\n")
# try:
#     resultsARFull = fitBaseModel(fullFormula, "HHIDPN", df, autoregress_corr, sm.families.Binomial())
#     print("AR(1) complete!\n")
# except ValueError:
#     print("Value Error")
resultsIndepFull = fitBaseModel(fullFormula, "HHIDPN", df, indep_corr, sm.families.Binomial())
print("Independent complete!\n")

                               GEE Regression Results                              
Dep. Variable:                   RwRecProp   No. Observations:                13695
Model:                                 GEE   No. clusters:                     6582
Method:                        Generalized   Min. cluster size:                   1
                      Estimating Equations   Max. cluster size:                  11
Family:                           Binomial   Mean cluster size:                 2.1
Dependence structure:         Independence   Num. iterations:                     3
Date:                     Sat, 28 Jun 2025   Scale:                           1.000
Covariance type:                    robust   Time:                         04:04:00
                                                                                               coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------

In [184]:
qic_exch_full = calc_qic(resultsExchFull)
# qic_ar_full = calc_qic(resultsARFull)
qic_indep_full = calc_qic(resultsIndepFull)

print(f"QIC (Exchangeable): {qic_exch_full}")
# print(f"QIC (Autoregressive): {qic_ar_full}")
print(f"QIC (Unstructured): {qic_indep_full}")



QIC (Exchangeable): nan
QIC (Unstructured): 1315.209855127156


In [234]:
import pandas as pd

# Manually enter the summary table as a list of dictionaries
summary_data = [
    {"Variable": "RARACEM[T.White]", "Coef": 0.2279, "P>|z|": 0.000},
    {"Variable": "RARACEM[T.Others]", "Coef": 0.0914, "P>|z|": 0.020},
    {"Variable": "C(RwJOCCSD, Treatment(reference='Retired'))[T.High Risk Occupations]", "Coef": 0.6047, "P>|z|": 0.037},
    {"Variable": "RwDRINK", "Coef": 0.0299, "P>|z|": 0.027},
    {"Variable": "Intercept", "Coef": 0.3314, "P>|z|": 0.000},
    {"Variable": "RwWORK", "Coef": 0.3314, "P>|z|": 0.000},
    {"Variable": "RAEDYRS", "Coef": 0.0499, "P>|z|": 0.000},
    {"Variable": "RwJPHYS", "Coef": -0.0401, "P>|z|": 0.010},
    {"Variable": "RwVIGACT", "Coef": -0.0471, "P>|z|": 0.000},
    {"Variable": "RwMSTAT", "Coef": -0.0075, "P>|z|": 0.011},
    {"Variable": "RwDIAB", "Coef": -0.0544, "P>|z|": 0.002},
    {"Variable": "RwPhyLim[T.True]", "Coef": -0.0727, "P>|z|": 0.000},
    {"Variable": "RwCogLim[T.True]", "Coef": -0.0882, "P>|z|": 0.000},
    {"Variable": "RAGENDER[T.M]", "Coef": -0.2737, "P>|z|": 0.000},
    {"Variable": "RwAGEM_B", "Coef": -0.0180, "P>|z|": 0.000},
    {"Variable": "RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.High Risk Occupations]", "Coef": -0.0112, "P>|z|": 0.018},
    {"Variable": "RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.Sales]", "Coef": -0.0047, "P>|z|": 0.047},
]

# Create the DataFrame
summary_df = pd.DataFrame(summary_data)

# Display the DataFrame
summary_df


Unnamed: 0,Variable,Coef,P>|z|
0,RARACEM[T.White],0.2279,0.0
1,RARACEM[T.Others],0.0914,0.02
2,"C(RwJOCCSD, Treatment(reference='Retired'))[T....",0.6047,0.037
3,RwDRINK,0.0299,0.027
4,Intercept,0.3314,0.0
5,RwWORK,0.3314,0.0
6,RAEDYRS,0.0499,0.0
7,RwJPHYS,-0.0401,0.01
8,RwVIGACT,-0.0471,0.0
9,RwMSTAT,-0.0075,0.011


In [238]:
# bestExchForm = baseFormula + ' + ' + ' + '.join([])
# bestARForm = baseFormula + ' + ' + ' + '.join([])
bestIndepForm = baseFormula + ' + ' + ' + '.join(["RAEDYRS", "RwVIGACT", 
                                                 "RwMSTAT", "RwDIAB", "RwPhyLim", "RwCogLim", 
                                                 "RAGENDER"])

# bestExchResult = fitBaseModel(bestExchForm, "HHIDPN", df, exchangeable_corr, sm.families.Binomial())
# print("Exchangeable complete!\n")
# bestARResult = fitBaseModel(bestARForm, "HHIDPN", df, autoregress_corr, sm.families.Binomial())
# print("AR(1) complete!\n")
bestIndepResult = fitBaseModel(bestIndepForm, "HHIDPN", df, indep_corr, sm.families.Binomial())
print("Independent complete!\n")

bestIndepResult.summary()

                               GEE Regression Results                              
Dep. Variable:                   RwRecProp   No. Observations:                80985
Model:                                 GEE   No. clusters:                    18582
Method:                        Generalized   Min. cluster size:                   1
                      Estimating Equations   Max. cluster size:                  11
Family:                           Binomial   Mean cluster size:                 4.4
Dependence structure:         Independence   Num. iterations:                     2
Date:                     Sat, 28 Jun 2025   Scale:                           1.000
Covariance type:                    robust   Time:                         04:39:49
                                                                                               coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------

0,1,2,3
Dep. Variable:,RwRecProp,No. Observations:,80985
Model:,GEE,No. clusters:,18582
Method:,Generalized,Min. cluster size:,1
,Estimating Equations,Max. cluster size:,11
Family:,Binomial,Mean cluster size:,4.4
Dependence structure:,Independence,Num. iterations:,2
Date:,"Sat, 28 Jun 2025",Scale:,1.000
Covariance type:,robust,Time:,04:39:50

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.2885,0.036,35.357,0.000,1.217,1.360
"C(RwJOCCSD, Treatment(reference='Retired'))[T.Farming/Forestry/Fishing]",-0.6155,0.271,-2.273,0.023,-1.146,-0.085
"C(RwJOCCSD, Treatment(reference='Retired'))[T.Food/Personal/Service]",-0.8105,0.127,-6.390,0.000,-1.059,-0.562
"C(RwJOCCSD, Treatment(reference='Retired'))[T.Healthcare]",-1.1144,0.264,-4.220,0.000,-1.632,-0.597
"C(RwJOCCSD, Treatment(reference='Retired'))[T.High Risk Occupations]",-0.1557,0.303,-0.513,0.608,-0.750,0.439
"C(RwJOCCSD, Treatment(reference='Retired'))[T.Management/Clerical/Business]",-0.7243,0.096,-7.547,0.000,-0.912,-0.536
"C(RwJOCCSD, Treatment(reference='Retired'))[T.STEM/Professional/Technical]",-0.3591,0.125,-2.880,0.004,-0.604,-0.115
"C(RwJOCCSD, Treatment(reference='Retired'))[T.Sales]",-0.3214,0.140,-2.288,0.022,-0.597,-0.046
"C(RwJOCCSD, Treatment(reference='Retired'))[T.Skilled Trades/Production/Manual]",-0.8868,0.108,-8.223,0.000,-1.098,-0.675

0,1,2,3
Skew:,0.0546,Kurtosis:,0.1567
Centered skew:,0.0255,Centered kurtosis:,1.0652


In [239]:
bestIndepForm

"RwRecProp ~ RwAGEM_B * C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwVIGACT + RwMSTAT + RwDIAB + RwPhyLim + RwCogLim + RAGENDER"

## 3. Test Error Rate Comparison

In [140]:
from sklearn.metrics import mean_absolute_error
import numpy as np

def getPredError(model, respVar, testSet, maxScore):
    '''
    Get prediction error on the testing set
    '''
    testSet = testSet.reset_index(drop=True)
    pred = model.get_prediction(exog=testSet)
    predicted_prop = pred.predicted_mean
    predicted_value = maxScore * np.asarray(predicted_prop).flatten()
    actual_value = testSet[respVar].to_numpy().flatten()
    
    # Remove rows where either prediction or actual is NaN
    mask = ~np.isnan(predicted_value) & ~np.isnan(actual_value)
    predicted_value = predicted_value[mask]
    actual_value = actual_value[mask]
    
    print("Actual:", actual_value)
    print("Predicted:", predicted_value)
    MAE_loss = mean_absolute_error(actual_value, predicted_value)
    print(MAE_loss)

getPredError(bestARResult, "RwTR20", df, 20)

ValueError: operands could not be broadcast together with shapes (4620,) (4639,) 

## 3. Automated Hypothesis Testing

In [248]:
import re

# ## A List of parameters
param_names = bestIndepResult.params.index.tolist()

# ## Create joint hypothesis tests for "No interactions"
interaction_pattern = re.compile(r"RwAGEM_B:C\(RwJOCCSD,")
interaction_indices = [i for i, name in enumerate(param_names) if interaction_pattern.search(name)]
interaction_matrix = np.zeros((len(interaction_indices), len(param_names)))
for row, idx in enumerate(interaction_indices):
    interaction_matrix[row, idx] = 1

## Create joint hypothesis tests for "No main effect"
main_pattern = re.compile(r"^C\(RwJOCCSD,")
main_indices = [i for i, name in enumerate(param_names) if main_pattern.search(name)]
main_matrix = np.zeros((len(main_indices), len(param_names)))
for row, idx in enumerate(main_indices):
    main_matrix[row, idx] = 1

# ## Create the hypothesis test for STEM superior
# stem_matrix = np.zeros((1, len(param_names)))
# hypo_coef = [-1, -1, -1, -1, -1, 7, -1, -1]
# for j in list(range(0, len(main_indices))):
#     stem_matrix[0, main_indices[j]] = hypo_coef[j]

# ## Create the hypothesis test for Management superior
# mgmt_matrix = np.zeros((1, len(param_names)))
# hypo_coef = [-1, -1, -1, -1, 7, -1, -1, -1]
# for j in list(range(0, len(main_indices))):
#     mgmt_matrix[0, main_indices[j]] = hypo_coef[j]


# ## Create the hypothesis test for food inferior
# food_matrix = np.zeros((1, len(param_names)))
# hypo_coef = [-1, 7, -1, -1, -1, -1, -1, -1]
# for j in list(range(0, len(main_indices))):
#     food_matrix[0, main_indices[j]] = hypo_coef[j]

# ## Create the hypothesis test for farm inferior
# farm_matrix = np.zeros((1, len(param_names)))
# hypo_coef = [7, -1, -1, -1, -1, -1, -1, -1]
# for j in list(range(0, len(main_indices))):
#     farm_matrix[0, main_indices[j]] = hypo_coef[j]


In [240]:
def testSummary(r_matrix, model):
    wald_res = model.wald_test(r_matrix)
    print("Statistic:", wald_res.statistic[0,0])
    print("Degrees of freedom:", wald_res.df_denom)
    print("p-value:", wald_res.pvalue)
    print("Distribution:", wald_res.distribution)

def serialTest(model, varPattern):
    '''
    Test multipe similar null hypothesis independently
    '''
    param_names = model.params.index.tolist()

    # Indices of interested parameters
    main_pattern = re.compile(varPattern)
    main_indices = [i for i, name in enumerate(param_names) if main_pattern.search(name)]

    l = len(main_indices)
    for j in list(range(0, l)):
        ## Hypothesis Coefficients
        hypo_coef = np.full((1, l), -1/l)
        hypo_coef[0, j] = 1

        ## Map hypo_coef to the parameter
        r_matrix = np.zeros((1, len(param_names)))

        # print(hypo_coef[0,1])
        # print(main_indices)
        # print(r_matrix)
        for k in list(range(0, len(main_indices))):
            r_matrix[0, main_indices[k]] = hypo_coef[0,k]

        ## Net effect estimate
        netEffect = (r_matrix @ np.array(model.params))[0]
        print(f"Net effect of {param_names[main_indices[j]]} is {netEffect}")
        
        ## Wald test 
        testSummary(r_matrix, model)
        print("\n\n")

    
# testSummary(interaction_matrix, bestARResult)
serialTest(bestIndepResult, "^C\(RwJOCCSD,")

Net effect of C(RwJOCCSD, Treatment(reference='Retired'))[T.Farming/Forestry/Fishing] is -0.06894547102285697
Statistic: 0.06263578141075488
Degrees of freedom: 1.0
p-value: 0.8023774603258461
Distribution: chi2



Net effect of C(RwJOCCSD, Treatment(reference='Retired'))[T.Food/Personal/Service] is -0.2883374708060271
Statistic: 4.221754213518139
Degrees of freedom: 1.0
p-value: 0.039908880767751106
Distribution: chi2



Net effect of C(RwJOCCSD, Treatment(reference='Retired'))[T.Healthcare] is -0.6302088552187908
Statistic: 5.492568548303178
Degrees of freedom: 1.0
p-value: 0.019097466494387223
Distribution: chi2



Net effect of C(RwJOCCSD, Treatment(reference='Retired'))[T.High Risk Occupations] is 0.4482378627486824
Statistic: 2.1270050809326753
Degrees of freedom: 1.0
p-value: 0.1447232827842026
Distribution: chi2



Net effect of C(RwJOCCSD, Treatment(reference='Retired'))[T.Management/Clerical/Business] is -0.19136888896300053
Statistic: 2.841649852858129
Degrees of freedom: 1.



In [None]:
print("Interaction Effect")
testSummary(interaction_matrix, bestIndepResult)

print("Main Effect")
testSummary(main_matrix, bestIndepResult)

Interaction Effect
Statistic: 183.24279260056883
Degrees of freedom: 8.0
p-value: 2.1452051145196688e-35
Distribution: chi2
Main Effect
Statistic: 162.99028452218164
Degrees of freedom: 8.0
p-value: 3.788229157868425e-31
Distribution: chi2




In [None]:
## Multiple tests on Interaction Terms
serialTest(bestARResult, "^RwAGEM_B:C\(RwJOCCSD,")

Net effect of RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.Farming/Forestry/Fishing] is 0.035807682949691555
Statistic: 1.1138568383756848
Degrees of freedom: 1.0
p-value: 0.29124509153381184
Distribution: chi2



Net effect of RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.Food/Personal/Service] is -0.006149995862626768
Statistic: 0.23417399573963538
Degrees of freedom: 1.0
p-value: 0.6284459975690349
Distribution: chi2



Net effect of RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.Healthcare] is 0.008672970360212976
Statistic: 0.2950677957107403
Degrees of freedom: 1.0
p-value: 0.5869911207533771
Distribution: chi2



Net effect of RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.High Risk Occupations] is -0.05018521275236207
Statistic: 1.1111731732317598
Degrees of freedom: 1.0
p-value: 0.29182706885164766
Distribution: chi2



Net effect of RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.Management/Clerical/Business] is 0.0030246022191070523


In [None]:
# Retrieve the covariance matrices of the fitted GEE models
# cov_exch = resultsExch.cov_params()
# cov_ar = resultsAR.cov_params()
# cov_unstruct = resultsUnstruct.cov_params()

# print("Covariance matrix (Exchangeable):\n", cov_exch)
# print("\nCovariance matrix (Autoregressive):\n", cov_ar)
# print("\nCovariance matrix (Unstructured):\n", cov_unstruct)