In [1]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

hrsWave = pd.read_csv("../Data/hrsWaveCleaned.csv")

# n = 1000
# hhidpn = np.random.choice(hrsWave["HHIDPN"].unique(), size = n)
# idx = hrsWave["HHIDPN"].isin(hhidpn)
# df = hrsWave.loc[idx, :]

gss = GroupShuffleSplit(n_splits=2, test_size = 0.3)
groupVar = hrsWave["HHIDPN"]
# gss.split(X = hrsWave, groups=groupVar)
shuffle_1, shuffle_2 = gss.split(X = hrsWave, groups=groupVar)
trainIndex, testIndex = shuffle_1
print(trainIndex)
print(testIndex)

## Subset Training and Test Set
df = hrsWave.iloc[trainIndex, :]
testSet = hrsWave.iloc[testIndex, :]

# 1. Define the Binomial family and logit link
# The `Binomial` family in statsmodels assumes endog is proportions (e.g., successes/n_trials)
# or a 2-column array where col 0 is successes and col 1 is failures.
# Since your score is 0-20, you should pass it as a two-column array: [recall_score, 20 - recall_score]


# Create a 2-column array for endog if using the formula API with a non-standard endog
# For GEE with Binomial, endog usually expects a proportion (successes/n_trials) or a (successes, total_trials) tuple/array
# If using `smf.gee`, it's often more straightforward to define `endog` as proportion.
n_trials = 20
df['RwRecProp'] = df['RwTR20'] / n_trials
testSet['RwRecProp'] = testSet['RwTR20'] / n_trials

[     0      1      2 ... 130682 130685 130686]
[    23     38     39 ... 130646 130683 130684]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RwRecProp'] = df['RwTR20'] / n_trials
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testSet['RwRecProp'] = testSet['RwTR20'] / n_trials


In [2]:
print(df.shape)
print(testSet.shape)

(91356, 47)
(39331, 47)


In [125]:
df.to_csv("../Data/hrsTrain.csv")
testSet.to_csv("../Data/hrsTest.csv")

## 1. Fitting the Base Model

In [None]:
def fitBaseModel(formula, groups, df, covStruct, family):
    model = smf.gee(formula, groups=groups, data=df,
                    cov_struct=covStruct,
                    family=family).fit(cov_type = "robust")
    
    print(model.summary())
    
    return model

# 2. Define the exchangeable correlation structure
exchangeable_corr = sm.cov_struct.Exchangeable()
autoregress_corr = sm.cov_struct.Autoregressive()
indep_corr = sm.cov_struct.Independence()


# 3. Specify Full Model Formula
formulaBase = "RwRecProp ~ RwAGEM_B + C(RwJOCCSD, Treatment(reference='Retired'))"

# 4. Fit the model with robust covariance
resultsExch = fitBaseModel(formulaBase, "HHIDPN", df, exchangeable_corr, sm.families.Binomial())
resultsAR = fitBaseModel(formulaBase, "HHIDPN", df, autoregress_corr, sm.families.Binomial())
resultsIndep = fitBaseModel(formulaBase, "HHIDPN", df, indep_corr, sm.families.Binomial())



                               GEE Regression Results                              
Dep. Variable:                   RwRecProp   No. Observations:                81174
Model:                                 GEE   No. clusters:                    18613
Method:                        Generalized   Min. cluster size:                   1
                      Estimating Equations   Max. cluster size:                  11
Family:                           Binomial   Mean cluster size:                 4.4
Dependence structure:         Exchangeable   Num. iterations:                    10
Date:                     Thu, 03 Jul 2025   Scale:                           1.000
Covariance type:                    robust   Time:                         03:27:48
                                                                                      coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------

### 1.2 Test VIF Calculation

In [None]:
# print(resultsExch.model.exog[1:6,:])
# print(len(resultsExch.model.exog_names))
# print(resultsExch.model.exog.shape)

# print(X_design_df)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
import patsy

X_mat = resultsExch.model.exog
X_names = resultsExch.model.exog_names
X_design_df = pd.DataFrame(X_mat, columns = X_names)
print("Design Matrix (X) columns:", X_design_df.columns.tolist())

vif_threshold = 10

# intercept_column_name = 'Intercept' # Default name by patsy/statsmodels
# if intercept_column_name in X_design_df.columns:
#     X_for_vif = X_design_df.drop(intercept_column_name, axis=1)
#     print(f"Dropped '{intercept_column_name}' for VIF calculation.")
# else:
#     print(f"Warning: Could not find column named '{intercept_column_name}' to drop for VIF.")

vif_data = pd.DataFrame()
vif_data["features"] = X_design_df.columns
vif_data["VIF"] = [variance_inflation_factor(X_design_df.values, i) for i in range(X_design_df.shape[1])]

print(vif_data)

Design Matrix (X) columns: ['Intercept', "C(RwJOCCSD, Treatment(reference='Retired'))[T.Farming/Forestry/Fishing]", "C(RwJOCCSD, Treatment(reference='Retired'))[T.Food/Personal/Service]", "C(RwJOCCSD, Treatment(reference='Retired'))[T.Healthcare]", "C(RwJOCCSD, Treatment(reference='Retired'))[T.High Risk Occupations]", "C(RwJOCCSD, Treatment(reference='Retired'))[T.Management/Clerical/Business]", "C(RwJOCCSD, Treatment(reference='Retired'))[T.STEM/Professional/Technical]", "C(RwJOCCSD, Treatment(reference='Retired'))[T.Sales]", "C(RwJOCCSD, Treatment(reference='Retired'))[T.Skilled Trades/Production/Manual]", 'RwAGEM_B']
                                            features        VIF
0                                          Intercept  81.368272
1  C(RwJOCCSD, Treatment(reference='Retired'))[T....   1.005279
2  C(RwJOCCSD, Treatment(reference='Retired'))[T....   1.033572
3  C(RwJOCCSD, Treatment(reference='Retired'))[T....   1.012888
4  C(RwJOCCSD, Treatment(reference='Retired'))[T...

In [84]:
import re

# [var for var in indepVars if var not in df.columns.to_list()]
# Extract the string that precedes '[T.' in the given variable name
# var_name = 'RwLIVBRO'
# match = re.match(r"(.+)\[T\.", var_name)
# print(match)
# if match:
#     preceding_str = match.group(1)
#     print(preceding_str)
# else:
#     print("No match found.")


indepVars = [
    'RwAGEM_B', "RwJOCCSD",
    'RwJHOURS', 'RwWGIHR']

indepVars.remove('RwAGEM_B')
print(indepVars)

['RwJOCCSD', 'RwJHOURS', 'RwWGIHR']


In [99]:
def vif_elimination(df, indepVars, depVar, varsToKeep, threshold):
    ''' 
    Calculate variance inflation factor (VIF) of each independent variables, 
    Remove those with VIF that exceed the threshold

    Returns:
    A list of variables whose VIF are less than the pre-specified threshold
    '''
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    import patsy
    
    X_design_df = patsy.dmatrices(
                formula_like = depVar + "~" + ' + '.join(indepVars),
                data = df, 
                return_type="dataframe")[1]
    
    columns_to_keep = []
    for var in varsToKeep:
        # For categorical variables, patsy expands them with "C(var, ...)[T.level]" or similar
        # So we check if the column name starts with the variable name or matches exactly
        columns_to_keep.extend([col for col in X_design_df.columns if col == var or col.startswith(f"C({var},") or col.startswith(f"{var}[T.")])
    print("Columns to be kept for indepVars:", columns_to_keep)

    multicolinear = True

    while multicolinear:

        ## Calculate VIF for each variable and dummy variable
        vif_data = pd.DataFrame()
        vif_data["features"] = X_design_df.columns
        vif_data["VIF"] = [variance_inflation_factor(X_design_df.values, i) for i in range(X_design_df.shape[1])]
        

        vif_data.set_index("features", inplace=True)
        # print(vif_data)
        ## Sort variables by VIF values, accept for variables to keep
        vif_ranking = vif_data.drop(index= columns_to_keep, axis = 0).sort_values(by = "VIF", ascending=False)
        # print(vif_ranking)
        ## Isolate the variable with the highest variance
        inflatorIdx = vif_ranking.index[0]
        # print(inflatorIdx)
        # print(vif_ranking.iloc[0, 0] <= threshold)
        if vif_ranking.iloc[0, 0] >= threshold:
            ## Get inflator variable full name
            # print(inflatorIdx)
            
            ## Get the truncated name
            try:
                match = re.match(r"(.+)\[T\.", inflatorIdx)
                varInflator = match.group(1)
                indepVars.remove(varInflator)
                print(f"Removed {varInflator}")
            except:
                print("Fine, not > 2 categorical values")
                ## Remove the top inflator above the threshold
                indepVars.remove(inflatorIdx)
                print(f"Removed {inflatorIdx}")

            ## Update design matrix
            # Get the list of column names in the design matrix that correspond to indepVars (including expanded categorical variables)
            # This will match the base variable name or its dummy expansion
            X_design_df = patsy.dmatrices(
                formula_like = depVar + "~" + ' + '.join(indepVars),
                data = df, 
                return_type="dataframe")[1]
            print(f"Current predictor list: {indepVars}")
        else:
            print("All VIFs are under the threshold!")
            break

    print(vif_ranking)
    return indepVars

test_df = df.head(500)
indepVars = [
    'RwAGEM_B', "RwJOCCSD",
    'RwJHOURS', 'RwWGIHR',
    'RwJPHYS',  'RwJSTRES', 'RwJSIGHT', 'RwCENREG',
    'RwMSTAT', 'RwLIVBRO', 'RwHIBP', 'RwDIAB', 'RwCANCR',
    'RwLUNG', 'RwHEART', 'RwSTROK', 'RwPSYCH', 'RwVIGACT', 'RwSMOKEV',
    'RwDRINK', 'RwPhyLim', 'RwCogLim', 'RwJLIFT', 'RwJSTOOP',
    # 'RwLOST', 'RwWANDER', 'RwHALUC', 'RwALONE', 'RwAnyCogImp', 
    'HwATOTB', 
    'HwADEBT', 'HwACHCK','HwAMRTB', 'HwITOT', 
    'RAGENDER', 'RARACEM', 'RAEDYRS', 'RAEVBRN']
depVar = "RwTR20"
varsToKeep = ['Intercept', 'RwAGEM_B', "RwJOCCSD", "RAEDYRS"]
resultVars = vif_elimination(df, indepVars, depVar, varsToKeep, 5)
print(resultVars)

Columns to be kept for indepVars: ['Intercept', 'RwAGEM_B', 'RwJOCCSD[T.Food/Personal/Service]', 'RwJOCCSD[T.Healthcare]', 'RwJOCCSD[T.High Risk Occupations]', 'RwJOCCSD[T.Management/Clerical/Business]', 'RwJOCCSD[T.Retired]', 'RwJOCCSD[T.STEM/Professional/Technical]', 'RwJOCCSD[T.Sales]', 'RwJOCCSD[T.Skilled Trades/Production/Manual]', 'RAEDYRS']


  return 1 - self.ssr/self.centered_tss


All VIFs are under the threshold!
                        VIF
features                   
HwATOTB            1.754046
HwITOT             1.598110
RwJPHYS            1.552152
HwACHCK            1.541683
RAGENDER[T.M]      1.493098
RARACEM[T.White]   1.483961
RwJSTOOP           1.462164
RwJLIFT            1.444136
RwJHOURS           1.344055
RARACEM[T.Others]  1.326010
RwMSTAT            1.323283
RwJSTRES           1.206301
RwCogLim[T.True]   1.143392
RwDRINK            1.125985
RwSMOKEV           1.110009
RwHEART            1.105007
RAEVBRN            1.094125
RwHIBP             1.093638
RwDIAB             1.076150
RwPhyLim[T.True]   1.074567
RwLIVBRO[T.3--8]   1.062533
RwPSYCH            1.062398
RwVIGACT           1.058559
RwLUNG             1.053380
RwJSIGHT           1.048367
HwADEBT            1.043102
RwSTROK            1.039314
RwCANCR            1.037396
HwAMRTB            1.036789
RwCENREG           1.033807
RwLIVBRO[T.9--14]  1.019082
RwWGIHR            1.009383
['RwAGEM_B', '

In [100]:
cols = [
    "RwJLIFT", "RwJSTOOP", "RwLUNG", "RwAnyCogImp",
    "RwLOST", "RwWANDER", "RwHALUC", "RwALONE", "HwADEBT"
]
missing_pct =df[cols].isna().mean() * 100
print(missing_pct)

RwJLIFT        80.653706
RwJSTOOP       80.660274
RwLUNG          0.000000
RwAnyCogImp     0.000000
RwLOST          0.000000
RwWANDER        0.000000
RwHALUC         0.000000
RwALONE         0.000000
HwADEBT         0.000000
dtype: float64


## 2. Variable Selection by QIC:

In [114]:
# Calculate QIC for each model as a measure of goodness of fit
def calc_qic(model_result):
    # QIC = -2 * quasi-likelihood + 2 * trace(V_hat^-1 * V_model)
    # statsmodels does not provide QIC directly, but we can use qic() if available
    try:
        return model_result.qic()[0]
    except AttributeError:
        return np.nan

qic_exch = calc_qic(resultsExch)
qic_ar = calc_qic(resultsAR)
qic_indep = calc_qic(resultsIndep)

print(f"QIC (Exchangeable): {qic_exch}")
print(f"QIC (Autoregressive): {qic_ar}")
print(f"QIC (Unstructured): {qic_indep}")

# The base model with independence covariance structure yields the lowest QIC, marking the best performing base model.



QIC (Exchangeable): 10289.840917145659
QIC (Autoregressive): 10285.401940937447
QIC (Unstructured): 10120.574391687638


In [101]:
import gc
gc.collect()

0

In [126]:
fullFormula

"RwRecProp ~ RwAGEM_B + RAEDYRS + C(RwJOCCSD, Treatment(reference='Retired')) + RwCENREG + RwLUNG + RARACEM + RwJLIFT + RwPhyLim + RAEVBRN + RwSMOKEV + RwJPHYS + RwJHOURS + RwLIVBRO + RwJSIGHT + RwDRINK + RwCogLim + RwMSTAT + RwSTROK + RwCANCR + RwJSTRES + HwITOT + RAGENDER + RwHIBP + RwWGIHR + RwJSTOOP + HwAMRTB + HwACHCK + RwVIGACT + HwATOTB + RwPSYCH + HwADEBT + RwDIAB + RwHEART"

In [115]:
import re
import numpy as np

def stepwise_selection_qic_forward(data, groups, cov_struct, family, start_formula, end_formula, verbose=True, qic_threshold=0):
    """
    Perform forward feature selection based on QIC for GEE models.

    Parameters:
        data: pandas.DataFrame
            The dataset containing all variables used in the formulas.
        groups: array-like
            Grouping variable for GEE (e.g., subject or cluster IDs).
        cov_struct: statsmodels.genmod.cov_struct.CovStruct
            Covariance structure for GEE (e.g., Exchangeable, Autoregressive).
        family: statsmodels.genmod.families.Family
            The family object for GEE (e.g., Gaussian, Binomial).
        start_formula: str
            The starting model formula (patsy syntax).
        end_formula: str
            The full model formula (patsy syntax, includes all candidate variables).
        verbose: bool, optional
            If True, prints progress at each step.
        qic_threshold: float, optional
            Minimum QIC improvement required to continue selection.

    Returns:
        best_formula: str, formula of the best model found
        best_result: fitted GEE result object
        history: list of (formula, QIC)
    """
    import statsmodels.formula.api as smf

    def get_terms(formula):
        rhs = formula.split('~')[1]
        terms = [t.strip() for t in re.split(r'\s*\+\s*', rhs) if t.strip() != '']
        terms = [t for t in terms if t != '1']
        return set(terms)

    def build_formula(lhs, terms):
        if not terms:
            return f"{lhs} ~ 1"
        return f"{lhs} ~ {' + '.join(sorted(terms))}"

    def calc_qic(result):
        try:
            return result.qic(scale=1)[0]
        except Exception:
            return np.nan

    lhs = start_formula.split('~')[0].strip()
    start_terms = get_terms(start_formula)
    end_terms = get_terms(end_formula)
    current_terms = set(start_terms)
    history = []

    # Fit initial model
    current_formula = build_formula(lhs, current_terms)
    model = smf.gee(current_formula, groups=groups, data=data, cov_struct=cov_struct, family=family)
    result = model.fit(cov_type="robust")
    best_qic = calc_qic(result)
    best_formula = current_formula
    best_result = result
    history.append((current_formula, best_qic))

    improved = True
    while improved:
        improved = False
        qic_candidates = []
        formulas = []
        term_changes = []
        for term in sorted(end_terms - current_terms):
            new_terms = current_terms | {term}
            formula = build_formula(lhs, new_terms)
            try:
                model = smf.gee(formula, groups=groups, data=data, cov_struct=cov_struct, family=family)
                result = model.fit(cov_type="robust")
                qic = calc_qic(result)
            except Exception:
                qic = np.nan
            qic_candidates.append(qic)
            formulas.append(formula)
            term_changes.append(('add', term))

            print(f"Candidate Term: {term}\n")
            print(f"Candidate Term's QIC: {qic}\n")
            print(f"Candidate formula: {formula}\n")

            

        if qic_candidates:
            if np.all(np.isnan(qic_candidates)):
                if verbose:
                    print("All QIC candidates are NaN, stopping.")
                break
            min_idx = np.nanargmin(qic_candidates)
            min_qic = qic_candidates[min_idx]
            if (best_qic - min_qic) > qic_threshold:
                improved = True
                best_qic = min_qic
                best_formula = formulas[min_idx]
                action, term = term_changes[min_idx]
                
                current_terms.add(term)
                history.append((best_formula, best_qic))
                
                if verbose:
                    print(f"Step: {action}, QIC: {best_qic:.2f}, Formula: {best_formula}")
            else:
                if verbose:
                    print("No QIC improvement above threshold, stopping.")
        else:
            if verbose:
                print("No candidates left, stopping.")
    
    model = smf.gee(best_formula, groups=groups, data=data, cov_struct=cov_struct, family=family)
    best_result = model.fit(cov_type="robust")
    return best_formula, best_result, history

In [None]:
controlVars = set(resultVars)
controlVars = controlVars - set(['RwAGEM_B', 'RwJOCCSD', 'RAEDYRS'])

baseFormula = "RwRecProp ~ RwAGEM_B + RAEDYRS + C(RwJOCCSD, Treatment(reference='Retired'))"
fullFormula = baseFormula +  ' + ' + ' + '.join(controlVars)


465.0

In [None]:
exchangeable_corr = sm.cov_struct.Exchangeable()
bestExchForm, bestExchResult, ExchHistory =\
    stepwise_selection_qic_forward(df, df["HHIDPN"], exchangeable_corr, 
                        sm.families.Binomial(), 
                        baseFormula, fullFormula, 
                        verbose=True, 
                        qic_threshold=10)

# bestARForm, bestARResult, ARHistory =\
#     stepwise_selection_qic_forward(df, df["HHIDPN"], autoregress_corr, 
#                         sm.families.Binomial(), 
#                         baseFormula, fullFormula, 
#                         verbose=True, 
#                         qic_threshold=10)

# bestIndepForm, bestIndepResult, IndepHistory =\
#     stepwise_selection_qic_forward(df, df["HHIDPN"], indep_corr, 
#                         sm.families.Binomial(), 
#                         baseFormula, fullFormula, 
#                         verbose=False, 
#                         qic_threshold=10)

Candidate Term: HwACHCK

Candidate Term's QIC: 9165.966348275875

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + HwACHCK + RAEDYRS + RwAGEM_B

Candidate Term: HwADEBT

Candidate Term's QIC: 9319.731277866787

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + HwADEBT + RAEDYRS + RwAGEM_B

Candidate Term: HwAMRTB

Candidate Term's QIC: 9318.416394591974

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + HwAMRTB + RAEDYRS + RwAGEM_B

Candidate Term: HwATOTB

Candidate Term's QIC: 8796.45196097415

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + HwATOTB + RAEDYRS + RwAGEM_B

Candidate Term: HwITOT

Candidate Term's QIC: 9259.990945258314

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + HwITOT + RAEDYRS + RwAGEM_B

Candidate Term: RAEVBRN

Candidate Term's QIC: 9296.852687156708

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired'



Candidate Term: RwJOCCSD

Candidate Term's QIC: -144338.8861388969

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwJOCCSD + RwWGIHR

Candidate Term: RwJPHYS

Candidate Term's QIC: 1394.4168754699424

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwJPHYS + RwWGIHR

Candidate Term: RwJSIGHT

Candidate Term's QIC: 1401.7289885543196

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwJSIGHT + RwWGIHR

Candidate Term: RwJSTOOP

Candidate Term's QIC: 1395.0656155753152

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwJSTOOP + RwWGIHR

Candidate Term: RwJSTRES

Candidate Term's QIC: 1413.6355663132515

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwJSTRES + RwWGIHR

Candidate Term: RwLIVBRO

Candidate Term's QIC: 1521.452981799016

Candi



Candidate Term: HwAMRTB

Candidate Term's QIC: 1569.6424648087848

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + HwAMRTB + RAEDYRS + RwAGEM_B + RwJOCCSD + RwWGIHR

Candidate Term: HwATOTB

Candidate Term's QIC: 1302.6848419570138

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + HwATOTB + RAEDYRS + RwAGEM_B + RwJOCCSD + RwWGIHR

Candidate Term: HwITOT

Candidate Term's QIC: 1535.4777353480324

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + HwITOT + RAEDYRS + RwAGEM_B + RwJOCCSD + RwWGIHR





Candidate Term: RAEVBRN

Candidate Term's QIC: 1480.0484738447685

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RAEVBRN + RwAGEM_B + RwJOCCSD + RwWGIHR

Candidate Term: RAGENDER

Candidate Term's QIC: 1392.828777681813

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RAGENDER + RwAGEM_B + RwJOCCSD + RwWGIHR

Candidate Term: RARACEM

Candidate Term's QIC: 1977.462196755634

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RARACEM + RwAGEM_B + RwJOCCSD + RwWGIHR

Candidate Term: RwCANCR

Candidate Term's QIC: 1487.3279392904385

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwCANCR + RwJOCCSD + RwWGIHR

Candidate Term: RwCENREG

Candidate Term's QIC: 2446.112624091543

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwCENREG + RwJOCCSD + RwWGIHR

Candidate Term: RwCogLim



Candidate Term: RwDRINK

Candidate Term's QIC: 2330.43569504728

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwDRINK + RwJOCCSD + RwWGIHR

Candidate Term: RwHEART

Candidate Term's QIC: 1512.9480015730978

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwHEART + RwJOCCSD + RwWGIHR





Candidate Term: RwHIBP

Candidate Term's QIC: 2144.461324923861

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwHIBP + RwJOCCSD + RwWGIHR

Candidate Term: RwJHOURS

Candidate Term's QIC: 1420.5126698694462

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwJHOURS + RwJOCCSD + RwWGIHR

Candidate Term: RwJLIFT

Candidate Term's QIC: 1395.463916881579

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwJLIFT + RwJOCCSD + RwWGIHR

Candidate Term: RwJPHYS

Candidate Term's QIC: 1394.4432101097987

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwJOCCSD + RwJPHYS + RwWGIHR

Candidate Term: RwJSIGHT

Candidate Term's QIC: 1404.8216768789089

Candidate formula: RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwJOCCSD + RwJSIGHT + RwWGIHR

Candidate Term: RwJSTOOP


KeyboardInterrupt: 

In [117]:
autoregress_corr = sm.cov_struct.Autoregressive()
bestARForm, bestARResult, ARHistory =\
    stepwise_selection_qic_forward(df, df["HHIDPN"], autoregress_corr, 
                        sm.families.Binomial(), 
                        baseFormula, fullFormula, 
                        verbose=True, 
                        qic_threshold=10)
indep_corr = sm.cov_struct.Independence()
bestIndepForm, bestIndepResult, IndepHistory =\
    stepwise_selection_qic_forward(df, df["HHIDPN"], indep_corr, 
                        sm.families.Binomial(), 
                        baseFormula, fullFormula, 
                        verbose=False, 
                        qic_threshold=10)



KeyboardInterrupt: 

In [164]:
print("Best formula of Exchangeble Covariance:", bestExchForm)
print("Best formula of AR(1) Covariance:", bestARForm)
print("Best formula of Indepedence Covariance:", bestIndepForm)

print("Best qic of Exchangeble Covariance:", ExchHistory[-1][1])
print("Best qic of AR(1) Covariance:", ARHistory[-1][1])
print("Best qic of Indepedence Covariance:", IndepHistory[-1][1])

Best formula of Exchangeble Covariance: RwRecProp ~ RwAGEM_B * C(RwJOCCSD, Treatment(reference='Retired')) + RwJLIFT + RwSMOKEV
Best formula of AR(1) Covariance: RwRecProp ~ RAEDYRS + RAGENDER + RwAGEM_B * C(RwJOCCSD, Treatment(reference='Retired'))
Best formula of Indepedence Covariance: RwRecProp ~ RwAGEM_B * C(RwJOCCSD, Treatment(reference='Retired')) + RwWGIHR
Best qic of Exchangeble Covariance: -408.78824872787925
Best qic of AR(1) Covariance: 476.7440560651559
Best qic of Indepedence Covariance: 87.33868652831293


In [27]:
print("Best formula of Exchangeble Covariance:", bestExchResult.summary().tables[1])
print("Best formula of AR(1) Covariance:", bestARResult.summary().tables[1])
print("Best formula of Indepedence Covariance:", bestIndepResult.summary().tables[1])

                                                                                               coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------------------------------------------------------------
Intercept                                                                                    0.3182        nan        nan        nan         nan         nan
C(RwJOCCSD, Treatment(reference='Retired'))[T.Farming/Forestry/Fishing]                     -0.5736        nan        nan        nan         nan         nan
C(RwJOCCSD, Treatment(reference='Retired'))[T.Food/Personal/Service]                         1.0898        nan        nan        nan         nan         nan
C(RwJOCCSD, Treatment(reference='Retired'))[T.Healthcare]                                    0.7448        nan        nan        nan         nan         nan
C(RwJOCCSD, Treatment(reference='Retired'))[T.High Risk Oc

In [113]:
bestExchForm = "RwRecProp ~ C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwAGEM_B + RwJLIFT + RwWGIHR"

### 2.2 Manual Variable Selection

In [193]:
# resultsExchFull = fitBaseModel(fullFormula, "HHIDPN", df, exchangeable_corr, sm.families.Binomial())
# print("Exchangeable complete!\n")
# try:
#     resultsARFull = fitBaseModel(fullFormula, "HHIDPN", df, autoregress_corr, sm.families.Binomial())
#     print("AR(1) complete!\n")
# except ValueError:
#     print("Value Error")
resultsIndepFull = fitBaseModel(fullFormula, "HHIDPN", df, indep_corr, sm.families.Binomial())
print("Independent complete!\n")

                               GEE Regression Results                              
Dep. Variable:                   RwRecProp   No. Observations:                13695
Model:                                 GEE   No. clusters:                     6582
Method:                        Generalized   Min. cluster size:                   1
                      Estimating Equations   Max. cluster size:                  11
Family:                           Binomial   Mean cluster size:                 2.1
Dependence structure:         Independence   Num. iterations:                     3
Date:                     Sat, 28 Jun 2025   Scale:                           1.000
Covariance type:                    robust   Time:                         04:04:00
                                                                                               coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------

In [184]:
qic_exch_full = calc_qic(resultsExchFull)
# qic_ar_full = calc_qic(resultsARFull)
qic_indep_full = calc_qic(resultsIndepFull)

print(f"QIC (Exchangeable): {qic_exch_full}")
# print(f"QIC (Autoregressive): {qic_ar_full}")
print(f"QIC (Unstructured): {qic_indep_full}")



QIC (Exchangeable): nan
QIC (Unstructured): 1315.209855127156


In [None]:
import pandas as pd

# Manually enter the summary table as a list of dictionaries
summary_data = [
    {"Variable": "RARACEM[T.White]", "Coef": 0.2279, "P>|z|": 0.000},
    {"Variable": "RARACEM[T.Others]", "Coef": 0.0914, "P>|z|": 0.020},
    {"Variable": "C(RwJOCCSD, Treatment(reference='Retired'))[T.High Risk Occupations]", "Coef": 0.6047, "P>|z|": 0.037},
    {"Variable": "RwDRINK", "Coef": 0.0299, "P>|z|": 0.027},
    {"Variable": "Intercept", "Coef": 0.3314, "P>|z|": 0.000},
    {"Variable": "RwWORK", "Coef": 0.3314, "P>|z|": 0.000},
    {"Variable": "RAEDYRS", "Coef": 0.0499, "P>|z|": 0.000},
    {"Variable": "RwJPHYS", "Coef": -0.0401, "P>|z|": 0.010},
    {"Variable": "RwVIGACT", "Coef": -0.0471, "P>|z|": 0.000},
    {"Variable": "RwMSTAT", "Coef": -0.0075, "P>|z|": 0.011},
    {"Variable": "RwDIAB", "Coef": -0.0544, "P>|z|": 0.002},
    {"Variable": "RwPhyLim[T.True]", "Coef": -0.0727, "P>|z|": 0.000},
    {"Variable": "RwCogLim[T.True]", "Coef": -0.0882, "P>|z|": 0.000},
    {"Variable": "RAGENDER[T.M]", "Coef": -0.2737, "P>|z|": 0.000},
    {"Variable": "RwAGEM_B", "Coef": -0.0180, "P>|z|": 0.000},
    {"Variable": "RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.High Risk Occupations]", "Coef": -0.0112, "P>|z|": 0.018},
    {"Variable": "RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.Sales]", "Coef": -0.0047, "P>|z|": 0.047},
]

# Create the DataFrame
summary_df = pd.DataFrame(summary_data)

# Display the DataFrame
summary_df


Unnamed: 0,Variable,Coef,P>|z|
0,RARACEM[T.White],0.2279,0.0
1,RARACEM[T.Others],0.0914,0.02
2,"C(RwJOCCSD, Treatment(reference='Retired'))[T....",0.6047,0.037
3,RwDRINK,0.0299,0.027
4,Intercept,0.3314,0.0
5,RwWORK,0.3314,0.0
6,RAEDYRS,0.0499,0.0
7,RwJPHYS,-0.0401,0.01
8,RwVIGACT,-0.0471,0.0
9,RwMSTAT,-0.0075,0.011


In [257]:
# bestExchForm = baseFormula + ' + ' + ' + '.join([])
# bestARForm = baseFormula + ' + ' + ' + '.join([])
bestIndepForm = baseFormula + ' + ' + ' + '.join(["RARACEM", "RAEDYRS", "RwVIGACT", "RwDRINK", 
                                                 "RwMSTAT", "RwDIAB", "RwPhyLim", "RwCogLim", 
                                                 "RAGENDER"])

# bestExchResult = fitBaseModel(bestExchForm, "HHIDPN", df, exchangeable_corr, sm.families.Binomial())
# print("Exchangeable complete!\n")
# bestARResult = fitBaseModel(bestARForm, "HHIDPN", df, autoregress_corr, sm.families.Binomial())
# print("AR(1) complete!\n")
bestIndepResult = fitBaseModel(bestIndepForm, "HHIDPN", df, indep_corr, sm.families.Binomial())
print("Independent complete!\n")

# bestIndepResult.summary()

                               GEE Regression Results                              
Dep. Variable:                   RwRecProp   No. Observations:                80942
Model:                                 GEE   No. clusters:                    18555
Method:                        Generalized   Min. cluster size:                   1
                      Estimating Equations   Max. cluster size:                  11
Family:                           Binomial   Mean cluster size:                 4.4
Dependence structure:         Independence   Num. iterations:                     2
Date:                     Sat, 28 Jun 2025   Scale:                           1.000
Covariance type:                    robust   Time:                         07:53:12
                                                                                               coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------

In [239]:
bestIndepForm

"RwRecProp ~ RwAGEM_B * C(RwJOCCSD, Treatment(reference='Retired')) + RAEDYRS + RwVIGACT + RwMSTAT + RwDIAB + RwPhyLim + RwCogLim + RAGENDER"

## 3. Test Error Rate Comparison

In [256]:
from sklearn.metrics import mean_absolute_error
import numpy as np

def getPredError(model, respVar, testSet, maxScore):
    '''
    Get prediction error on the testing set
    '''
    testSet = testSet.reset_index(drop=True)
    pred = model.get_prediction(exog=testSet)
    predicted_prop = pred.predicted_mean
    predicted_value = maxScore * np.asarray(predicted_prop).flatten()
    actual_value = testSet[respVar]
    
    # Remove rows where either prediction or actual is NaN
    mask = ~np.isnan(predicted_value) & ~np.isnan(actual_value)
    predicted_value = predicted_value[mask]
    actual_value = actual_value[mask]
    
    print("Actual:", actual_value)
    print("Predicted:", predicted_value)
    MAE_loss = mean_absolute_error(actual_value, predicted_value*dropna())
    print(MAE_loss)

getPredError(bestIndepResult, "RwTR20", df, 20)

ValueError: operands could not be broadcast together with shapes (90689,) (91354,) 

## 3. Automated Hypothesis Testing

In [248]:
import re

# ## A List of parameters
param_names = bestIndepResult.params.index.tolist()

# ## Create joint hypothesis tests for "No interactions"
interaction_pattern = re.compile(r"RwAGEM_B:C\(RwJOCCSD,")
interaction_indices = [i for i, name in enumerate(param_names) if interaction_pattern.search(name)]
interaction_matrix = np.zeros((len(interaction_indices), len(param_names)))
for row, idx in enumerate(interaction_indices):
    interaction_matrix[row, idx] = 1

## Create joint hypothesis tests for "No main effect"
main_pattern = re.compile(r"^C\(RwJOCCSD,")
main_indices = [i for i, name in enumerate(param_names) if main_pattern.search(name)]
main_matrix = np.zeros((len(main_indices), len(param_names)))
for row, idx in enumerate(main_indices):
    main_matrix[row, idx] = 1

# ## Create the hypothesis test for STEM superior
# stem_matrix = np.zeros((1, len(param_names)))
# hypo_coef = [-1, -1, -1, -1, -1, 7, -1, -1]
# for j in list(range(0, len(main_indices))):
#     stem_matrix[0, main_indices[j]] = hypo_coef[j]

# ## Create the hypothesis test for Management superior
# mgmt_matrix = np.zeros((1, len(param_names)))
# hypo_coef = [-1, -1, -1, -1, 7, -1, -1, -1]
# for j in list(range(0, len(main_indices))):
#     mgmt_matrix[0, main_indices[j]] = hypo_coef[j]


# ## Create the hypothesis test for food inferior
# food_matrix = np.zeros((1, len(param_names)))
# hypo_coef = [-1, 7, -1, -1, -1, -1, -1, -1]
# for j in list(range(0, len(main_indices))):
#     food_matrix[0, main_indices[j]] = hypo_coef[j]

# ## Create the hypothesis test for farm inferior
# farm_matrix = np.zeros((1, len(param_names)))
# hypo_coef = [7, -1, -1, -1, -1, -1, -1, -1]
# for j in list(range(0, len(main_indices))):
#     farm_matrix[0, main_indices[j]] = hypo_coef[j]


In [240]:
def testSummary(r_matrix, model):
    wald_res = model.wald_test(r_matrix)
    print("Statistic:", wald_res.statistic[0,0])
    print("Degrees of freedom:", wald_res.df_denom)
    print("p-value:", wald_res.pvalue)
    print("Distribution:", wald_res.distribution)

def serialTest(model, varPattern):
    '''
    Test multipe similar null hypothesis independently
    '''
    param_names = model.params.index.tolist()

    # Indices of interested parameters
    main_pattern = re.compile(varPattern)
    main_indices = [i for i, name in enumerate(param_names) if main_pattern.search(name)]

    l = len(main_indices)
    for j in list(range(0, l)):
        ## Hypothesis Coefficients
        hypo_coef = np.full((1, l), -1/l)
        hypo_coef[0, j] = 1

        ## Map hypo_coef to the parameter
        r_matrix = np.zeros((1, len(param_names)))

        # print(hypo_coef[0,1])
        # print(main_indices)
        # print(r_matrix)
        for k in list(range(0, len(main_indices))):
            r_matrix[0, main_indices[k]] = hypo_coef[0,k]

        ## Net effect estimate
        netEffect = (r_matrix @ np.array(model.params))[0]
        print(f"Net effect of {param_names[main_indices[j]]} is {netEffect}")
        
        ## Wald test 
        testSummary(r_matrix, model)
        print("\n\n")

    
# testSummary(interaction_matrix, bestARResult)
serialTest(bestIndepResult, "^C\(RwJOCCSD,")

Net effect of C(RwJOCCSD, Treatment(reference='Retired'))[T.Farming/Forestry/Fishing] is -0.06894547102285697
Statistic: 0.06263578141075488
Degrees of freedom: 1.0
p-value: 0.8023774603258461
Distribution: chi2



Net effect of C(RwJOCCSD, Treatment(reference='Retired'))[T.Food/Personal/Service] is -0.2883374708060271
Statistic: 4.221754213518139
Degrees of freedom: 1.0
p-value: 0.039908880767751106
Distribution: chi2



Net effect of C(RwJOCCSD, Treatment(reference='Retired'))[T.Healthcare] is -0.6302088552187908
Statistic: 5.492568548303178
Degrees of freedom: 1.0
p-value: 0.019097466494387223
Distribution: chi2



Net effect of C(RwJOCCSD, Treatment(reference='Retired'))[T.High Risk Occupations] is 0.4482378627486824
Statistic: 2.1270050809326753
Degrees of freedom: 1.0
p-value: 0.1447232827842026
Distribution: chi2



Net effect of C(RwJOCCSD, Treatment(reference='Retired'))[T.Management/Clerical/Business] is -0.19136888896300053
Statistic: 2.841649852858129
Degrees of freedom: 1.



In [251]:
print("Interaction Effect")
testSummary(interaction_matrix, bestIndepResult)

print("Main Effect")
testSummary(main_matrix, bestIndepResult)

Interaction Effect
Statistic: 183.24279260056883
Degrees of freedom: 8.0
p-value: 2.1452051145196688e-35
Distribution: chi2
Main Effect
Statistic: 162.99028452218164
Degrees of freedom: 8.0
p-value: 3.788229157868425e-31
Distribution: chi2




In [252]:
bestIndepResult.qic()



(8608.505787762413, 8656.393893894128)

In [None]:
## Multiple tests on Interaction Terms
serialTest(bestARResult, "^RwAGEM_B:C\(RwJOCCSD,")

Net effect of RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.Farming/Forestry/Fishing] is 0.035807682949691555
Statistic: 1.1138568383756848
Degrees of freedom: 1.0
p-value: 0.29124509153381184
Distribution: chi2



Net effect of RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.Food/Personal/Service] is -0.006149995862626768
Statistic: 0.23417399573963538
Degrees of freedom: 1.0
p-value: 0.6284459975690349
Distribution: chi2



Net effect of RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.Healthcare] is 0.008672970360212976
Statistic: 0.2950677957107403
Degrees of freedom: 1.0
p-value: 0.5869911207533771
Distribution: chi2



Net effect of RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.High Risk Occupations] is -0.05018521275236207
Statistic: 1.1111731732317598
Degrees of freedom: 1.0
p-value: 0.29182706885164766
Distribution: chi2



Net effect of RwAGEM_B:C(RwJOCCSD, Treatment(reference='Retired'))[T.Management/Clerical/Business] is 0.0030246022191070523


In [None]:
# Retrieve the covariance matrices of the fitted GEE models
# cov_exch = resultsExch.cov_params()
# cov_ar = resultsAR.cov_params()
# cov_unstruct = resultsUnstruct.cov_params()

# print("Covariance matrix (Exchangeable):\n", cov_exch)
# print("\nCovariance matrix (Autoregressive):\n", cov_ar)
# print("\nCovariance matrix (Unstructured):\n", cov_unstruct)