In [1]:
import os

os.getcwd()

'c:\\Users\\86139\\Desktop\\PARA Note System\\Projects\\Inter-University Health Data\\2025-Inter-Univer-Health-Data\\Code'

In [2]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np

hrsWave = pd.read_csv("../Data/hrsWaveCleaned.csv")

n = 1000
hhidpn = np.random.choice(hrsWave["HHIDPN"].unique(), size = n)
idx = hrsWave["HHIDPN"].isin(hhidpn)
df = hrsWave.loc[idx, :]

In [12]:
# Assume your data is in a pandas DataFrame called 'df'
# with columns: 'recall_score' (your outcome), 'age', 'job', 'HHIDPN' (subject ID)

# 1. Define the Binomial family and logit link
# The `Binomial` family in statsmodels assumes endog is proportions (e.g., successes/n_trials)
# or a 2-column array where col 0 is successes and col 1 is failures.
# Since your score is 0-20, you should pass it as a two-column array: [recall_score, 20 - recall_score]

n_trials = 20
df['RwRecFail'] = n_trials - df['RwTR20']

# Create a 2-column array for endog if using the formula API with a non-standard endog
# For GEE with Binomial, endog usually expects a proportion (successes/n_trials) or a (successes, total_trials) tuple/array
# If using `smf.gee`, it's often more straightforward to define `endog` as proportion.
df['RwRecProp'] = df['RwTR20'] / n_trials

# 2. Define the exchangeable correlation structure
exchangeable_corr = sm.cov_struct.Exchangeable()
autoregress_corr = sm.cov_struct.Autoregressive()
unstructure_corr = sm.cov_struct.CovStruct()


# 3. Specify Full Model Formula
formulaFull = "RwRecProp ~ RwAGEM_B * C(RwJOCCSD, Treatment(reference='Retired'))"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RwRecFail'] = n_trials - df['RwTR20']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RwRecProp'] = df['RwTR20'] / n_trials


In [13]:
# 3. Fit the GEE model using the formula API with robust sandwich covariance matrix
modelExch = smf.gee(formulaFull, groups="HHIDPN", data=df,
                cov_struct=exchangeable_corr,
                family=sm.families.Binomial())

modelAR = smf.gee(formulaFull, groups="HHIDPN", data=df,
                cov_struct=autoregress_corr,
                family=sm.families.Binomial())

modelUnstruct = smf.gee(formulaFull, groups="HHIDPN", data=df,
                cov_struct=unstructure_corr,
                family=sm.families.Binomial())

# Fit the model with robust covariance
resultsExch = modelExch.fit(cov_type="robust")
resultsAR = modelAR.fit(cov_type="robust")
resultsUnstruct = modelUnstruct.fit(cov_type="robust")

NotImplementedError: 

In [7]:
import re
import numpy as np

def stepwise_selection_qic_forward(data, groups, cov_struct, family, start_formula, end_formula, verbose=True, qic_threshold=0):
    """
    Perform forward stepwise feature selection based on QIC for GEE models.

    Parameters:
        data: pandas DataFrame
        groups: str, column name for group/cluster
        cov_struct: statsmodels covariance structure object
        family: statsmodels family object (e.g., sm.families.Binomial())
        start_formula: str, starting model formula (e.g., "y ~ 1")
        end_formula: str, full model formula (e.g., "y ~ x1 + x2 + x3")
        verbose: bool, print progress
        qic_threshold: float, minimum QIC reduction required to add a variable (selection threshold)

    Returns:
        best_formula: str, formula of the best model found
        best_result: fitted GEE result object
        history: list of (formula, QIC)
    """
    import statsmodels.formula.api as smf

    def get_terms(formula):
        rhs = formula.split('~')[1]
        terms = [t.strip() for t in re.split(r'\s*\+\s*', rhs) if t.strip() != '']
        terms = [t for t in terms if t != '1']
        return set(terms)

    def build_formula(lhs, terms):
        if not terms:
            return f"{lhs} ~ 1"
        return f"{lhs} ~ {' + '.join(sorted(terms))}"

    def calc_qic(result):
        try:
            return result.qic()
        except Exception:
            return np.nan

    lhs = start_formula.split('~')[0].strip()
    start_terms = get_terms(start_formula)
    end_terms = get_terms(end_formula)
    current_terms = set(start_terms)
    history = []

    # Fit initial model
    current_formula = build_formula(lhs, current_terms)
    model = smf.gee(current_formula, groups=groups, data=data, cov_struct=cov_struct, family=family)
    result = model.fit(cov_type="robust")
    best_qic = calc_qic(result)
    best_formula = current_formula
    best_result = result
    history.append((current_formula, best_qic))

    improved = True
    while improved:
        improved = False
        qic_candidates = []
        formulas = []
        term_changes = []

        for term in sorted(end_terms - current_terms):
            new_terms = current_terms | {term}
            formula = build_formula(lhs, new_terms)
            try:
                model = smf.gee(formula, groups=groups, data=data, cov_struct=cov_struct, family=family)
                result = model.fit(cov_type="robust")
                qic = calc_qic(result)
            except Exception:
                qic = np.nan
            qic_candidates.append(qic)
            formulas.append(formula)
            term_changes.append(term)

        if qic_candidates:
            min_idx = np.nanargmin(qic_candidates)
            min_qic = qic_candidates[min_idx]
            # Selection threshold is checked here:
            if (best_qic - min_qic) > qic_threshold:
                improved = True
                best_qic = min_qic
                best_formula = formulas[min_idx]
                current_terms.add(term_changes[min_idx])
                model = smf.gee(best_formula, groups=groups, data=data, cov_struct=cov_struct, family=family)
                best_result = model.fit(cov_type="robust")
                history.append((best_formula, best_qic))
                if verbose:
                    print(f"Step: forward, QIC: {best_qic:.2f}, Formula: {best_formula}")
            else:
                if verbose:
                    print("No QIC improvement above threshold, stopping.")
        else:
            if verbose:
                print("No candidates left, stopping.")

    return best_formula, best_result, history


NameError: name 'summary' is not defined

In [None]:
# Calculate QIC for each model as a measure of goodness of fit
def calc_qic(model_result):
    # QIC = -2 * quasi-likelihood + 2 * trace(V_hat^-1 * V_model)
    # statsmodels does not provide QIC directly, but we can use qic() if available
    try:
        return model_result.qic()
    except AttributeError:
        return np.nan

qic_exch = calc_qic(resultsExch)
qic_ar = calc_qic(resultsAR)
qic_unstruct = calc_qic(resultsUnstruct)

print(f"QIC (Exchangeable): {qic_exch}")
print(f"QIC (Autoregressive): {qic_ar}")
print(f"QIC (Unstructured): {qic_unstruct}")

In [None]:
# Retrieve the covariance matrices of the fitted GEE models
cov_exch = resultsExch.cov_params()
cov_ar = resultsAR.cov_params()
cov_unstruct = resultsUnstruct.cov_params()

print("Covariance matrix (Exchangeable):\n", cov_exch)
print("\nCovariance matrix (Autoregressive):\n", cov_ar)
print("\nCovariance matrix (Unstructured):\n", cov_unstruct)