In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Read the survey extract into a DataFrame; the path is relative, so the CSV
# must sit in the notebook's working directory.
da = pd.read_csv(filepath_or_buffer='nhanes_2015_2016.csv')

In [5]:
# Columns kept for the analysis.  The order matters only for the column order
# of the subsetted frame.
# NOTE(review): this name shadows the built-in `vars()`; it is kept because the
# subsetting cell below refers to it.
vars = [
    'SMQ020',
    'RIAGENDR',
    'RIDAGEYR',
    'RIDRETH1',
    'DMDEDUC2',
    'BPXSY1',
    'BPXDI1',
    'BMXBMI',
    'BMXLEG',
    'BMXARML',
    'BMXARMC',
    'BMXWAIST',
    'SDMVPSU',
    'SDMVSTRA',
]

In [6]:
# Restrict the frame to the analysis columns, then discard every row that has
# a missing value in any of them (complete-case subset).
da = da.loc[:, vars].dropna(axis=0, how='any')

In [7]:
# Combine SDMVSTRA and SDMVPSU into a single cluster label used as the GEE
# grouping variable.
# NOTE(review): distinct (SDMVSTRA, SDMVPSU) pairs map to distinct labels only
# if SDMVPSU is always a non-negative value below 10 -- confirm against the data.
da['group'] = da['SDMVSTRA'] * 10 + da['SDMVPSU']

In [11]:
# Intercept-only marginal model for BPXSY1 with an exchangeable working
# correlation; the printed summary reports the estimated within-cluster
# correlation.
model = sm.GEE.from_formula(
    'BPXSY1 ~ 1',
    groups='group',
    data=da,
    cov_struct=sm.cov_struct.Exchangeable(),
)
result = model.fit()
print(result.cov_struct.summary())

The correlation between two observations in the same cluster is 0.030


In [13]:
# Recode SMQ020: 2 becomes 0, codes 7 and 9 become missing; every other value
# is left as it is.
da['smq'] = da['SMQ020'].replace({2: 0, 7: np.nan, 9: np.nan})

# Fit an intercept-only exchangeable GEE to each variable in turn and print the
# estimated within-cluster correlation next to the variable name.
for v in ('BPXSY1', 'RIDAGEYR', 'BMXBMI', 'smq', 'SDMVSTRA'):
    model = sm.GEE.from_formula(
        '{}~1'.format(v),
        groups='group',
        data=da,
        cov_struct=sm.cov_struct.Exchangeable(),
    )
    result = model.fit()
    print(v, result.cov_struct.summary())

BPXSY1 The correlation between two observations in the same cluster is 0.030
RIDAGEYR The correlation between two observations in the same cluster is 0.032
BMXBMI The correlation between two observations in the same cluster is 0.041
smq The correlation between two observations in the same cluster is 0.028
SDMVSTRA The correlation between two observations in the same cluster is 0.956


In [18]:
# Baseline check: draw independent standard-normal noise ten times and estimate
# its within-cluster correlation.  Because the noise is generated independently
# of the grouping, the estimates show how far from zero the correlation can
# drift by chance alone.
#
# A locally seeded generator makes the ten draws reproducible across
# Restart & Run All without touching NumPy's global random state.
rng = np.random.default_rng(0)
for k in range(10):
    da["noise"] = rng.normal(size=da.shape[0])
    model = sm.GEE.from_formula("noise ~ 1", groups="group",
           cov_struct=sm.cov_struct.Exchangeable(), data=da)
    result = model.fit()
    # Label the line with the variable actually modelled here, rather than the
    # loop variable left over from the previous cell.
    print("noise", result.cov_struct.summary())

SDMVSTRA The correlation between two observations in the same cluster is 0.003
SDMVSTRA The correlation between two observations in the same cluster is -0.002
SDMVSTRA The correlation between two observations in the same cluster is -0.002
SDMVSTRA The correlation between two observations in the same cluster is 0.000
SDMVSTRA The correlation between two observations in the same cluster is -0.001
SDMVSTRA The correlation between two observations in the same cluster is -0.001
SDMVSTRA The correlation between two observations in the same cluster is 0.000
SDMVSTRA The correlation between two observations in the same cluster is 0.002
SDMVSTRA The correlation between two observations in the same cluster is -0.003
SDMVSTRA The correlation between two observations in the same cluster is -0.001


In [19]:
# Regress BPXSY1 on RIDAGEYR with an exchangeable working correlation and print
# the within-cluster correlation that remains after including that covariate.
model = sm.GEE.from_formula(
    "BPXSY1 ~ RIDAGEYR",
    groups="group",
    data=da,
    cov_struct=sm.cov_struct.Exchangeable(),
)
result = model.fit()
print(result.cov_struct.summary())

The correlation between two observations in the same cluster is 0.020


In [22]:
# Text-labelled copy of the gender column (1 -> "Male", 2 -> "Female").
da["RIAGENDRx"] = da["RIAGENDR"].replace({1: "Male", 2: "Female"})

# Same exchangeable GEE as before, now with RIDAGEYR, gender, BMXBMI and a
# categorical RIDRETH1 term as covariates; print the remaining within-cluster
# correlation.
model = sm.GEE.from_formula(
    "BPXSY1 ~ RIDAGEYR + RIAGENDRx + BMXBMI + C(RIDRETH1)",
    groups="group",
    data=da,
    cov_struct=sm.cov_struct.Exchangeable(),
)
result = model.fit()
print(result.cov_struct.summary())

The correlation between two observations in the same cluster is 0.014


In [23]:
# Ordinary least squares fit, which treats all observations as independent.
model1 = sm.OLS.from_formula(
    "BPXSY1 ~ RIDAGEYR + RIAGENDRx + BMXBMI + C(RIDRETH1)",
    data=da,
)
result1 = model1.fit()

# Marginal linear model with the same mean structure, fitted by GEE with an
# exchangeable working correlation within each group.
model2 = sm.GEE.from_formula(
    "BPXSY1 ~ RIDAGEYR + RIAGENDRx + BMXBMI + C(RIDRETH1)",
    groups="group",
    data=da,
    cov_struct=sm.cov_struct.Exchangeable(),
)
result2 = model2.fit()

# Side-by-side table of coefficient estimates and standard errors; `columns`
# pins the column order.
x = pd.DataFrame(
    {
        "OLS_params": result1.params,
        "OLS_SE": result1.bse,
        "GEE_params": result2.params,
        "GEE_SE": result2.bse,
    },
    columns=["OLS_params", "OLS_SE", "GEE_params", "GEE_SE"],
)
print(x)

                   OLS_params    OLS_SE  GEE_params    GEE_SE
Intercept           90.865830  1.396757   91.211151  1.479329
RIAGENDRx[T.Male]    3.774323  0.464582    3.735351  0.427721
C(RIDRETH1)[T.2]     0.886011  0.835939    0.240542  0.797838
C(RIDRETH1)[T.3]    -1.890103  0.685896   -2.343475  0.836188
C(RIDRETH1)[T.4]     3.699450  0.750568    2.965434  0.911602
C(RIDRETH1)[T.5]    -0.545645  0.830523   -0.488103  0.870573
RIDAGEYR             0.477050  0.013367    0.473059  0.016877
BMXBMI               0.311438  0.035342    0.315545  0.039424


In [24]:
# Relabel the levels, convert rare categories (codes 7 and 9) to missing.
da["DMDEDUC2x"] = da.DMDEDUC2.replace({1: "lt9", 2: "x9_11", 3: "HS", 4: "SomeCollege",
                                       5: "College", 7: np.nan, 9: np.nan})

# Fit a basic binomial GLM (logistic regression) that treats the observations
# as independent.
model1 = sm.GLM.from_formula("smq ~ RIDAGEYR + RIAGENDRx + C(DMDEDUC2x)",
           family=sm.families.Binomial(), data=da)
result1 = model1.fit()

# Fit a marginal GLM using GEE with an exchangeable working correlation.
# The GLM estimates are passed as starting values for the GEE fit.
model2 = sm.GEE.from_formula("smq ~ RIDAGEYR + RIAGENDRx + C(DMDEDUC2x)",
           groups="group", family=sm.families.Binomial(),
           cov_struct=sm.cov_struct.Exchangeable(), data=da)
result2 = model2.fit(start_params=result1.params)

# Side-by-side coefficients and standard errors.  The independence fit here is
# a GLM, so its columns are labelled GLM_* rather than OLS_*.
x = pd.DataFrame({"GLM_params": result1.params, "GLM_SE": result1.bse,
                  "GEE_params": result2.params, "GEE_SE": result2.bse})
x = x[["GLM_params", "GLM_SE", "GEE_params", "GEE_SE"]]
print(x)

                             OLS_params    OLS_SE  GEE_params    GEE_SE
Intercept                     -2.291235  0.117584   -2.227394  0.147929
RIAGENDRx[T.Male]              0.873545  0.061972    0.869176  0.065042
C(DMDEDUC2x)[T.HS]             0.977012  0.092122    0.918804  0.095543
C(DMDEDUC2x)[T.SomeCollege]    0.833761  0.086496    0.768989  0.106517
C(DMDEDUC2x)[T.lt9]            0.242440  0.114149    0.302279  0.129860
C(DMDEDUC2x)[T.x9_11]          1.126384  0.109774    1.089043  0.142097
RIDAGEYR                       0.018186  0.001796    0.017318  0.001875


In [4]:
# NOTE(review): this cell carries execution count In [4], i.e. it ran right
# after the CSV was loaded and before the column subset was taken, so the saved
# output lists the raw file's columns.  Re-running it at this position would
# show the subsetted frame plus the columns added by later cells.
da.columns

Index(['SEQN', 'ALQ101', 'ALQ110', 'ALQ130', 'SMQ020', 'RIAGENDR', 'RIDAGEYR',
       'RIDRETH1', 'DMDCITZN', 'DMDEDUC2', 'DMDMARTL', 'DMDHHSIZ', 'WTINT2YR',
       'SDMVPSU', 'SDMVSTRA', 'INDFMPIR', 'BPXSY1', 'BPXDI1', 'BPXSY2',
       'BPXDI2', 'BMXWT', 'BMXHT', 'BMXBMI', 'BMXLEG', 'BMXARML', 'BMXARMC',
       'BMXWAIST', 'HIQ210'],
      dtype='object')