### Partialling Out

#### Imports and loading

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
import sys
from sklearn.base import BaseEstimator
import warnings
# Suppress warnings from here on. The stated motivation is convergence
# warnings: for some of the small penalty levels tried out, the optimization
# might not converge.
# NOTE(review): simplefilter('ignore') is global and hides *every* warning
# category (deprecations, numerical issues, ...), not only convergence ones.
warnings.simplefilter('ignore')

In [5]:
# Location of the wage2015 inference subsample in the CausalAIBook
# MetricsMLNotebooks repository (read straight from GitHub).
file = (
    "https://raw.githubusercontent.com/CausalAIBook/"
    "MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv"
)

# Load the CSV into the working frame and preview the first rows.
df = pd.read_csv(file)
df.head()

Unnamed: 0,wage,lwage,sex,shs,hsg,scl,clg,ad,mw,so,we,ne,exp1,exp2,exp3,exp4,occ,occ2,ind,ind2
0,9.615385,2.263364,1,0,0,0,1,0,0,0,0,1,7.0,0.49,0.343,0.2401,3600.0,11,8370.0,18
1,48.076923,3.872802,0,0,0,0,1,0,0,0,0,1,31.0,9.61,29.791,92.3521,3050.0,10,5070.0,9
2,11.057692,2.403126,0,0,1,0,0,0,0,0,0,1,18.0,3.24,5.832,10.4976,6260.0,19,770.0,4
3,13.942308,2.634928,1,0,0,0,0,1,0,0,0,1,25.0,6.25,15.625,39.0625,420.0,1,6990.0,12
4,28.846154,3.361977,1,0,0,0,1,0,0,0,0,1,22.0,4.84,10.648,23.4256,2015.0,6,9470.0,22


### Initial Examination of the Difference in wage between sub College and College grads

In [None]:
# Compare covariate means between workers holding at least a college degree
# (clg == 1 or ad == 1) and workers without one.
cols = ["lwage", "sex", "shs", "hsg", "scl",
        "clg", "ad", "ne", "mw", "so", "we", "exp1"]

# Row mask for "college degree or higher"; computed once and negated for
# the complementary group so both columns are guaranteed to be consistent.
is_college_grad = (df['clg'] == 1) | (df['ad'] == 1)

# One row per entry of `cols`, in the same order as the display labels.
table = pd.DataFrame({
    'Variable': ["Log Wage", "Sex", "Less than High School",
                 "High School Graduate", "Some College",
                 "College Graduate", "Advanced Degree",
                 "Northeast", "Midwest", "South", "West", "Experience"],
    'College Grad +': df.loc[is_college_grad, cols].mean().values,
    'Sub College Grad': df.loc[~is_college_grad, cols].mean().values,
})

table

Unnamed: 0,Variable,College Grad +,Sub College Grad
0,Log Wage,3.19446,2.784233
1,Sex,0.510675,0.389245
2,Less then High School,0.0,0.042735
3,High School Graduate,0.0,0.447293
4,Some College,0.0,0.509972
5,Gollage Graduate,0.698548,0.0
6,Advanced Degree,0.301452,0.0
7,Northeast,0.255337,0.204772
8,Midwest,0.215628,0.296296
9,South,0.308284,0.286681


In [49]:
### Collapse "college graduate" and "advanced degree" into a single 0/1
### indicator (clgE) so the regressions below need only one education term
df["clgE"] = df[["clg", "ad"]].eq(1).any(axis=1).astype(int)
df.head()

Unnamed: 0,wage,lwage,sex,shs,hsg,scl,clg,ad,mw,so,...,exp1,exp2,exp3,exp4,occ,occ2,ind,ind2,clg+,clgE
0,9.615385,2.263364,1,0,0,0,1,0,0,0,...,7.0,0.49,0.343,0.2401,3600.0,11,8370.0,18,1,1
1,48.076923,3.872802,0,0,0,0,1,0,0,0,...,31.0,9.61,29.791,92.3521,3050.0,10,5070.0,9,1,1
2,11.057692,2.403126,0,0,1,0,0,0,0,0,...,18.0,3.24,5.832,10.4976,6260.0,19,770.0,4,0,0
3,13.942308,2.634928,1,0,0,0,0,1,0,0,...,25.0,6.25,15.625,39.0625,420.0,1,6990.0,12,1,1
4,28.846154,3.361977,1,0,0,0,1,0,0,0,...,22.0,4.84,10.648,23.4256,2015.0,6,9470.0,22,1,1


#### Fit OLS

In [53]:
### OLS of log wage on the college indicator plus flexible controls
# Controls: the experience terms interacted with occupation, industry and
# region, with everything further interacted with sex.
flex = "lwage ~ clgE + (exp1+exp2+exp3+exp4)*(C(occ2)+C(ind2)+mw+so+we)*(sex)"

control_fit = smf.ols(formula=flex, data=df).fit()

# Coefficient on the college indicator and its HC3 standard error.
# Note: the summary table printed below is built with the default
# (nonrobust) covariance; only the final sentence reports the HC3 value.
control_est = control_fit.params.loc['clgE']
control_se = control_fit.HC3_se.loc['clgE']

print(
    control_fit.summary(),
    "",
    f"The estimated College Degree coefficient is {control_est:.4f} "
    f"and the corresponding robust standard error is {control_se:.4f}",
    sep="\n",
)

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.369
Model:                            OLS   Adj. R-squared:                  0.309
Method:                 Least Squares   F-statistic:                     6.155
Date:                Thu, 05 Jun 2025   Prob (F-statistic):          1.32e-239
Time:                        20:47:16   Log-Likelihood:                -3231.7
No. Observations:                5150   AIC:                             7357.
Df Residuals:                    4703   BIC:                         1.028e+04
Df Model:                         446                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                  3

#### Partial out
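First we set up a model of log wage that leaves out the college indicator, building its design matrix separately for workers with and without a college degree.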

In [57]:
### Decompose the raw college wage gap into explained and unexplained parts
# Same flexible controls as the full model, but without the college indicator.
flex_no_CLG = "lwage ~ (exp1+exp2+exp3+exp4)*(C(occ2)+C(ind2)+mw+so+we)*(sex)"

# These models are only constructed, never fitted: they are used to obtain
# the design matrix (exog) and outcome vector (endog) of each education group.
lm0 = smf.ols(flex_no_CLG, data=df[df["clgE"] == 0])
lm1 = smf.ols(flex_no_CLG, data=df[df["clgE"] == 1])
XX0 = lm0.exog
y0 = lm0.endog
XX1 = lm1.exog
y1 = lm1.endog
# coefficients on the controls from the full regression, i.e. everything
# except the intercept and the college indicator "clgE" (dropped by label
# rather than by position)
betarest = control_fit.params.drop(["Intercept", "clgE"])

print("The marginal gap:", y1.mean() - y0.mean())
print("This is the difference in the means between College+ educations and sub College educations")
print()
# The coefficient on clgE is the part of the gap left after controlling for
# the other covariates.
diff_unexplained = control_est
print("The unexplained difference: ", diff_unexplained)
print("This is the part of this difference that is unexplained by other factors and is covered by the College education")
print()
# Control coefficients times the between-group difference in covariate means
# (intercept column skipped with [1:]).
# NOTE(review): the product is positional - it assumes the non-intercept
# columns of XX0/XX1 are in the same order as betarest; confirm if the
# formulas are ever changed.
diff_explained = betarest.dot(XX1.mean(0)[1:] - XX0.mean(0)[1:])
print("The explained difference:", diff_explained)
print("This is the part that can be explained by other factors / doesn't need college degree separation to explain")
print()
# The two parts should add up to the marginal gap printed above.
print("The sum of these differences:", diff_unexplained + diff_explained)

The marginal gap: 0.41022739525585816
This is the difference in the means between College+ educations and sub College educations

The unexplained difference:  0.3055942946727747
This is the fraction of this difference that is unexplained by other factors and is covered by the College education

The explained difference: 0.10463310058314512
This is the fraction that can be explained by other factors / doesn't need college degree seperation to explain

The sum of these differences: 0.41022739525591984
