In [1]:
import warnings
# Globally ignore all warnings so the cell outputs below stay compact.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries used further down; narrow or remove the filter when debugging.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from matplotlib import style
from matplotlib import pyplot as plt
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col


from linearmodels.panel import PanelOLS

In [2]:
from linearmodels.datasets import wage_panel

# Load the wage panel bundled with linearmodels (one row per person `nr` and
# `year`) and preview the first five rows.
data = wage_panel.load()
data.head(5)

Unnamed: 0,nr,year,black,exper,hisp,hours,married,educ,union,lwage,expersq,occupation
0,13,1980,0,1,0,2672,0,14,0,1.19754,1,9
1,13,1981,0,2,0,2320,0,14,1,1.85306,4,9
2,13,1982,0,3,0,2940,0,14,0,1.344462,9,9
3,13,1983,0,4,0,2960,0,14,0,1.433213,16,9
4,13,1984,0,5,0,3071,0,14,0,1.568125,25,5


In [3]:
# (rows, columns) of the long-format panel: one row per person-year.
data.shape

(4360, 12)

In [4]:
# Display-only view of the panel indexed by (person id, year). The result is
# not assigned, so `data` keeps `nr` and `year` as ordinary columns, which the
# cells below rely on (`data.nr`, `data.set_index("nr")`).
data.set_index(["nr", "year"])

Unnamed: 0_level_0,Unnamed: 1_level_0,black,exper,hisp,hours,married,educ,union,lwage,expersq,occupation
nr,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
13,1980,0,1,0,2672,0,14,0,1.197540,1,9
13,1981,0,2,0,2320,0,14,1,1.853060,4,9
13,1982,0,3,0,2940,0,14,0,1.344462,9,9
13,1983,0,4,0,2960,0,14,0,1.433213,16,9
13,1984,0,5,0,3071,0,14,0,1.568125,25,5
...,...,...,...,...,...,...,...,...,...,...,...
12548,1983,0,8,0,2080,1,9,0,1.591879,64,5
12548,1984,0,9,0,2080,1,9,1,1.212543,81,5
12548,1985,0,10,0,2080,1,9,0,1.765962,100,5
12548,1986,0,11,0,2080,1,9,1,1.745894,121,5


In [5]:
# Number of individuals, i.e. distinct person ids (`nr`) in the panel.
# dropna=False keeps the count identical to len(unique()), which would also
# count a missing id as one value.
data["nr"].nunique(dropna=False)

545

# Methods
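Three ways to estimate the fixed-effects model:
1. Demean the data within each individual and run a simple OLS.
2. Add a dummy variable for each group (individual) and run the regression.
3. Use the `linearmodels` library (`PanelOLS`).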

In [6]:
# Column names used by every model below: Y is the outcome, T is the regressor
# listed first, and X is the full list of regressors.
Y = "lwage"
T = "married"
X = [T, "expersq", "union", "hours", "exper"]

# Per-person averages (grouped by `nr`) of all regressors and the outcome.
mean_data = (
    data
    .groupby("nr")[[*X, Y]]
    .mean()
)
mean_data.head()

Unnamed: 0_level_0,married,expersq,union,hours,exper,lwage
nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13,0.0,25.5,0.125,2807.625,4.5,1.255652
17,0.0,61.5,0.0,2504.125,7.5,1.637786
18,1.0,61.5,0.0,2350.5,7.5,2.034387
45,0.125,35.5,0.25,2225.875,5.5,1.773664
110,0.5,77.5,0.125,2108.0,8.5,2.055129


In [7]:
# Within transformation: index the rows by person id so the subtraction aligns
# each observation with that person's row of `mean_data`, then remove the
# person-level mean from every model column.
demeaned_data = data.set_index("nr")[[*X, Y]] - mean_data

demeaned_data.head()

Unnamed: 0_level_0,married,expersq,union,hours,exper,lwage
nr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13,0.0,-24.5,-0.125,-135.625,-3.5,-0.058112
13,0.0,-21.5,0.875,-487.625,-2.5,0.597408
13,0.0,-16.5,-0.125,132.375,-1.5,0.08881
13,0.0,-9.5,-0.125,152.375,-0.5,0.177561
13,0.0,-0.5,-0.125,263.375,0.5,0.312473


In [8]:
# Regression formula: the outcome on every regressor in X, joined with "+".
xx = Y + " ~ " + "+".join(X)

# OLS on the demeaned data; standard errors are clustered on the frame's
# index, which holds the person id `nr`.
mod = smf.ols(xx, data=demeaned_data).fit(
    cov_type="cluster",
    cov_kwds={"groups": demeaned_data.index},
)

# Show only the coefficient table of the summary.
mod.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.249e-16,2.66e-17,4.700,0.000,7.28e-17,1.77e-16
married,0.0463,0.021,2.186,0.029,0.005,0.088
expersq,-0.0053,0.001,-7.604,0.000,-0.007,-0.004
union,0.0751,0.022,3.353,0.001,0.031,0.119
hours,-0.0001,2.13e-05,-6.330,0.000,-0.000,-9.29e-05
exper,0.1370,0.011,12.574,0.000,0.116,0.158


In [9]:
# Dummy-variable version of the fixed-effects model: one dummy per individual
# (`C(nr)`) and no common intercept (`-1`). It gives the same slope
# coefficients as the regression on demeaned data.
mean_model = smf.ols(
    "lwage ~ married + expersq + exper + union +hours + C(nr) -1",
    data=data,
).fit()

# With fixed time effects and fixed group effects.
# NOTE(review): `exper` is absent from this formula, presumably because it is
# collinear with the person and year dummies; confirm this is intended.
mean_model_t = smf.ols(
    "lwage ~ married + expersq + union +hours + C(nr)+C(year)",
    data=data,
).fit()

# Side-by-side table restricted to the shared regressors. To include the
# two-way model as well, pass results=[mean_model, mean_model_t, mod] together
# with a third entry in model_names.
summary_col(
    results=[mean_model, mod],
    model_names=["wth dummy", "Demeaned"],
    regressor_order=["union", "married", "exper", "expersq", "hours", "Intercept"],
    drop_omitted=True,
)

# Why is the intercept not the same? `mean_model` is fit with `-1`, so it has
# no common intercept (the `C(nr)` dummies act as person-specific intercepts),
# whereas `mod` keeps an intercept that is numerically zero after demeaning.
# The standard errors also differ: `mean_model` uses the default covariance,
# while `mod` was fit with cov_type='cluster'.
# mean_model.summary()

0,1,2
,wth dummy,Demeaned
union,0.0751,0.0751
,(0.0191),(0.0224)
married,0.0463,0.0463
,(0.0181),(0.0212)
exper,0.1370,0.1370
,(0.0085),(0.0109)
expersq,-0.0053,-0.0053
,(0.0006),(0.0007)
hours,-0.0001,-0.0001
