In [None]:
'''
Introdução à Econometria - Uma abordagem moderna (Tradução da 6 edição norte-americana)
Autor: WOOLDRIDGE, J. M.
Editora: CENGAGE LEARNING

Cap. 6: Análise de regressão múltipla: problemas adicionais
Exercício 4, item (ii), reprodução da equação estimada.

O objetivo deste arquivo é apenas ilustrar a diferença entre usar ":" e "*" em modelos com termos de interação.
Por exemplo, o modelo abaixo inclui um termo de interação entre x2 e x3:
y = B0 + B1*x1 + B2*(x2*x3) + u


Arquivo com os dados: wage2.xls

Arquivo com dados em:
http://students.cengage.com.br/dashboard/private/livroView.jsf;jsessionid=95E9AD889A4A4B7ABBD2A5251F1E14BE?id=104577

Em caso de dúvidas ou problemas, solicitamos, por gentileza, entrar em contato pelo e-mail:
python.economia@gmail.com
'''

In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

In [2]:
# Only six columns of wage2.xls are needed. The sheet is read with header=None,
# so names are supplied here: 0-based column position -> variable name.
colunas = {
    4: 'educ',
    5: 'exper',
    6: 'tenure',
    14: 'meduc',
    15: 'feduc',
    16: 'lwage',
}
df = pd.read_excel(
    'wage2.xls',
    header=None,
    usecols=list(colunas),
    names=list(colunas.values()),
)

In [3]:
# Preview the first rows to confirm the selected columns were read and named as intended.
df.head()

Unnamed: 0,educ,exper,tenure,meduc,feduc,lwage
0,12,11,2,8,8,6.645091
1,18,11,16,14,14,6.694562
2,14,11,9,14,14,6.715384
3,12,13,7,12,12,6.476973
4,11,14,5,6,11,6.331502


In [4]:
# Inspect dtypes and non-null counts; a numeric column reported as object signals non-numeric entries.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   educ    935 non-null    int64  
 1   exper   935 non-null    int64  
 2   tenure  935 non-null    int64  
 3   meduc   935 non-null    object 
 4   feduc   935 non-null    object 
 5   lwage   935 non-null    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 44.0+ KB


In [5]:
# List the distinct values of meduc to find what keeps the column from being numeric.
df['meduc'].unique()

array([8, 14, 12, 6, 13, 16, 10, 11, 18, 17, 15, '.', 9, 7, 2, 5, 4, 3, 1,
       0], dtype=object)

In [6]:
# Same check for feduc.
df['feduc'].unique()

array([8, 14, 12, 11, '.', 5, 10, 16, 18, 7, 9, 15, 6, 13, 17, 4, 2, 3, 0],
      dtype=object)

In [7]:
# Missing parental education is coded as the string '.', which is why meduc and
# feduc were read as object columns. Convert '.' to NaN and make both columns
# numeric, assigning the result back to df explicitly: calling
# replace(..., inplace=True) on df[col] is a chained in-place operation that
# recent pandas warns about and that does not update df under Copy-on-Write.
# where() keeps every value different from '.' and puts NaN elsewhere;
# to_numeric then yields a float column and raises on any other unexpected text.
# The cell is idempotent: re-running it on already-numeric columns changes nothing.
df['meduc'] = pd.to_numeric(df['meduc'].where(df['meduc'] != '.'))
df['feduc'] = pd.to_numeric(df['feduc'].where(df['feduc'] != '.'))

In [8]:
# Re-check dtypes and non-null counts after the missing-value cleanup.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   educ    935 non-null    int64  
 1   exper   935 non-null    int64  
 2   tenure  935 non-null    int64  
 3   meduc   857 non-null    float64
 4   feduc   741 non-null    float64
 5   lwage   935 non-null    float64
dtypes: float64(3), int64(3)
memory usage: 44.0 KB


In [9]:
# Parents' combined years of schooling (mother + father). NaN propagates through
# the sum, so pareduc is missing whenever either parent's education is missing.
df['pareduc'] = df['meduc'].add(df['feduc'])

In [10]:
# Preview the data again, now including the derived pareduc column.
df.head()

Unnamed: 0,educ,exper,tenure,meduc,feduc,lwage,pareduc
0,12,11,2,8.0,8.0,6.645091,16.0
1,18,11,16,14.0,14.0,6.694562,28.0
2,14,11,9,14.0,14.0,6.715384,28.0
3,12,13,7,12.0,12.0,6.476973,24.0
4,11,14,5,6.0,11.0,6.331502,17.0


In [11]:
# Reproduce the estimated equation of item (ii): educ enters on its own and
# through its product with pareduc (":" adds only the product term).
modelo = smf.ols(
    'lwage ~ educ + educ:pareduc + exper + tenure',
    data=df,
)
reg = modelo.fit()
reg.summary()

0,1,2,3
Dep. Variable:,lwage,R-squared:,0.169
Model:,OLS,Adj. R-squared:,0.164
Method:,Least Squares,F-statistic:,36.44
Date:,"Wed, 29 Jun 2022",Prob (F-statistic):,9.44e-28
Time:,21:40:08,Log-Likelihood:,-329.77
No. Observations:,722,AIC:,669.5
Df Residuals:,717,BIC:,692.4
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.6465,0.130,43.582,0.000,5.392,5.901
educ,0.0468,0.010,4.462,0.000,0.026,0.067
educ:pareduc,0.0008,0.000,3.677,0.000,0.000,0.001
exper,0.0189,0.004,4.786,0.000,0.011,0.027
tenure,0.0102,0.003,3.413,0.001,0.004,0.016

0,1,2,3
Omnibus:,21.84,Durbin-Watson:,1.837
Prob(Omnibus):,0.0,Jarque-Bera (JB):,34.056
Skew:,-0.251,Prob(JB):,4.03e-08
Kurtosis:,3.939,Cond. No.,2850.0


### Nota: veja o que ocorre se utilizarmos `*` em vez de `:` no termo de interação


In [12]:
# For illustration only, educ is left out as a separate regressor here.
# Interaction written with ":" -> only the product educ x pareduc is added.
modelo = smf.ols(
    'lwage ~ educ:pareduc + exper + tenure',
    data=df,
)
reg = modelo.fit()
reg.params

Intercept       6.132853
educ:pareduc    0.001462
exper           0.013902
tenure          0.011225
dtype: float64

In [13]:
# Interaction written with "*" -> compare the fitted parameter names with the
# ":" version: educ and pareduc show up as separate terms next to educ:pareduc.
modelo = smf.ols(
    'lwage ~ educ*pareduc + exper + tenure',
    data=df,
)
reg = modelo.fit()
reg.params

Intercept       4.937661
educ            0.097113
pareduc         0.033214
educ:pareduc   -0.001568
exper           0.019557
tenure          0.010308
dtype: float64

In [None]:
'''Note que, quando utilizamos * no termo de interação, as variáveis educ e pareduc também foram incluídas individualmente,
mesmo não aparecendo como termos separados na fórmula passada para smf.ols(): educ*pareduc equivale a educ + pareduc + educ:pareduc.'''