In [1]:
import numpy as np
import pandas as pd
from statsmodels.formula.api import ols
import wooldridge

In [2]:
# Load the wage1 data set and keep it as `wage1`.
wage1 = wooldridge.data('wage1')
# Same call with description=True: shows the data set's name, size and
# variable labels instead of the data.
wooldridge.data('wage1',description=True)

name of dataset: wage1
no of variables: 24
no of observations: 526

+----------+---------------------------------+
| variable | label                           |
+----------+---------------------------------+
| wage     | average hourly earnings         |
| educ     | years of education              |
| exper    | years potential experience      |
| tenure   | years with current employer     |
| nonwhite | =1 if nonwhite                  |
| female   | =1 if female                    |
| married  | =1 if married                   |
| numdep   | number of dependents            |
| smsa     | =1 if live in SMSA              |
| northcen | =1 if live in north central U.S |
| south    | =1 if live in southern region   |
| west     | =1 if live in western region    |
| construc | =1 if work in construc. indus.  |
| ndurman  | =1 if in nondur. manuf. indus.  |
| trcommpu | =1 if in trans, commun, pub ut  |
| trade    | =1 if in wholesale or retail    |
| services | =1 if in services indus.  

In [3]:
# A model with only a constant term still needs an explicit `1` on the
# right-hand side of the formula.
form_const = 'wage ~ 1'

res_const = ols(formula=form_const, data=wage1).fit()

# Estimated coefficients (here just the intercept).
res_const.params

Intercept    5.896103
dtype: float64

In [4]:
# Sample mean of the wage column.
wage1['wage'].mean()

5.896102674787035

In [5]:
# Regress wage on the female dummy only.
form_const_2 = 'wage ~ female'

res_const_2 = ols(formula=form_const_2, data=wage1).fit()

# Intercept and the coefficient on `female`.
res_const_2.params

Intercept    7.099489
female      -2.511830
dtype: float64

In [6]:
# Boolean (True/False) mask that selects the female observations only.
cond_female = wage1['female'].eq(1)

# Mean wage over the rows where the mask is True.
wage1.loc[cond_female, 'wage'].mean()

4.587658740225292

In [7]:
# Boolean (True/False) mask that selects the male observations only.
cond_male = wage1['female'].eq(0)

# Mean wage over the rows where the mask is True.
wage1.loc[cond_male, 'wage'].mean()

7.099489067157689

In [8]:
# 以下では row をDataFrameの行と考える。

# Indicator function for unmarried men.
def singmale(row):
    """Return 1 if ``row`` describes an unmarried man, otherwise 0.

    ``row`` is looked up by the keys 'female' and 'married'; the result is 1
    only when both values equal 0.
    """
    return int(row['female'] == 0 and row['married'] == 0)

# Indicator function for married men.
def marmale(row):
    """Return 1 if ``row`` describes a married man, otherwise 0.

    ``row`` is looked up by the keys 'female' and 'married'; the result is 1
    only when 'female' equals 0 and 'married' equals 1.
    """
    return int(row['female'] == 0 and row['married'] == 1)

# Indicator function for unmarried women.
def singfem(row):
    """Return 1 if ``row`` describes an unmarried woman, otherwise 0.

    ``row`` is looked up by the keys 'female' and 'married'; the result is 1
    only when 'female' equals 1 and 'married' equals 0.
    """
    return int(row['female'] == 1 and row['married'] == 0)

# Indicator function for married women.
def marfem(row):
    """Return 1 if ``row`` describes a married woman, otherwise 0.

    ``row`` is looked up by the keys 'female' and 'married'; the result is 1
    only when both values equal 1.
    """
    return int(row['female'] == 1 and row['married'] == 1)

In [9]:
# Apply each helper row by row and store the resulting 0/1 indicators as
# columns. axis=1 and axis='columns' are two spellings of the same option.
wage1['singmale'] = wage1.apply(singmale, axis=1)
wage1['marmale'] = wage1.apply(marmale, axis=1)
wage1['singfem'] = wage1.apply(singfem, axis='columns')
wage1['marfem'] = wage1.apply(marfem, axis='columns')

# Preview the first three rows, including the new columns.
wage1.head(3)

Unnamed: 0,wage,educ,exper,tenure,nonwhite,female,married,numdep,smsa,northcen,...,profocc,clerocc,servocc,lwage,expersq,tenursq,singmale,marmale,singfem,marfem
0,3.1,11,2,0,0,1,0,2,1,0,...,0,0,0,1.131402,4,0,0,0,1,0
1,3.24,12,22,2,0,1,1,3,1,0,...,0,0,1,1.175573,484,4,0,0,0,1
2,3.0,11,2,0,0,0,0,2,0,0,...,0,0,0,1.098612,4,0,1,0,0,0


In [10]:
# Map each indicator column name to the function that computes it; every
# key is the name of its function.
func_dict = {func.__name__: func
             for func in (singmale, marmale, singfem, marfem)}

In [11]:
# Same column construction as a loop over the name -> function mapping.
for key, indicator in func_dict.items():
    wage1.loc[:, key] = wage1.apply(indicator, axis=1)

wage1.head(3)

Unnamed: 0,wage,educ,exper,tenure,nonwhite,female,married,numdep,smsa,northcen,...,profocc,clerocc,servocc,lwage,expersq,tenursq,singmale,marmale,singfem,marfem
0,3.1,11,2,0,0,1,0,2,1,0,...,0,0,0,1.131402,4,0,0,0,1,0
1,3.24,12,22,2,0,1,1,3,1,0,...,0,0,1,1.175573,484,4,0,0,0,1
2,3.0,11,2,0,0,0,0,2,0,0,...,0,0,0,1.098612,4,0,1,0,0,0


In [12]:
# Three of the four group dummies enter the model; `singmale` is left out,
# so unmarried men form the base group absorbed by the intercept.
form_const_4 = 'wage ~ marmale + singfem + marfem'

res_const_4 = ols(formula=form_const_4, data=wage1).fit()

# Keep the coefficient Series for the comparisons with group means.
para4 = res_const_4.params
para4

Intercept    5.168023
marmale      2.815009
singfem     -0.556440
marfem      -0.602114
dtype: float64

In [13]:
# Mean wage of unmarried men, selected with a boolean mask.
wage1.loc[(wage1['female'] == 0) & (wage1['married'] == 0), 'wage'].mean()

5.168023282705351

In [14]:
# Intercept of the group-dummy regression. The coefficient is looked up by
# label: `para4` is indexed by coefficient names, and integer keys on such a
# Series only work through a deprecated positional fallback.
para4['Intercept']

5.168023282705353

In [15]:
# Mean wage of married men, selected with a boolean mask.
wage1.loc[(wage1['female'] == 0) & (wage1['married'] == 1), 'wage'].mean()

7.983031926002909

In [16]:
# Intercept plus the `marmale` coefficient. Coefficients are looked up by
# label: `para4` is indexed by coefficient names, and integer keys on such a
# Series only work through a deprecated positional fallback.
para4['Intercept'] + para4['marmale']

7.983031926002912

In [17]:
# Mean wage of unmarried women, selected with a boolean mask.
wage1.loc[(wage1['female'] == 1) & (wage1['married'] == 0), 'wage'].mean()

4.611583345135053

In [18]:
# Intercept plus the `singfem` coefficient. Coefficients are looked up by
# label: `para4` is indexed by coefficient names, and integer keys on such a
# Series only work through a deprecated positional fallback.
para4['Intercept'] + para4['singfem']

4.611583345135056

In [19]:
# Mean wage of married women, selected with a boolean mask.
wage1.loc[(wage1['female'] == 1) & (wage1['married'] == 1), 'wage'].mean()

4.565909099398238

In [20]:
# Intercept plus the `marfem` coefficient. Coefficients are looked up by
# label: `para4` is indexed by coefficient names, and integer keys on such a
# Series only work through a deprecated positional fallback.
para4['Intercept'] + para4['marfem']

4.5659090993982385

In [21]:
# Female dummy together with education, experience and tenure.
form_1 = 'wage ~ female + educ + exper+ tenure'

res_1 = ols(formula=form_1, data=wage1).fit()

res_1.params

Intercept   -1.567939
female      -1.810852
educ         0.571505
exper        0.025396
tenure       0.141005
dtype: float64

In [22]:
# Log-wage model; `female:educ` adds the interaction of the female dummy
# with years of education.
form_2 = 'np.log(wage) ~ female + female:educ + educ + exper + tenure'

res_2 = ols(formula=form_2, data=wage1).fit()

In [23]:
# Print only the second table of the summary, i.e. the coefficient estimates.
print(res_2.summary().tables[1])

                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept       0.4647      0.123      3.781      0.000       0.223       0.706
female         -0.2104      0.174     -1.209      0.227      -0.552       0.131
female:educ    -0.0072      0.014     -0.534      0.593      -0.034       0.019
educ            0.0903      0.009     10.359      0.000       0.073       0.107
exper           0.0046      0.002      2.850      0.005       0.001       0.008
tenure          0.0174      0.003      5.849      0.000       0.012       0.023


In [24]:
# Joint null hypothesis: the coefficients on `female` and `female:educ`
# are both zero.
hypotheses = 'female=0, female:educ=0'

# p-value of the F test of that joint hypothesis.
res_2.f_test(hypotheses).pvalue

array(3.89455469e-14)

In [25]:
# Keep only the three columns used below, as a frame separate from `wage1`.
df = wage1.loc[:, ['wage', 'female', 'educ']].copy()

In [26]:
# Add a string-valued version of the female dummy: 1 -> 'female', 0 -> 'male'.
df['sex'] = df['female'].replace({0: 'male', 1: 'female'})

In [27]:
# Preview the first three rows of `df`.
df.head(3)

Unnamed: 0,wage,female,educ,sex
0,3.1,1,11,female
1,3.24,1,12,female
2,3.0,0,11,male


In [28]:
# C() marks the string column `sex` as categorical in the formula. With no
# reference level given, the fitted model reports a `T.male` coefficient,
# i.e. 'female' is the base category.
form_c = 'wage ~  C(sex) + educ'

res_c = ols(formula=form_c, data=df).fit()

res_c.params

Intercept        -1.650545
C(sex)[T.male]    2.273362
educ              0.506452
dtype: float64

In [29]:
# Treatment("male") makes 'male' the base category, so the reported
# coefficient is for 'female'.
form_cm = 'wage ~  C(sex,Treatment("male")) + educ'

res_cm = ols(formula=form_cm, data=df).fit()

res_cm.params

Intercept                              0.622817
C(sex, Treatment("male"))[T.female]   -2.273362
educ                                   0.506452
dtype: float64

In [30]:
# Same model written with the numeric 0/1 `female` dummy instead of the
# categorical `sex` column.
form_ca = 'wage ~  female + educ'

res_ca = ols(formula=form_ca, data=df).fit()

res_ca.params

Intercept    0.622817
female      -2.273362
educ         0.506452
dtype: float64

In [31]:
# Load the discrim data set and keep it as `discrim`.
discrim = wooldridge.data('discrim')
# Same call with description=True: shows the data set's name, size and
# variable labels instead of the data.
wooldridge.data('discrim',description=True)

name of dataset: discrim
no of variables: 37
no of observations: 410

+----------+----------------------------------------------+
| variable | label                                        |
+----------+----------------------------------------------+
| psoda    | price of medium soda, 1st wave               |
| pfries   | price of small fries, 1st wave               |
| pentree  | price entree (burger or chicken), 1st wave   |
| wagest   | starting wage, 1st wave                      |
| nmgrs    | number of managers, 1st wave                 |
| nregs    | number of registers, 1st wave                |
| hrsopen  | hours open, 1st wave                         |
| emp      | number of employees, 1st wave                |
| psoda2   | price of medium soday, 2nd wave              |
| pfries2  | price of small fries, 2nd wave               |
| pentree2 | price entree, 2nd wave                       |
| wagest2  | starting wage, 2nd wave                      |
| nmgrs2   | number of manager

In [32]:
# Number of observations for each distinct value of `chain`.
discrim['chain'].value_counts()

1    171
3     99
2     80
4     60
Name: chain, dtype: int64

In [33]:
# C(chain) treats the numeric chain code as categorical rather than as a
# number, so each chain other than the base one gets its own dummy.
form_p = 'np.log(pfries) ~ prpblck + np.log(income) + C(chain)'

res_p = ols(formula=form_p, data=discrim).fit()

# Print only the coefficient table of the summary.
print(res_p.summary().tables[1])

                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept         -1.0812      0.222     -4.860      0.000      -1.519      -0.644
C(chain)[T.2]     -0.0682      0.014     -4.943      0.000      -0.095      -0.041
C(chain)[T.3]      0.0811      0.013      6.215      0.000       0.055       0.107
C(chain)[T.4]     -0.0434      0.015     -2.892      0.004      -0.073      -0.014
prpblck            0.1317      0.031      4.185      0.000       0.070       0.194
np.log(income)     0.0914      0.021      4.441      0.000       0.051       0.132


In [34]:
# Keep only the columns used in the regression, as a frame separate from
# `discrim`.
df_c = discrim.loc[:, ['pfries', 'prpblck', 'income', 'chain']].copy()
df_c.head()

Unnamed: 0,pfries,prpblck,income,chain
0,1.06,0.171154,44534.0,3
1,0.91,0.171154,44534.0,1
2,0.91,0.04736,41164.0,1
3,1.02,0.052839,50366.0,3
4,,0.03448,72287.0,1


In [35]:
# Replace the numeric chain codes with readable names.
# The column is assigned as a whole (df_c['chain'] = ...) rather than through
# df_c.loc[:, 'chain'] = ...: the .loc form writes into the existing integer
# column in place, and storing strings there relies on an implicit dtype
# change that recent pandas versions deprecate.
# NOTE(review): 'Berger_King' is presumably a typo for 'Burger_King'; it is
# kept unchanged so the recorded outputs below still match.
df_c['chain'] = df_c['chain'].replace(
                        {1:'Berger_King',2:'KFC',3:'Roy_Rogers',4:'Wendys'})

df_c.head()

Unnamed: 0,pfries,prpblck,income,chain
0,1.06,0.171154,44534.0,Roy_Rogers
1,0.91,0.171154,44534.0,Berger_King
2,0.91,0.04736,41164.0,Berger_King
3,1.02,0.052839,50366.0,Roy_Rogers
4,,0.03448,72287.0,Berger_King


In [36]:
# Show each column's non-null count and dtype.
df_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410 entries, 0 to 409
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   pfries   393 non-null    float64
 1   prpblck  409 non-null    float64
 2   income   409 non-null    float64
 3   chain    410 non-null    object 
dtypes: float64(3), object(1)
memory usage: 12.9+ KB


In [37]:
# Convert the chain names to a categorical column, then show the dtypes again.
df_c['chain'] = df_c['chain'].astype('category')
df_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410 entries, 0 to 409
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   pfries   393 non-null    float64 
 1   prpblck  409 non-null    float64 
 2   income   409 non-null    float64 
 3   chain    410 non-null    category
dtypes: category(1), float64(3)
memory usage: 10.3 KB


In [38]:
# `chain` enters the formula without a C() wrapper here and is still
# expanded into one dummy per non-base chain.
form_c = 'np.log(pfries) ~ prpblck + np.log(income) + chain'

res_c = ols(formula=form_c, data=df_c).fit()

# Print only the coefficient table of the summary.
print(res_c.summary().tables[1])

                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              -1.0812      0.222     -4.860      0.000      -1.519      -0.644
chain[T.KFC]           -0.0682      0.014     -4.943      0.000      -0.095      -0.041
chain[T.Roy_Rogers]     0.0811      0.013      6.215      0.000       0.055       0.107
chain[T.Wendys]        -0.0434      0.015     -2.892      0.004      -0.073      -0.014
prpblck                 0.1317      0.031      4.185      0.000       0.070       0.194
np.log(income)          0.0914      0.021      4.441      0.000       0.051       0.132
