In [7]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Load the listings. df2 is an independent copy taken before any dummy
# columns exist, because dummy_cat adds its columns to the frame it is given.
df = pd.read_csv('./house_prices.csv')
df2 = df.copy(deep=True)
df.head(n=5)

Unnamed: 0,house_id,neighborhood,area,bedrooms,bathrooms,style,price
0,1112,B,1188,3,2,ranch,598291
1,491,B,3512,5,3,victorian,1744259
2,5952,B,1134,3,2,ranch,571669
3,3525,A,1940,4,2,ranch,493675
4,5108,B,2208,6,4,victorian,1101539


In [8]:
## The function below creates 1, 0, -1 coded (effect-coded) dummy variables.

def dummy_cat(df, col):
    '''
    Add 1, 0, -1 coded dummy columns for a categorical column.

    The last distinct non-missing value of col (in order of first
    appearance) is the reference level: it gets no column of its own and
    its rows are coded -1 in every dummy column. Every other level gets a
    column named after the level, holding 1 on that level's rows and 0
    elsewhere. Rows where col is missing are coded 0 in every dummy column.

    INPUT:
    df - the dataframe where col is stored; it is modified in place
    col - the categorical column you want to dummy (as a string)
    OUTPUT:
    df - the same dataframe object with the added columns
         for dummy variables using 1, 0, -1 coding
    '''
    # Hold on to the source column so that a level whose name equals col
    # cannot change what the later comparisons see.
    values = df[col]

    # unique() reports NaN as a level while nunique() ignores it; dropping
    # missing values first keeps "which level is last" well defined.
    levels = values.dropna().unique()
    if len(levels) == 0:
        return df

    *coded_levels, reference = levels
    is_reference = (values == reference).astype('int64')

    # 1 on the level's own rows, -1 on the reference rows, 0 everywhere else.
    for level in coded_levels:
        df[level] = (values == level).astype('int64') - is_reference

    # The reference level never keeps a column of its own; remove any column
    # already carrying that name so only the non-reference dummies remain.
    if reference in df.columns:
        del df[reference]
    return df

In [9]:
# Effect-code the style column. dummy_cat writes the new columns into df and
# returns that same frame, so new_df and df refer to one object.
new_df = dummy_cat(df, col='style')
new_df.head(n=10)

Unnamed: 0,house_id,neighborhood,area,bedrooms,bathrooms,style,price,ranch,victorian
0,1112,B,1188,3,2,ranch,598291,1,0
1,491,B,3512,5,3,victorian,1744259,0,1
2,5952,B,1134,3,2,ranch,571669,1,0
3,3525,A,1940,4,2,ranch,493675,1,0
4,5108,B,2208,6,4,victorian,1101539,0,1
5,7507,C,1785,4,2,lodge,455235,-1,-1
6,4964,B,2996,5,3,victorian,1489871,0,1
7,7627,C,3263,5,3,victorian,821931,0,1
8,6571,A,1159,3,2,ranch,299903,1,0
9,5220,A,1248,3,2,victorian,321975,0,1


In [10]:
# Explicit column of ones for the intercept term of the design matrix.
new_df['intercept'] = 1

# Price regressed on the 1/0/-1 coded style columns. With this coding the
# intercept is the unweighted average of the per-style mean prices and each
# coefficient is that style's deviation from that average.
lm = sm.OLS(
    endog=new_df['price'],
    exog=new_df[['intercept', 'ranch', 'victorian']],
)
results = lm.fit()
results.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.339
Model:,OLS,Adj. R-squared:,0.339
Method:,Least Squares,F-statistic:,1548.0
Date:,"Wed, 27 May 2020",Prob (F-statistic):,0.0
Time:,21:41:52,Log-Likelihood:,-86683.0
No. Observations:,6028,AIC:,173400.0
Df Residuals:,6025,BIC:,173400.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,6.421e+05,5854.251,109.677,0.000,6.31e+05,6.54e+05
ranch,-6.695e+04,8233.489,-8.131,0.000,-8.31e+04,-5.08e+04
victorian,4.04e+05,7377.372,54.763,0.000,3.9e+05,4.18e+05

0,1,2,3
Omnibus:,1340.12,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3232.81
Skew:,1.23,Prob(JB):,0.0
Kurtosis:,5.611,Cond. No.,1.84


In [11]:
# Standard 0/1 dummy columns, one per style, attached to the untouched copy
# of the data (df2) rather than to the frame that holds the 1/0/-1 columns.
# dtype=int is explicit because pandas >= 2.0 returns boolean columns by
# default; integer 0/1 columns match the table shown here and keep the
# design matrix for the regression purely numeric.
style_dummies = pd.get_dummies(df['style'], dtype=int)
new_df2 = df2.join(style_dummies)
new_df2.head(10)

Unnamed: 0,house_id,neighborhood,area,bedrooms,bathrooms,style,price,lodge,ranch,victorian
0,1112,B,1188,3,2,ranch,598291,0,1,0
1,491,B,3512,5,3,victorian,1744259,0,0,1
2,5952,B,1134,3,2,ranch,571669,0,1,0
3,3525,A,1940,4,2,ranch,493675,0,1,0
4,5108,B,2208,6,4,victorian,1101539,0,0,1
5,7507,C,1785,4,2,lodge,455235,1,0,0
6,4964,B,2996,5,3,victorian,1489871,0,0,1
7,7627,C,3263,5,3,victorian,821931,0,0,1
8,6571,A,1159,3,2,ranch,299903,0,1,0
9,5220,A,1248,3,2,victorian,321975,0,0,1


In [12]:
# Explicit column of ones for the intercept term of the design matrix.
new_df2['intercept'] = 1

# Price regressed on the 0/1 style dummies with the lodge column left out as
# the baseline: the intercept is the lodge mean price and each coefficient is
# that style's difference from lodge.
lm2 = sm.OLS(
    endog=new_df2['price'],
    exog=new_df2[['intercept', 'ranch', 'victorian']],
)
results2 = lm2.fit()
results2.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.339
Model:,OLS,Adj. R-squared:,0.339
Method:,Least Squares,F-statistic:,1548.0
Date:,"Wed, 27 May 2020",Prob (F-statistic):,0.0
Time:,21:41:52,Log-Likelihood:,-86683.0
No. Observations:,6028,AIC:,173400.0
Df Residuals:,6025,BIC:,173400.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,3.05e+05,1.21e+04,25.120,0.000,2.81e+05,3.29e+05
ranch,2.701e+05,1.57e+04,17.153,0.000,2.39e+05,3.01e+05
victorian,7.411e+05,1.44e+04,51.396,0.000,7.13e+05,7.69e+05

0,1,2,3
Omnibus:,1340.12,Durbin-Watson:,2.004
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3232.81
Skew:,1.23,Prob(JB):,0.0
Kurtosis:,5.611,Cond. No.,4.77
