In [19]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

#! pip install statsmodels

import statsmodels
import statsmodels.api as sm
from statsmodels.tools import eval_measures

# Parent of the current working directory, kept as a forward-slash string so
# it can be interpolated directly into f-string paths.
working_dir = Path.cwd()
main_directory = working_dir.parent.as_posix()

In [20]:
# Read the listings CSV from the databases folder under main_directory.
csv_path = f"{main_directory}/databases/zoopla_data.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,type,price,station,beds,baths,receptions,post_code,location,avg_sold_price_12months
0,flat,369950.0,0.5,1,1.0,0.0,M15,manchester,200730.0
1,flat,369950.0,0.5,1,1.0,0.0,M15,manchester,200730.0
2,flat,154950.0,0.5,2,1.0,1.0,M15,manchester,200730.0
3,flat,541805.0,0.2,3,1.0,1.0,M15,manchester,200730.0
4,flat,240327.0,0.2,1,1.0,1.0,M15,manchester,200730.0
...,...,...,...,...,...,...,...,...,...
4371,detached,625000.0,0.7,3,2.0,3.0,M34,manchester,193433.0
4372,semi-detached,230000.0,0.7,3,1.0,2.0,M34,manchester,193433.0
4373,semi-detached,310000.0,0.6,4,1.0,0.0,M34,manchester,193433.0
4374,terraced,130000.0,0.2,3,1.0,1.0,M34,manchester,193433.0


In [21]:
def feature_eng(df):
    """Prepare the listings frame for an OLS fit.

    One-hot encodes the ``type`` column (the first level is dropped and acts
    as the baseline) and then passes the frame through ``sm.add_constant`` so
    an intercept column is available.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``type`` column. The caller's frame is not
        modified; new frames are returned by both steps.

    Returns
    -------
    pandas.DataFrame
        Frame with ``type`` replaced by ``type_<level>`` indicator columns
        (float 0.0 / 1.0) and the intercept column from ``sm.add_constant``.

    Raises
    ------
    KeyError
        If ``df`` has no ``type`` column (raised by ``pd.get_dummies``).
    """
    # dtype=float: recent pandas versions emit bool dummies by default, and a
    # bool/float mix is converted to an object array that sm.OLS rejects.
    df = pd.get_dummies(df, columns = ['type'], drop_first = True, prefix = 'type', dtype = float)
    # One-hot encoding of 'location' is currently switched off.
    #df = pd.get_dummies(df, columns = ['location'], drop_first = True, prefix = 'loc')
    df = sm.add_constant(df)
    return df

In [22]:
# Show the frame again before feature engineering; assuming top-to-bottom
# execution, no cell between the load and this one reassigns df.
df

Unnamed: 0,type,price,station,beds,baths,receptions,post_code,location,avg_sold_price_12months
0,flat,369950.0,0.5,1,1.0,0.0,M15,manchester,200730.0
1,flat,369950.0,0.5,1,1.0,0.0,M15,manchester,200730.0
2,flat,154950.0,0.5,2,1.0,1.0,M15,manchester,200730.0
3,flat,541805.0,0.2,3,1.0,1.0,M15,manchester,200730.0
4,flat,240327.0,0.2,1,1.0,1.0,M15,manchester,200730.0
...,...,...,...,...,...,...,...,...,...
4371,detached,625000.0,0.7,3,2.0,3.0,M34,manchester,193433.0
4372,semi-detached,230000.0,0.7,3,1.0,2.0,M34,manchester,193433.0
4373,semi-detached,310000.0,0.6,4,1.0,0.0,M34,manchester,193433.0
4374,terraced,130000.0,0.2,3,1.0,1.0,M34,manchester,193433.0


In [23]:
# One-hot encode the property type and add the intercept column.
df = feature_eng(df)

# Z-score the average sold price: subtract the mean, divide by the sample
# standard deviation.
avg_sold = df['avg_sold_price_12months']
df['avg_sold_price_12months'] = (avg_sold - avg_sold.mean()) / avg_sold.std()

# Start from every column, then strip the target and the columns that are
# deliberately kept out of the model.
feature_cols = list(df.columns)
for excluded in (
    'price',      # regression target, not a feature
    'post_code',  # address text -> not needed in the model
    'station',    # left out of the model (original note only says "not needed")
    'location',   # address text -> not needed in the model
):
    feature_cols.remove(excluded)


In [24]:
# Design matrix and target. add_constant is kept as a safety net; with its
# default settings it leaves X unchanged when an intercept column already
# exists.
X = df[feature_cols]
X = sm.add_constant(X)
y = df['price']

# Ordinary least squares fit of price on the selected features.
lin_reg = sm.OLS(y, X)
results = lin_reg.fit()
df['y_pred'] = results.predict(X)

# In-sample RMSE: predictions are scored on the same rows used for fitting,
# so this is a goodness-of-fit figure, not an out-of-sample error estimate.
# eval_measures is imported explicitly in the import cell instead of being
# reached through the bare `statsmodels` package, which only resolves when
# some other import happens to have loaded the submodule already.
rmse = eval_measures.rmse(y, df['y_pred'])

print(f'the RMSE is {rmse}')
print(results.params)
results.summary()

the RMSE is 104961.62459084253
const                       21138.006830
beds                        87318.249883
baths                       22209.809028
receptions                   9659.638282
avg_sold_price_12months     17163.793063
type_end terrace          -106295.984158
type_flat                   17130.260818
type_semi-detached         -74705.567790
type_terraced             -103369.691863
dtype: float64


0,1,2,3
Dep. Variable:,price,R-squared:,0.396
Model:,OLS,Adj. R-squared:,0.395
Method:,Least Squares,F-statistic:,358.4
Date:,"Sat, 08 Jan 2022",Prob (F-statistic):,0.0
Time:,21:56:20,Log-Likelihood:,-56802.0
No. Observations:,4376,AIC:,113600.0
Df Residuals:,4367,BIC:,113700.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.114e+04,1.17e+04,1.803,0.071,-1849.773,4.41e+04
beds,8.732e+04,2597.903,33.611,0.000,8.22e+04,9.24e+04
baths,2.221e+04,2829.311,7.850,0.000,1.67e+04,2.78e+04
receptions,9659.6383,2535.075,3.810,0.000,4689.605,1.46e+04
avg_sold_price_12months,1.716e+04,1620.835,10.589,0.000,1.4e+04,2.03e+04
type_end terrace,-1.063e+05,1.19e+04,-8.933,0.000,-1.3e+05,-8.3e+04
type_flat,1.713e+04,8859.221,1.934,0.053,-238.307,3.45e+04
type_semi-detached,-7.471e+04,8145.330,-9.172,0.000,-9.07e+04,-5.87e+04
type_terraced,-1.034e+05,8627.069,-11.982,0.000,-1.2e+05,-8.65e+04

0,1,2,3
Omnibus:,5162.254,Durbin-Watson:,1.379
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1271569.106
Skew:,5.887,Prob(JB):,0.0
Kurtosis:,85.676,Cond. No.,37.0
