In [1]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

from dmba.featureSelection import stepwise_selection

from dmba.metric import AIC_score

In [2]:
csv = pd.read_csv("./Datasets/house_sales.csv",delimiter="\t")
data = csv.drop(["DocumentDate","ym","PropertyType","SalePrice"], axis=1)
data

Unnamed: 0,PropertyID,zhvi_px,zhvi_idx,AdjSalePrice,NbrLivingUnits,SqFtLot,SqFtTotLiving,SqFtFinBasement,Bathrooms,Bedrooms,BldgGrade,YrBuilt,YrRenovated,TrafficNoise,LandVal,ImpsVal,ZipCode,NewConstruction
1,1000102,405100,0.930836,300805.0,2,9373,2400,0,3.00,6,7,1991,0,0,70000,229000,98002,False
2,1200013,404400,0.929228,1076162.0,1,20156,3764,1452,3.75,4,10,2005,0,0,203000,590000,98166,True
3,1200019,425600,0.977941,761805.0,1,26036,2060,900,1.75,4,8,1947,0,0,183000,275000,98166,False
4,2800016,418400,0.961397,442065.0,1,8618,3200,1640,3.75,5,7,1966,0,0,104000,229000,98168,False
5,2800024,351600,0.807904,297065.0,1,8620,1720,0,1.75,4,7,1948,0,0,104000,205000,98168,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27057,9842300710,318700,0.732307,443803.0,1,5468,1480,590,1.75,3,7,1951,0,0,201000,172000,98126,False
27058,9845500010,433500,0.996094,1586196.0,1,23914,4720,910,4.50,4,11,2000,0,1,703000,951000,98040,False
27061,9899200010,325300,0.747472,220744.0,1,11170,1070,0,1.00,4,6,1971,0,0,92000,130000,98055,False
27062,9900000355,400600,0.920496,342207.0,1,6223,1345,0,2.00,3,7,1939,0,0,103000,212000,98166,False


In [3]:
predictors = ['SqFtLot','Bedrooms','SqFtFinBasement','Bathrooms','NewConstruction']
outcome='AdjSalePrice'

In [4]:
data[predictors]

Unnamed: 0,SqFtLot,Bedrooms,SqFtFinBasement,Bathrooms,NewConstruction
1,9373,6,0,3.00,False
2,20156,4,1452,3.75,True
3,26036,4,900,1.75,False
4,8618,5,1640,3.75,False
5,8620,4,0,1.75,False
...,...,...,...,...,...
27057,5468,3,590,1.75,False
27058,23914,4,910,4.50,False
27061,11170,4,0,1.00,False
27062,6223,3,0,2.00,False


In [5]:
data_without_dummies = pd.get_dummies(data[predictors],drop_first=True)
data_without_dummies

Unnamed: 0,SqFtLot,Bedrooms,SqFtFinBasement,Bathrooms,NewConstruction
1,9373,6,0,3.00,False
2,20156,4,1452,3.75,True
3,26036,4,900,1.75,False
4,8618,5,1640,3.75,False
5,8620,4,0,1.75,False
...,...,...,...,...,...
27057,5468,3,590,1.75,False
27058,23914,4,910,4.50,False
27061,11170,4,0,1.00,False
27062,6223,3,0,2.00,False


In [6]:
data_without_dummies['NewConstruction'] = [1 if d else 0 for d in data_without_dummies['NewConstruction']]
data_without_dummies

Unnamed: 0,SqFtLot,Bedrooms,SqFtFinBasement,Bathrooms,NewConstruction
1,9373,6,0,3.00,0
2,20156,4,1452,3.75,1
3,26036,4,900,1.75,0
4,8618,5,1640,3.75,0
5,8620,4,0,1.75,0
...,...,...,...,...,...
27057,5468,3,590,1.75,0
27058,23914,4,910,4.50,0
27061,11170,4,0,1.00,0
27062,6223,3,0,2.00,0


In [7]:
# Entrenamiento con los datos
model = sm.OLS(data[outcome],data_without_dummies.assign(const=1))
results = model.fit()

results.summary()

0,1,2,3
Dep. Variable:,AdjSalePrice,R-squared:,0.313
Model:,OLS,Adj. R-squared:,0.312
Method:,Least Squares,F-statistic:,2063.0
Date:,"Thu, 12 May 2022",Prob (F-statistic):,0.0
Time:,18:17:16,Log-Likelihood:,-319740.0
No. Observations:,22687,AIC:,639500.0
Df Residuals:,22681,BIC:,639500.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
SqFtLot,1.0293,0.074,13.936,0.000,0.885,1.174
Bedrooms,-1790.1275,2852.539,-0.628,0.530,-7381.299,3801.044
SqFtFinBasement,140.5929,5.192,27.079,0.000,130.416,150.769
Bathrooms,2.436e+05,3428.979,71.047,0.000,2.37e+05,2.5e+05
NewConstruction,-3.92e+04,7211.965,-5.435,0.000,-5.33e+04,-2.51e+04
const,-8159.1692,8558.826,-0.953,0.340,-2.49e+04,8616.717

0,1,2,3
Omnibus:,25897.944,Durbin-Watson:,1.205
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7339792.177
Skew:,5.561,Prob(JB):,0.0
Kurtosis:,90.412,Cond. No.,130000.0


In [8]:
data_without_dummies.assign(const=1)

Unnamed: 0,SqFtLot,Bedrooms,SqFtFinBasement,Bathrooms,NewConstruction,const
1,9373,6,0,3.00,0,1
2,20156,4,1452,3.75,1,1
3,26036,4,900,1.75,0,1
4,8618,5,1640,3.75,0,1
5,8620,4,0,1.75,0,1
...,...,...,...,...,...,...
27057,5468,3,590,1.75,0,1
27058,23914,4,910,4.50,0,1
27061,11170,4,0,1.00,0,1
27062,6223,3,0,2.00,0,1


In [9]:
# Stepwise selection

data_stepwise = data.drop(['AdjSalePrice'],axis=1)
def train(variables):
    if len(variables)==0:
        return None
    model = LinearRegression()
    model.fit(data_stepwise[variables],data[outcome],sample_weight=weighted_data['Weight'])
    return model

def score_model(model,variables):
    if len(variables)==0:
        return None # Como obtener el AIC_score cuando no hay variables

    aic = AIC_score(data[outcome],model.predict(data_stepwise[variables]),model)
    return aic

model,variables = stepwise_selection(data_stepwise.columns,train_model=train,score_model=score_model,verbose=True,direction='backward')

variables

NameError: name 'weighted_data' is not defined

In [None]:
selected_data = data[variables]
selected_data['NewConstruction'] = [1 if d else 0 for d in selected_data['NewConstruction']]

model = sm.OLS(data[outcome],selected_data.assign(const=1))
results = model.fit()

print(results.summary())

NameError: name 'variables' is not defined

In [None]:
# weighted regression
# A partir de los años
weighted_data = csv.drop(["ym","PropertyType","SalePrice","YrBuilt"], axis=1)
weighted_data

Unnamed: 0,DocumentDate,PropertyID,zhvi_px,zhvi_idx,AdjSalePrice,NbrLivingUnits,SqFtLot,SqFtTotLiving,SqFtFinBasement,Bathrooms,Bedrooms,BldgGrade,YrRenovated,TrafficNoise,LandVal,ImpsVal,ZipCode,NewConstruction
1,2014-09-16,1000102,405100,0.930836,300805.0,2,9373,2400,0,3.00,6,7,0,0,70000,229000,98002,False
2,2006-06-16,1200013,404400,0.929228,1076162.0,1,20156,3764,1452,3.75,4,10,0,0,203000,590000,98166,True
3,2007-01-29,1200019,425600,0.977941,761805.0,1,26036,2060,900,1.75,4,8,0,0,183000,275000,98166,False
4,2008-02-25,2800016,418400,0.961397,442065.0,1,8618,3200,1640,3.75,5,7,0,0,104000,229000,98168,False
5,2013-03-29,2800024,351600,0.807904,297065.0,1,8620,1720,0,1.75,4,7,0,0,104000,205000,98168,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27057,2011-04-08,9842300710,318700,0.732307,443803.0,1,5468,1480,590,1.75,3,7,0,0,201000,172000,98126,False
27058,2007-09-28,9845500010,433500,0.996094,1586196.0,1,23914,4720,910,4.50,4,11,0,1,703000,951000,98040,False
27061,2012-07-09,9899200010,325300,0.747472,220744.0,1,11170,1070,0,1.00,4,6,0,0,92000,130000,98055,False
27062,2006-05-26,9900000355,400600,0.920496,342207.0,1,6223,1345,0,2.00,3,7,0,0,103000,212000,98166,False


In [None]:
weighted_data['Year'] = [ int(date.split('-')[0]) for date in weighted_data['DocumentDate']]

min_year = weighted_data['Year'].min()-1

weighted_data['Weight'] = weighted_data['Year']-min_year

weighted_data['Weight']

1        9
2        1
3        2
4        3
5        8
        ..
27057    6
27058    2
27061    7
27062    1
27063    2
Name: Weight, Length: 22687, dtype: int64

In [None]:
weighted_model = LinearRegression()

weighted_model.fit(weighted_data[variables],weighted_data[outcome],sample_weight=weighted_data['Weight'])

NameError: name 'variables' is not defined

In [None]:
print('Intercept',weighted_model.intercept_)
for name, coef in zip(variables,weighted_model.coef_):
    print(name,coef)

Intercept 1.979060471057892e-09
zhvi_px 6.186181839854168e-22
NbrLivingUnits 8.881784197001252e-16
SqFtLot 1.6189948517291047e-16
SqFtTotLiving 1.0000000000000013
SqFtFinBasement 3.7153481143290574e-11
Bedrooms -1.5560335201987795e-16
BldgGrade 4.167894444503079e-15
YrRenovated -8.260507395417508e-15
LandVal -6.283070901710702e-12
ImpsVal -2.3794066691069184e-13
ZipCode 2.3065776064879474e-12
NewConstruction 1.973331464280985e-15
