# Cross Validation

1. Reservar 1/k de los datos como muestra reservada.
2. Entrenar el modelo con los datos restantes
3. Aplicar (puntuar) el modelo a la retención de 1/k y registrar las métricas de evaluación del modelo.
4. Restaurar el primer 1/k de los datos y reservar el siguiente 1/k (excluyendo los registros que se seleccionaron la primera vez)
5. Repetir los pasos 2 y 3
6. Repetir hasta que cada registro se haya utilizado en la parte reservada
7. Promediar o combinar las métricas de evaluación del modelo.

In [144]:
# Reto: Seleccionar las caracteristicas que creen que son más importantes y hacer el proceso de regresión lineal y obtener métricas e interpretar.

In [145]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

from dmba.featureSelection import stepwise_selection

from dmba.metric import AIC_score

In [146]:
csv = pd.read_csv("./Datasets/house_sales.csv",delimiter="\t")
csv.drop(["DocumentDate","ym","PropertyType","SalePrice"], axis=1, inplace=True)

In [147]:
predictors = ['SqFtLot','Bedrooms','SqFtFinBasement','Bathrooms','NewConstruction']
outcome='AdjSalePrice'

In [148]:
csv[predictors]

Unnamed: 0,SqFtLot,Bedrooms,SqFtFinBasement,Bathrooms,NewConstruction
1,9373,6,0,3.00,False
2,20156,4,1452,3.75,True
3,26036,4,900,1.75,False
4,8618,5,1640,3.75,False
5,8620,4,0,1.75,False
...,...,...,...,...,...
27057,5468,3,590,1.75,False
27058,23914,4,910,4.50,False
27061,11170,4,0,1.00,False
27062,6223,3,0,2.00,False


In [149]:
data = pd.get_dummies(csv[predictors],drop_first=True)
data

Unnamed: 0,SqFtLot,Bedrooms,SqFtFinBasement,Bathrooms,NewConstruction
1,9373,6,0,3.00,False
2,20156,4,1452,3.75,True
3,26036,4,900,1.75,False
4,8618,5,1640,3.75,False
5,8620,4,0,1.75,False
...,...,...,...,...,...
27057,5468,3,590,1.75,False
27058,23914,4,910,4.50,False
27061,11170,4,0,1.00,False
27062,6223,3,0,2.00,False


In [150]:
data['NewConstruction'] = [1 if d else 0 for d in data['NewConstruction']]
data

Unnamed: 0,SqFtLot,Bedrooms,SqFtFinBasement,Bathrooms,NewConstruction
1,9373,6,0,3.00,0
2,20156,4,1452,3.75,1
3,26036,4,900,1.75,0
4,8618,5,1640,3.75,0
5,8620,4,0,1.75,0
...,...,...,...,...,...
27057,5468,3,590,1.75,0
27058,23914,4,910,4.50,0
27061,11170,4,0,1.00,0
27062,6223,3,0,2.00,0


In [151]:
# Entrenamiento con los datos
model = sm.OLS(csv[outcome],data.assign(const=1))
results = model.fit()

results.summary()

0,1,2,3
Dep. Variable:,AdjSalePrice,R-squared:,0.313
Model:,OLS,Adj. R-squared:,0.312
Method:,Least Squares,F-statistic:,2063.0
Date:,"Thu, 05 May 2022",Prob (F-statistic):,0.0
Time:,18:46:27,Log-Likelihood:,-319740.0
No. Observations:,22687,AIC:,639500.0
Df Residuals:,22681,BIC:,639500.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
SqFtLot,1.0293,0.074,13.936,0.000,0.885,1.174
Bedrooms,-1790.1275,2852.539,-0.628,0.530,-7381.299,3801.044
SqFtFinBasement,140.5929,5.192,27.079,0.000,130.416,150.769
Bathrooms,2.436e+05,3428.979,71.047,0.000,2.37e+05,2.5e+05
NewConstruction,-3.92e+04,7211.965,-5.435,0.000,-5.33e+04,-2.51e+04
const,-8159.1692,8558.826,-0.953,0.340,-2.49e+04,8616.717

0,1,2,3
Omnibus:,25897.944,Durbin-Watson:,1.205
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7339792.177
Skew:,5.561,Prob(JB):,0.0
Kurtosis:,90.412,Cond. No.,130000.0


In [152]:

def train(variables):
    if len(variables)==0:
        return None
    model = LinearRegression()
    model.fit(csv[variables],csv[outcome])
    return model

def score_model(model,variables):
    if len(variables)==0:
        return None # Como obtener el AIC_score cuando no hay variables
    return AIC_score(csv[outcome],model.predict(csv[variables]),model)

model,variables = stepwise_selection(csv.columns,train_model=train,score_model=score_model,verbose=True,direction='backward')

variables

Variables: PropertyID, zhvi_px, zhvi_idx, AdjSalePrice, NbrLivingUnits, SqFtLot, SqFtTotLiving, SqFtFinBasement, Bathrooms, Bedrooms, BldgGrade, YrBuilt, YrRenovated, TrafficNoise, LandVal, ImpsVal, ZipCode, NewConstruction
Start: score=-918244.76, constant
Step: score=-986141.51, remove SqFtLot
Step: score=-986141.51, unchanged None


['PropertyID',
 'zhvi_px',
 'zhvi_idx',
 'AdjSalePrice',
 'NbrLivingUnits',
 'SqFtTotLiving',
 'SqFtFinBasement',
 'Bathrooms',
 'Bedrooms',
 'BldgGrade',
 'YrBuilt',
 'YrRenovated',
 'TrafficNoise',
 'LandVal',
 'ImpsVal',
 'ZipCode',
 'NewConstruction']

In [153]:
data2 = csv[variables]
data2['NewConstruction'] = [1 if d else 0 for d in data2['NewConstruction']]

model = sm.OLS(csv[outcome],data2.assign(const=1))
results = model.fit()

results.summary()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['NewConstruction'] = [1 if d else 0 for d in data2['NewConstruction']]


0,1,2,3
Dep. Variable:,AdjSalePrice,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,3.234e+28
Date:,"Thu, 05 May 2022",Prob (F-statistic):,0.0
Time:,18:46:29,Log-Likelihood:,338350.0
No. Observations:,22687,AIC:,-676700.0
Df Residuals:,22670,BIC:,-676500.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
PropertyID,2.748e-20,1.87e-19,0.147,0.883,-3.38e-19,3.93e-19
zhvi_px,-1.005e-15,1.48e-14,-0.068,0.946,-3e-14,2.8e-14
zhvi_idx,-1.431e-16,1.82e-18,-78.606,0.000,-1.47e-16,-1.4e-16
AdjSalePrice,1.0000,3.86e-15,2.59e+14,0.000,1.000,1.000
NbrLivingUnits,-6.878e-12,3.51e-09,-0.002,0.998,-6.89e-09,6.88e-09
SqFtTotLiving,-3.715e-14,1.4e-12,-0.026,0.979,-2.79e-12,2.71e-12
SqFtFinBasement,-3.984e-14,1.52e-12,-0.026,0.979,-3.02e-12,2.94e-12
Bathrooms,2.49e-11,1.25e-09,0.020,0.984,-2.42e-09,2.47e-09
Bedrooms,-3.894e-12,7.92e-10,-0.005,0.996,-1.56e-09,1.55e-09

0,1,2,3
Omnibus:,20963.77,Durbin-Watson:,0.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3719508.803
Skew:,-3.909,Prob(JB):,0.0
Kurtosis:,65.239,Cond. No.,8.9e+21
