# Cross Validation

1. Reservar 1/k de los datos como muestra reservada.
2. Entrenar el modelo con los datos restantes
3. Aplicar (puntuar) el modelo a la retención de 1/k y registrar las métricas de evaluación del modelo.
4. Restaurar el primer 1/k de los datos y reservar el siguiente 1/k (excluyendo los registros que se seleccionaron la primera vez)
5. Repetir los pasos 2 y 3
6. Repetir hasta que cada registro se haya utilizado en la parte reservada
7. Promediar o combinar las métricas de evaluación del modelo.

In [135]:
# Reto: Seleccionar las caracteristicas que creen que son más importantes y hacer el proceso de regresión lineal y obtener métricas e interpretar.

In [136]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from dmba.featureSelection import stepwise_selection
from dmba.metric import AIC_score

In [137]:
data = pd.read_csv("./Datasets/house_sales.csv", delimiter="\t")

In [138]:
predictors = ['SqFtLot','Bedrooms','SqFtFinBasement','Bathrooms','NewConstruction']
outcome='AdjSalePrice'

In [139]:
data[predictors]

Unnamed: 0,SqFtLot,Bedrooms,SqFtFinBasement,Bathrooms,NewConstruction
1,9373,6,0,3.00,False
2,20156,4,1452,3.75,True
3,26036,4,900,1.75,False
4,8618,5,1640,3.75,False
5,8620,4,0,1.75,False
...,...,...,...,...,...
27057,5468,3,590,1.75,False
27058,23914,4,910,4.50,False
27061,11170,4,0,1.00,False
27062,6223,3,0,2.00,False


In [140]:
data = pd.get_dummies(data[predictors],drop_first=True)
data

Unnamed: 0,SqFtLot,Bedrooms,SqFtFinBasement,Bathrooms,NewConstruction
1,9373,6,0,3.00,False
2,20156,4,1452,3.75,True
3,26036,4,900,1.75,False
4,8618,5,1640,3.75,False
5,8620,4,0,1.75,False
...,...,...,...,...,...
27057,5468,3,590,1.75,False
27058,23914,4,910,4.50,False
27061,11170,4,0,1.00,False
27062,6223,3,0,2.00,False


In [141]:
data['NewConstruction'] = [1 if d else 0 for d in data['NewConstruction']]
data

Unnamed: 0,SqFtLot,Bedrooms,SqFtFinBasement,Bathrooms,NewConstruction
1,9373,6,0,3.00,0
2,20156,4,1452,3.75,1
3,26036,4,900,1.75,0
4,8618,5,1640,3.75,0
5,8620,4,0,1.75,0
...,...,...,...,...,...
27057,5468,3,590,1.75,0
27058,23914,4,910,4.50,0
27061,11170,4,0,1.00,0
27062,6223,3,0,2.00,0


In [142]:
# Entrenamiento con los datos

data1 = pd.read_csv("./Datasets/house_sales.csv", delimiter="\t")

model = sm.OLS(data1[outcome], data.assign(const = 1))
result = model.fit()

result.summary()

0,1,2,3
Dep. Variable:,AdjSalePrice,R-squared:,0.313
Model:,OLS,Adj. R-squared:,0.312
Method:,Least Squares,F-statistic:,2063.0
Date:,"Wed, 04 May 2022",Prob (F-statistic):,0.0
Time:,19:12:18,Log-Likelihood:,-319740.0
No. Observations:,22687,AIC:,639500.0
Df Residuals:,22681,BIC:,639500.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
SqFtLot,1.0293,0.074,13.936,0.000,0.885,1.174
Bedrooms,-1790.1275,2852.539,-0.628,0.530,-7381.299,3801.044
SqFtFinBasement,140.5929,5.192,27.079,0.000,130.416,150.769
Bathrooms,2.436e+05,3428.979,71.047,0.000,2.37e+05,2.5e+05
NewConstruction,-3.92e+04,7211.965,-5.435,0.000,-5.33e+04,-2.51e+04
const,-8159.1692,8558.826,-0.953,0.340,-2.49e+04,8616.717

0,1,2,3
Omnibus:,25897.944,Durbin-Watson:,1.205
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7339792.177
Skew:,5.561,Prob(JB):,0.0
Kurtosis:,90.412,Cond. No.,130000.0


In [143]:
def train(variables, outcome):
    if len(variables)==0:
        return None
    model = LinearRegression()
    model.fit(data1[variables], data1[outcome])
    return model

def score_model(model, variables):
    if len(variables)==0:
        return None
    return AIC_score(data1[outcome], model=model.predict(data1[variables], model))

best = stepwise_selection(data1.columns, train_model=train, score_model=score_model, verbose=True)

TypeError: train() missing 1 required positional argument: 'outcome'