In [None]:
# Carguemos modulos y datos
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Machine learning : scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Dataset: read HousePrices.csv from its remote location into the DataFrame `df`
URL_DATASET = "https://raw.githubusercontent.com/robintux/Datasets4StackOverFlowQuestions/master/HousePrices.csv"
df = pd.read_csv(URL_DATASET)

In [None]:
# Column names of the loaded DataFrame (displayed as the cell output)
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [None]:
# Data types of the columns: np.unique collapses the per-column dtype array
# into the distinct dtypes that occur in df
np.unique(df.dtypes.values)

array([dtype('int64'), dtype('float64'), dtype('O')], dtype=object)

In [None]:
# Build the list of numeric variables (int64 and float64 columns).
# The row identifier "Id" is excluded explicitly by name: dropping "whatever
# numeric column comes first" would silently remove a real feature (and keep
# "Id") if the column order of the CSV ever changed.
ListaVarNumericas = [
    col
    for col in df.select_dtypes(include=["int64", "float64"]).columns
    if col != "Id"
]

# Display the resulting list as the cell output
ListaVarNumericas


['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice']

In [None]:
# Collect the names of every column whose dtype is object
ListaVarObject = [
    nombre for nombre, tipo in df.dtypes.items() if tipo == "object"
]
ListaVarObject

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [None]:
# Numeric variables: number of missing values in each column
df[ListaVarNumericas].isna().sum()

MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
SalePrice          0
dtype: int64

In [None]:
# Fill the missing values of the numeric variables with each column's mean.
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split done in a later cell, so test rows influence the fill values.
medias_numericas = df[ListaVarNumericas].mean()
df[ListaVarNumericas] = df[ListaVarNumericas].fillna(medias_numericas)

In [None]:
# Sanity check: total count of missing values left in the numeric columns
df[ListaVarNumericas].isna().sum().sum()

0

In [None]:
# Define the model variables.
# Target and features are selected by name so that the target can never end up
# among the predictors, regardless of where "SalePrice" sits in the list.
y = df["SalePrice"]
X = df[[col for col in ListaVarNumericas if col != "SalePrice"]]

# Split the dataset (80% train / 20% test).
# A fixed random_state makes the metrics printed below reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size = 0.8, random_state = 42
)

# Instantiate LinearRegression
HousePrice_Model1 = LinearRegression()

# Fit the model on the training partition
HousePrice_Model1.fit(X_train, y_train)

# Predict the dependent variable for the held-out partition
SalePrice_Model1 = HousePrice_Model1.predict(X_test)

# R2 on the training partition (in-sample goodness of fit)
HousePrice_Model1_R2 = HousePrice_Model1.score(X_train, y_train)
print("R2 : %.2f" %(HousePrice_Model1_R2))

# R2 on the held-out partition, so that R2 and MAPE can be compared on the same data
HousePrice_Model1_R2_test = HousePrice_Model1.score(X_test, y_test)
print("R2 (test) : %.2f" %(HousePrice_Model1_R2_test))

# MAPE on the held-out partition, expressed as a percentage
HousePrice_Model1_MAPE = metrics.mean_absolute_percentage_error(y_test, SalePrice_Model1)*100
print("MAPE : %.2f" %(HousePrice_Model1_MAPE) )

R2 : 0.80
MAPE : 11.78


In [None]:
# Package the previous cell into a function
def MakeModel_HousePrice(train_size=0.8, random_state=None):
  """Fit a linear regression for SalePrice on one random train/test split.

  Reads the module-level DataFrame ``df`` and the column list
  ``ListaVarNumericas``; every column of that list except "SalePrice" is
  used as a predictor.

  Parameters
  ----------
  train_size : float, default 0.8
    Passed to ``train_test_split`` as ``train_size``.
  random_state : int or None, default None
    Passed to ``train_test_split``. With None each call draws a different
    split (the behaviour relied on when averaging over many runs); pass an
    int to obtain a reproducible split.

  Returns
  -------
  dict
    "R2"      : R2 of the fitted model on the training partition.
    "R2_test" : R2 of the fitted model on the held-out partition.
    "MAPE"    : mean absolute percentage error on the held-out partition,
                multiplied by 100.
  """
  # Select target and predictors by name rather than by position, so the
  # target cannot leak into X if "SalePrice" is not the last list entry.
  y = df["SalePrice"]
  X = df[[col for col in ListaVarNumericas if col != "SalePrice"]]

  # Split the dataset
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, train_size=train_size, random_state=random_state
  )

  # Instantiate and fit the model on the training partition
  HousePrice_Model1 = LinearRegression()
  HousePrice_Model1.fit(X_train, y_train)

  # Predictions of the dependent variable for the held-out partition
  SalePrice_Model1 = HousePrice_Model1.predict(X_test)

  # In-sample R2 (kept under the original "R2" key for existing callers)
  HousePrice_Model1_R2 = HousePrice_Model1.score(X_train, y_train)

  # Out-of-sample R2, measured on the same rows as the MAPE
  HousePrice_Model1_R2_test = HousePrice_Model1.score(X_test, y_test)

  # MAPE on the held-out partition, as a percentage
  HousePrice_Model1_MAPE = metrics.mean_absolute_percentage_error(y_test, SalePrice_Model1)*100

  return {
      "R2": HousePrice_Model1_R2,
      "R2_test": HousePrice_Model1_R2_test,
      "MAPE": HousePrice_Model1_MAPE,
  }

In [None]:
# Estimate representative values of R2 and MAPE by averaging over 200
# independent fits, each on its own random train/test split
resultados = [MakeModel_HousePrice() for _ in range(200)]

ListaMAPE_SalePrice = [resultado["MAPE"] for resultado in resultados]
ListaR2_Model1 = [resultado["R2"] for resultado in resultados]

# Arithmetic mean of each metric across the runs
MAPE_final = sum(ListaMAPE_SalePrice) / len(ListaMAPE_SalePrice)
R2_final = sum(ListaR2_Model1) / len(ListaR2_Model1)

print("""
  MAPE : %.3f
  R2 : %.3f

""" % (MAPE_final, R2_final))



  MAPE : 13.303
  R2 : 0.818


