# Modelo Predictivo de Precios de Casas

## Dependencias

In [34]:
import pandas as pd
import numpy as np
from numpy import arange 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split as tt_split 
from sklearn.preprocessing import StandardScaler as SScaler

import tensorflow as tf
from tensorflow import keras
from sklearn.linear_model import LinearRegression as Lreg
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.neural_network import MLPClassifier as MLPC
from sklearn.metrics import accuracy_score as acc
import xgboost as xgb

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error as MSLE #se usa este indicador porque es lo que se pide optimizar

## Carga de Datos

In [2]:
#Set de entrenamiento
df = pd.read_csv('train.csv')

In [3]:
##df.info()

In [4]:
train_info = df.describe().iloc[1,:]

In [5]:
df_test = pd.read_csv('test.csv')

In [6]:
test_info = df_test.describe().iloc[1,:]

In [7]:
#Diferencia entre set de entrenamiento y prueba
#Esta parte es necesaria dada la competencia, no seria un tema si fuera un caso real producto 
#de que nos encargariamos de que la distribucion tuviera las mismas caracteristicas en 
#entrenamiento y prueba

In [74]:
#Diferencia Absoluta
#Esto se hace para comprobar que la distribucion de entrenamiento sea igual a la de prueba
abs((train_info - test_info)/test_info).sort_values(ascending=False)

3SsnPorch        0.900149
Id               0.666438
LowQualFinSF     0.649353
PoolArea         0.581627
MiscVal          0.252354
BsmtHalfBath     0.117606
ScreenPorch      0.117406
BsmtFinSF2       0.115357
EnclosedPorch    0.094426
LotArea          0.071052
2ndFlrSF         0.064499
Fireplaces       0.054702
MoSold           0.035670
OpenPorchSF      0.034227
MasVnrArea       0.029552
BsmtUnfSF        0.023355
LotFrontage      0.021429
BsmtFullBath     0.020973
TotRmsAbvGrd     0.020769
GrLivArea        0.019796
HalfBath         0.013824
WoodDeckSF       0.011481
TotalBsmtSF      0.010813
BsmtFinSF1       0.010100
MSSubClass       0.008384
1stFlrSF         0.005268
BedroomAbvGr     0.004355
KitchenAbvGr     0.003914
OverallCond      0.003878
FullBath         0.003737
OverallQual      0.003371
YearRemodAdd     0.000606
GarageCars       0.000569
GarageArea       0.000447
GarageYrBlt      0.000397
YearBuilt        0.000046
YrSold           0.000023
SalePrice             NaN
Name: mean, 

In [11]:
#Explorando las variables

In [6]:
from pandas_profiling import ProfileReport as PR

In [7]:
prof = PR(df)
prof.to_file(output_file='train_ds.html')

prof2 = PR(df_test)
prof.to_file(output_file='test_ds.html')

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=95.0, style=ProgressStyle(descrip…




KeyboardInterrupt: 

# Funciones de Generación de Variables

In [7]:
#Variables Continuas
def get_var_con(df):
    df_con = df[['OverallQual','OverallCond','YearBuilt',\
             'TotalBsmtSF','GarageYrBlt','Fireplaces',\
             'GarageArea','FullBath','HalfBath',\
             '2ndFlrSF','GrLivArea','YearRemodAdd',\
             'MasVnrArea'
             ]]
    return df_con

In [8]:
#Diccionario de cambios
Variables_dict = [ \
[['Utilities'],{'AllPub':4,'NoSewr':3, 'NoSeWa':2, 'ELO':1}], \
[['ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC',\
  'KitchenQual','FireplaceQu','GarageQual','GarageCond','PoolQC'] \
 , {'Ex':5,'Gd':4, 'TA':3, 'Fa':2, 'Po':1, np.nan : 0}], \
[['BsmtExposure'] , {'Gd':4,'Av':3, 'Mn':2, 'No':1, np.nan : 0}], \
[['BsmtFinType1','BsmtFinType2'] , {'GLQ':6,'ALQ':5, 'BLQ':4, 'Rec':3,'LwQ':2,'Unf':1,np.nan : 0}], \
[['CentralAir'],{'N':0, 'Y':1 ,np.nan : 0}], \
[['GarageFinish'] , {'Fin':3,'RFn':2, 'Unf':1, np.nan : 0}], \
[['Fence'] , {'GdPrv':4,'MnPrv':3, 'GdWo':2, 'MnWw':1, np.nan : 0}] \
]

In [9]:
def reemplazo_dict(df, x, dic):
    df[x] = df[x].map(dic).fillna(df[x])
    return df

In [10]:
def get_var_dicc(df):
    #Variables para Cambio por diccionario
    df_dict = df[['Utilities',
    'ExterQual','ExterCond','BsmtQual','BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'HeatingQC',
    'CentralAir',
    'KitchenQual',
    'FireplaceQu',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence']]
    for elemento in Variables_dict:
        for columna in elemento[0]:
            df_dict = reemplazo_dict(df_dict, columna, elemento[1])  
    return df_dict

In [11]:
def AgregaOneHotEncoding(df, x):
    lista_tipos = tuple(df[x].unique())
    sub_df = pd.DataFrame(lista_tipos, columns=[x])
    dum_df = pd.get_dummies(sub_df, columns = [x], prefix = [x] )
    sub_df = sub_df.join(dum_df)
    sub_df
    
    df_final = df.merge(sub_df, how='left', on=x)
    df_final = df_final.drop(x,1)
    return df_final

In [12]:
def get_var_OHE(df):
    #Variables para OneHotEncoding
    df_OHE = df[['MSZoning',
    'Street',
    'Alley',
    'LotShape',
    'LandContour',
    'LotConfig',
    'LandSlope',
    'Neighborhood',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Foundation',
    'Heating',
    'Electrical',
    'Functional',
    'GarageType',
    'PavedDrive',
    'MiscFeature',
    'SaleType',
    'SaleCondition']]
    
    for columna in  list(df_OHE.columns):
        df_OHE = AgregaOneHotEncoding(df_OHE, columna)
    return df_OHE

In [13]:
#Funcion que agrupa los tres tipos de variables y define los flujos para el set de entrenamiento y testeo
def Preprocessing(df,train_flg):
    
    df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
    df['GarageArea'] = df['GarageArea'].fillna(0)
    
        
    Scale = SScaler()

    if train_flg == 1:
        y = df['SalePrice'].values
        
    
        X1 = get_var_con(df)
        
        X2 = get_var_dicc(df)
        
        X3 = get_var_OHE(df)
 
        
        X = pd.concat([X1,X2,X3], axis=1).values

        X_out = pd.concat([X1,X2,X3], axis=1)

        X_train, X_test, y_train, y_test = tt_split(X, y, train_size= 0.67, random_state=42)



        X_train = Scale.fit_transform(X_train)

        X_test = Scale.fit_transform(X_test)

        X_train = np.nan_to_num(X_train, nan = 0)
        X_test = np.nan_to_num(X_test, nan = 0)
        
        return  X_train, X_test, y_train, y_test, X_out
    else:
        
        X1 = get_var_con(df)
        
        X2 = get_var_dicc(df)
        
        X3 = get_var_OHE(df)
 
        
        X = pd.concat([X1,X2,X3], axis=1)
        
        for element in [item for item in list(data_train.columns) if item not in list(X.columns)]:
            X[element] = 0
        
        X = X[list(data_train.columns)].values
        
        X_out = pd.concat([X1,X2,X3], axis=1)
        
        X_val = Scale.fit_transform(X)
        
        Id = df['Id'].values
        
        X_val = np.nan_to_num(X_val, nan = 0)
        
        return Id , X_val , X_out

In [14]:
#Funcion que prueba Modelos

In [15]:
def Prueba_modelo(X_train, X_test, y_train, y_test , modelo_input, nombre_modelo):
    modelo = modelo_input 
    modelo.fit(X_train, y_train)
    y_pred = modelo.predict(X_test)
    y_pred_train = modelo.predict(X_train)
    
    np.clip(a= y_pred, a_min=0, a_max=None , out=y_pred)
    np.clip(a= y_pred_train, a_min=0, a_max=None , out=y_pred_train)
    
    error = MSLE(y_test, y_pred)
    error_train = MSLE(y_train, y_pred_train)
    dif_error = error - error_train
    
    nombre = nombre_modelo
    modelo_entrenado = modelo
    print(f'Modelo: {nombre}\\ error: {error}\\ error_dif: {dif_error}')
    return [nombre, error ,error_train, dif_error , modelo]

# Modelos de Baseline

In [18]:
#Modelos
Modelos = []

In [41]:
name = 'Linear'
model = Lreg()
Modelos.append([name,model])

TypeError: __init__() got an unexpected keyword argument 'n_estimators'

In [30]:
name = 'Tree'
model = DTR()
Modelos.append([name,model])

In [None]:
name = 'MultiLayer'
model = MLPC()
Modelos.append([name,model])

In [None]:
name = f'RandomForest'
model = RFR()
Modelos.append([name,model])

In [None]:
Output = []

In [None]:
for modelo in Modelos:
    Output.append(Prueba_modelo(X_train, X_test, y_train, y_test, modelo[1],modelo[0]))

In [None]:
for modelo in Output:
    final.append(modelo[:-1])   

In [16]:
X_train, X_test, y_train, y_test, data_train = Preprocessing(df,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Tuning de RandomForest

In [26]:
from sklearn.model_selection import GridSearchCV

In [28]:
rfc=RFR(random_state=42)

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['mae', 'mse']
}

In [29]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=42,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jobs

In [30]:
CV_rfc.best_params_

{'criterion': 'mae',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 200}

In [19]:
ModeloFinal =RFR(random_state=42, max_features='auto', n_estimators= 200, max_depth=8, criterion='mae')

In [20]:
ModeloFinal.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=8, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=200, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [21]:
MSLE(y_test, ModeloFinal.predict(X_test))

0.02409024329050108

## XGBOOST 

In [22]:
X_train, X_test, y_train, y_test, data_train = Preprocessing(df,1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
xgb = xgb.sklearn.XGBRegressor()

In [24]:
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [.03, 0.05, .07], #so called `eta` value
              'max_depth': [5, 6, 7],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [500]}

In [28]:
xgb_grid = GridSearchCV(xgb,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)

xgb_grid.fit(X_train,
         y_train)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  18 out of  18 | elapsed:    6.0s finished


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=2, error_score=nan,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estima...
                                    validate_parameters=None, verbosity=None),
             iid='deprecated', n_jobs=5,
             param_grid={'colsample_bytree': [0.7],
                         'learning_rate': [0.03, 0.05, 0.07],
                         'max_depth': [5, 6, 7], 'min_chil

In [29]:
xgb_grid.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.03,
 'max_depth': 6,
 'min_child_weight': 4,
 'n_estimators': 500,
 'nthread': 4,
 'objective': 'reg:linear',
 'silent': 1,
 'subsample': 0.7}

In [35]:
xgb_final =  xgb.sklearn.XGBRegressor( colsample_bytree = 0.7, learning_rate= 0.03, max_depth= 6, min_child_weight= 4,\
                                    n_estimators= 500, nthread=4, objective = 'reg:linear', silent = 1, sumsample= 0.7)

In [36]:
xgb_final.fit(X_train, y_train)

Parameters: { silent, sumsample } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.03, max_delta_step=0, max_depth=6,
             min_child_weight=4, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=4, nthread=4, num_parallel_tree=1,
             objective='reg:linear', random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, silent=1, subsample=1, sumsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [38]:
MSLE(y_test, xgb_final.predict(X_test))

0.018349679166002084

In [39]:
Id, X_val, data_test = Preprocessing(df_test,0)

Upload = xgb_final.predict(X_val)

Resultado = pd.DataFrame(list(zip(Id,Upload)), columns = ['Id','SalePrice'])

Resultado.to_csv('predicciones.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Guardando el modelo para la implementacion

In [40]:
import pickle

In [41]:
pickle.dump(xgb_final, open("ModeloFinal/xgboost_precio_casa.model", "wb"))