Importación de librerías

In [None]:
import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def rmsle_lgbm(y_pred, data):
    """Custom LightGBM evaluation metric: root mean squared logarithmic error.

    Parameters
    ----------
    y_pred : array-like
        Predicted target values.
    data : object exposing ``get_label()``
        Dataset whose labels are the true target values.

    Returns
    -------
    tuple
        ``('rmsle', score, False)``. The trailing ``False`` is the
        "is higher better" flag of LightGBM's ``feval`` protocol, i.e. a
        lower score is better.
    """
    y_true = np.asarray(data.get_label(), dtype=float)
    # A plain regression objective can emit negative predictions; log1p is
    # undefined at or below -1 and would turn the whole score into NaN, so
    # predictions are clipped at zero before taking the logarithm.
    y_hat = np.clip(np.asarray(y_pred, dtype=float), 0, None)
    log_diff = np.log1p(y_true) - np.log1p(y_hat)
    score = np.sqrt(np.mean(np.square(log_diff)))
    return 'rmsle', score, False

Carga de datasets

In [None]:
# Read the raw training CSV into a DataFrame.
Data_casas = pd.read_csv(filepath_or_buffer='house_train_raw.csv')

Predicciones solicitadas

In [None]:
# Read the houses whose prices have to be predicted and start their
# SalePrice column at zero.
Data_casas_test = pd.read_csv(filepath_or_buffer='houses_test_raw.csv').assign(SalePrice=0)

In [None]:
# Quick overview of the training data. Only the last expression of a cell is
# rendered automatically, so the summary statistics are displayed explicitly
# (`display` is provided by the IPython/Jupyter kernel).
display(Data_casas.describe())
# Column dtypes and non-null counts (info() prints on its own).
Data_casas.info()
# Number of missing values per column.
Data_casas.isna().sum()

A partir de la sumatoria de nulos, decidimos eliminar las siguientes columnas: PoolQC, MiscFeature, Alley, Fence, FireplaceQu y LotFrontage.

In [None]:
# Remove the columns discarded after inspecting the null counts.
# errors='ignore' keeps this cell re-runnable: it reassigns Data_casas from
# itself, so a second execution would otherwise raise KeyError because the
# columns are already gone.
Data_casas = Data_casas.drop(
    columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'LotFrontage'],
    errors='ignore',
)

Variable objetivo

In [None]:
# Target variable: the SalePrice column of the training data.
y = Data_casas.loc[:, 'SalePrice']
# Summary statistics of the target.
y.describe()

Búsqueda de correlación entre variables

In [None]:
# Pairwise correlations between the numeric columns only. The frame still
# contains text columns (e.g. SaleType), and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so they are filtered out explicitly.
corrmat = Data_casas.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)
# SalePrice correlates best with OverallQual and GrLivArea.

Matriz de correlación de SalePrice

In [None]:
# Number of variables to include in the matrix.
k = 10
# Names of the k columns with the largest correlation to SalePrice.
filas_top = corrmat.nlargest(k, 'SalePrice')
cols = filas_top['SalePrice'].index
# Correlation matrix restricted to those columns (columns are the variables).
cm = np.corrcoef(Data_casas[cols].values, rowvar=False)
sns.set(font_scale=1.25)
opciones_heatmap = {
    'cbar': True,
    'annot': True,
    'square': True,
    'fmt': '.2f',
    'annot_kws': {'size': 10},
    'yticklabels': cols.values,
    'xticklabels': cols.values,
}
hm = sns.heatmap(cm, **opciones_heatmap)
plt.show()

Lista de correlaciones ordenada

In [None]:
# Correlation of every numeric column with SalePrice, strongest first.
# Non-numeric columns are excluded up front because DataFrame.corr() raises
# on text data in pandas >= 2.0.
corr = Data_casas.select_dtypes(include='number').corr()
(
    corr[['SalePrice']]
    .sort_values(by='SalePrice', ascending=False)
    .style.background_gradient()
)

In [None]:
# Following the analysis, only these variables are kept as model inputs:
x = Data_casas.loc[:, [
    'GrLivArea',
    'BsmtFinSF1',
    'TotalBsmtSF',
    'OverallQual',
    'LotArea',
    'OverallCond',
    'YearBuilt',
    'GarageArea',
    'YearRemodAdd',
    'GarageYrBlt',
    'SaleType',
    'FullBath',
    'GarageCars',
]]

In [None]:
# Turn the SaleType variable into numeric indicator columns, dropping the
# first level of each encoded variable.
x = pd.get_dummies(data=x, drop_first=True)

Parámetros de nuestro modelo:

In [None]:
# LightGBM training parameters.
parametros = {
    'boosting': 'gbdt',
    'objective': 'regression',
    'num_leaves': 10,
    # Fixed key: it was misspelled 'learnnig_rage', which LightGBM does not
    # recognise, so the intended 0.05 was never applied.
    'learning_rate': 0.05,
    'max_bin': 255,
    # No built-in metric: evaluation relies on the custom feval given to lgb.train.
    'metric': 'custom',
    'verbose': -100,
    # NOTE(review): per the LightGBM docs, 'num_iterations' given in params takes
    # precedence over the num_boost_round argument of lgb.train - confirm
    # which of the two values is intended.
    'num_iterations': 5000,
    'max_depth': 7,
    'min_data_in_leaf': 5,
    'feature_fraction': 0.40,
    # NOTE(review): per the LightGBM docs, bagging only takes effect when
    # 'bagging_fraction' is also set below 1.0; it is not set here - confirm.
    'bagging_freq': 100,
    'extra_trees': True,
}

Entrenamiento de varios modelos para simular el Cross-Validation:

In [None]:
# Raw feature columns fed to the model (same selection used to build `x`).
columnas_modelo = ['GrLivArea', 'BsmtFinSF1', 'TotalBsmtSF', 'OverallQual', 'LotArea',
                   'OverallCond', 'YearBuilt', 'GarageArea', 'YearRemodAdd', 'GarageYrBlt',
                   'SaleType', 'FullBath', 'GarageCars']

# Prepare the features of the houses to predict ONCE: this does not depend on
# the loop variable, so rebuilding it on every iteration was wasted work.
# Selecting the model columns makes the earlier column drop unnecessary.
Data_casas_test_aux = pd.get_dummies(Data_casas_test[columnas_modelo], drop_first=True)
# Align the indicator columns with the training matrix: if the two files do
# not contain the same SaleType categories, get_dummies yields different
# columns and predict() would receive a mismatched feature set. Missing
# indicators are filled with 0; indicators unseen in training are dropped.
Data_casas_test_aux = Data_casas_test_aux.reindex(columns=x.columns, fill_value=0)

# Reset the accumulator so that re-running this cell does not add to the sums
# left by a previous run.
Data_casas_test['SalePrice'] = 0

for i in range(0, 500):
    # A different random train / validation split on every iteration.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=i)
    # Wrap the splits in LightGBM datasets.
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

    # Train the model. Early stopping is passed as a callback because the
    # `early_stopping_rounds` keyword was removed from lgb.train in LightGBM 4;
    # the callback form works in both older and newer releases.
    # NOTE(review): `parametros` also sets 'num_iterations', which LightGBM
    # documents as taking precedence over num_boost_round - confirm intent.
    modelo = lgb.train(parametros,
                       train_set=lgb_train,
                       valid_sets=[lgb_eval],
                       num_boost_round=500,
                       callbacks=[lgb.early_stopping(stopping_rounds=3000, verbose=False)],
                       feval=rmsle_lgbm)

    # Predict the prices of the unseen houses with this model.
    pred_nuevas_casas = modelo.predict(Data_casas_test_aux)

    # Accumulate the predictions in SalePrice; they are averaged afterwards.
    Data_casas_test['SalePrice'] = Data_casas_test['SalePrice'] + pred_nuevas_casas

In [None]:
# Once the 500 models have been trained and each has predicted the price,
# the accumulated sum is divided by 500 to obtain the average.
Data_casas_test['SalePrice'] = Data_casas_test['SalePrice'].div(500)
# Write the resulting frame to disk.
Data_casas_test.to_csv(path_or_buf='pred_test.csv')