**Nota importante:** A la hora de quitar Outliers aumentamos la precisión de nuestro modelo, pero hay que tener en cuenta que si los quitamos en el conjunto general, es como si estubieramos "achatando" la data, por lo que el margen de error es menor y conseguimos una mejor medida. Ejemplo: el error medio de los modelos eliminando toda *y* mayor que *100* se encuentra sobre los *13* puntos en *median\_absolute\_error*

In [88]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import neighbors
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

import xgboost as xgb
from xgboost.sklearn import XGBRegressor

# Métrica
from sklearn.metrics import median_absolute_error

In [2]:
df = pd.read_csv('Train_modificado_con_pca.csv', sep = '|', encoding='utf-8')

X = df.drop(['Unnamed: 0', 'TARGET'],axis = 1)
y = df.TARGET.copy()

In [91]:
models = {
    'KNeighborsRegressor5':neighbors.KNeighborsRegressor(5, weights='uniform'),# Admite 'distance'
    'KNeighborsRegressor10':neighbors.KNeighborsRegressor(10, weights='uniform'),
    'KNeighborsRegressor15':neighbors.KNeighborsRegressor(15, weights='uniform'),
    'KNeighborsRegressor25':neighbors.KNeighborsRegressor(25, weights='uniform'),
    'KNeighborsRegressor15dist':neighbors.KNeighborsRegressor(15, weights='distance'),
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'DecisionTreeRegressor10':tree.DecisionTreeRegressor(max_depth = 10),
    'DecisionTreeRegressor20':tree.DecisionTreeRegressor(max_depth = 20),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=5, n_estimators = 20, random_state=0),
    'RandomForestRegressor40':RandomForestRegressor(max_depth=3, n_estimators = 40, random_state=0),
    'RandomForestRegressor50':RandomForestRegressor(max_depth=10, n_estimators = 50, random_state=0),
    'RandomForestRegressor100':RandomForestRegressor(max_depth=10, n_estimators = 100, random_state=0),
    'RandomForestRegressor150':RandomForestRegressor(max_depth=10, n_estimators = 150, random_state=0),
    'ExtraTreesRegressor10':ExtraTreesRegressor(n_estimators=10,random_state=0),
    'ExtraTreesRegressor50':ExtraTreesRegressor(n_estimators=50,random_state=0),
    'ExtraTreesRegressor100':ExtraTreesRegressor(n_estimators=100, random_state=0),
    'ExtraTreesRegressor150':ExtraTreesRegressor(n_estimators=150, random_state=0),
    'GradientBoostingRegressor30':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=1, random_state=0, loss='ls'),
    'GradientBoostingRegressor50':GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=1, random_state=0, loss='ls'),
    'GradientBoostingRegressor100':GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls'),
    'GradientBoostingRegressor30_md5':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0, loss='ls'),
    'GradientBoostingRegressor50_md5':GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=5, random_state=0, loss='ls'),
    'GradientBoostingRegressor100_md5':GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=0, loss='ls'),
    'GradientBoostingRegressor30_md10':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=10, random_state=0, loss='ls'),
    'GradientBoostingRegressor50_md10':GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=10, random_state=0, loss='ls'),
    'GradientBoostingRegressor100_md10':GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=10, random_state=0, loss='ls'),
    'GradientBoostingRegressor30_nols':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=1, random_state=0),
    'GradientBoostingRegressor50_nols':GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=1, random_state=0),
    'GradientBoostingRegressor100_nols':GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0),
    'GradientBoostingRegressor30_md5_nols':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0),
    'GradientBoostingRegressor50_md5_nols':GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=5, random_state=0),
    'GradientBoostingRegressor100_md5_nols':GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=0),
    'GradientBoostingRegressor30_md10_nols':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=10, random_state=0),
    'GradientBoostingRegressor50_md10_nols':GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=10, random_state=0),
    'GradientBoostingRegressor100_md10_nols':GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=10, random_state=0),
    'XGB25':XGBRegressor(max_depth = 10, n_estimators=25, random_state=7),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    'XGB60':XGBRegressor(max_depth = 10, n_estimators=60, random_state=7),
    'XGB100':XGBRegressor(max_depth = 10, n_estimators=100, random_state=7),
    'XGB40_md15':XGBRegressor(max_depth = 15, n_estimators=50, random_state=7)
    }

In [92]:
def EntrenarModelos(X, y, models, drop_vars):
    X_train, X_test, y_train, y_test = train_test_split(X.drop(drop_vars, axis = 1), y, test_size=0.3, random_state=7)
    
    y_test_predict = {}
    errores = {}
    
    for name, model in models.items():
        #try:
        model = model.fit(X_train, y_train)
        y_test_predict[name] = model.predict(X_test)
        errores[name] = median_absolute_error(np.exp(y_test)-1, np.exp(y_test_predict[name])-1)
        
        print(name,': ', errores[name], sep = '')

In [65]:
EntrenarModelos(X[['HY_metros_utiles', 'GA_page_views', 'HY_precio', 'PV_cantidad_imagenes',
                  'GA_exit_rate', 'GA_quincena_ini', 'GA_quincena_ult']], y, models, [])

KNeighborsRegressor5: 25.335204324370252
KNeighborsRegressor10: 23.732550395459665
KNeighborsRegressor15: 23.41459256371895
KNeighborsRegressor25: 23.196018310061106
KNeighborsRegressor15dist: 23.045557634410315
DecisionTreeRegressor5: 19.788694879834743
DecisionTreeRegressor10: 20.910586663131248
DecisionTreeRegressor20: 27.96999999999997
RandomForestRegressor20: 19.264756554330653
RandomForestRegressor40: 19.207574952085714
RandomForestRegressor50: 18.883811313414633
RandomForestRegressor100: 18.82939608390021
RandomForestRegressor150: 19.02535867284067
ExtraTreesRegressor10: 20.318600935270645
ExtraTreesRegressor50: 18.99349028251843
ExtraTreesRegressor100: 18.741159572500237
ExtraTreesRegressor150: 18.566636496684666
GradientBoostingRegressor30: 20.16491679131419
GradientBoostingRegressor50: 19.9171002938324
GradientBoostingRegressor100: 19.76170664993758
GradientBoostingRegressor30_md5: 18.83199581621777
GradientBoostingRegressor50_md5: 19.159453373848038
GradientBoostingRegressor

In [57]:
EntrenarModelos(X, y, models, ['PV_idea_pca', 'PV_idea_pca2', 'HY_metros_totales', 'HY_num_terrazas',
       'HY_ascensor', 'HY_trastero', 'HY_precio_anterior', 'PV_ind_elasticidad','PV_precio_anterior', 
       'PV_longitud_distribucion'])

KNeighborsRegressor5: 25.28569953035789
KNeighborsRegressor10: 24.05007265581657
KNeighborsRegressor15: 23.740993570275634
KNeighborsRegressor25: 23.30775115037642
KNeighborsRegressor15dist: 23.46014218169929
DecisionTreeRegressor5: 19.750837066690167
DecisionTreeRegressor10: 26.45000000000001
RandomForestRegressor20: 19.172022758971586
RandomForestRegressor40: 19.239924340953973
RandomForestRegressor50: 18.56607118956455
RandomForestRegressor100: 18.49729716739766
RandomForestRegressor150: 18.568090029320828
ExtraTreesRegressor10: 19.875597288616703
ExtraTreesRegressor50: 19.022971811585457
ExtraTreesRegressor100: 18.943868611957683
ExtraTreesRegressor150: 18.79043867237103
GradientBoostingRegressor30: 20.163402550669566
GradientBoostingRegressor50: 19.85848210390693
GradientBoostingRegressor100: 19.878878692761162
GradientBoostingRegressor30_md5: 19.053975729660394
GradientBoostingRegressor50_md5: 18.93921632472602
GradientBoostingRegressor100_md5: 18.921776791778726
GradientBoosting

In [21]:
EntrenarModelos(X, y, {'XGB1000':XGBRegressor(n_estimators=1000, random_state=7)}
                , ['PV_idea_pca', 'PV_idea_pca2', 'HY_metros_totales', 'HY_num_terrazas',
       'HY_ascensor', 'HY_trastero', 'HY_precio_anterior', 'PV_ind_elasticidad','PV_precio_anterior', 
       'PV_longitud_distribucion'])

XGB1000: 18.776891708374023


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

gbm = xgb.XGBRegressor(objective='survival:cox',
                       booster='gblinear',
                       base_score=1,
                       n_estimators=1000).fit(X_train, np.exp(y_train)-1)

y_pred = gbm.predict(X_test, output_margin=True)

#median_absolute_error(np.exp(y_test)-1, np.exp(y_pred)-1)
median_absolute_error(np.exp(y_test)-1,y_pred)

54.98440202236172

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

gbm = xgb.XGBRegressor(objective='survival:cox',
                       booster='gbtree',
                       max_depth = 10,
                       n_estimators=50).fit(X_train, y_train)

y_pred = gbm.predict(X_test, output_margin=True)

#median_absolute_error(np.exp(y_test)-1, np.exp(y_pred)-1)
median_absolute_error(np.exp(y_test)-1,np.exp(y_pred)-1)

56.76767061233522

In [3]:
def FitEnsembles(X,y,base_models,ensemble_models):
    '''
    X,y --> nuestra data
    base_models --> Los modelos de los que se nutrirá nuestro ensemble
    ensemble_models --> Modelos que son la capa final del ensemble
    '''
    
    X_train_ensemble, X_test_ensemble, y_train_ensemble, y_test_ensemble = train_test_split(X, y, test_size=0.4, random_state=7)
    X_train, X_test, y_train, y_test = train_test_split(X_train_ensemble, y_train_ensemble, test_size=0.3, random_state=7)

    # Dataframe donde almacenaremos los resultados de nuestros modelos
    pred_ensemble_models = pd.DataFrame(data = np.zeros((X_test.shape[0],len(base_models))), 
                                        columns = list(base_models.keys()))
    # Dataframe donde almacenaremos los resultados de nuestros modelos
    pred_ensemble_models_test = pd.DataFrame(data = np.zeros((X_test_ensemble.shape[0],len(base_models))), 
                                    columns = list(base_models.keys()))
    
    # Procedemos a entrenar los base_models
    for name, model in base_models.items():
        base_models[name] = model.fit(X_train, y_train)
        y_test_predict = base_models[name].predict(X_test)
        # Almacenamos la predicción de cada modelo
        pred_ensemble_models[name] = y_test_predict
        pred_ensemble_models_test[name] = base_models[name].predict(X_test_ensemble)
        
        print(name,': ', median_absolute_error(np.exp(y_test_ensemble)-1, np.exp(pred_ensemble_models_test[name])-1), sep = '')
    
    print('-'*25+' Ensemble Time '+'-'*25)
    # Predict con el ensemble
    for name, model in ensemble_models.items():
        ensemble_models[name] = ensemble_models[name].fit(pred_ensemble_models, y_test)
        
        y_final_pred = ensemble_models[name].predict(pred_ensemble_models_test)
        print(name,': ', median_absolute_error(np.exp(y_test_ensemble)-1, np.exp(y_final_pred)-1), sep = '')
        

In [63]:
models2 = {
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'DecisionTreeRegressor10':tree.DecisionTreeRegressor(max_depth = 10),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=5, n_estimators = 20, random_state=0),
    'RandomForestRegressor100':RandomForestRegressor(max_depth=10, n_estimators = 100, random_state=0),
    'ExtraTreesRegressor50':ExtraTreesRegressor(n_estimators=50,random_state=0),
    'GradientBoostingRegressor30_md5':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0, loss='ls'),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    'XGB60':XGBRegressor(max_depth = 10, n_estimators=60, random_state=7)
    }
models3 = {
    'DecisionTreeRegressor2':tree.DecisionTreeRegressor(max_depth = 2),
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'RandomForestRegressor10_md2':RandomForestRegressor(max_depth=2, n_estimators = 10, random_state=0),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=5, n_estimators = 20, random_state=0),
    'RandomForestRegressor100':RandomForestRegressor(max_depth=10, n_estimators = 100, random_state=0),
    'ExtraTreesRegressor10':ExtraTreesRegressor(n_estimators=10,random_state=0),
    'ExtraTreesRegressor50':ExtraTreesRegressor(n_estimators=50,random_state=0),
    'GradientBoostingRegressor30_md5':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    'XGB60':XGBRegressor(max_depth = 10, n_estimators=60, random_state=7)
    }

FitEnsembles(X[['HY_metros_utiles', 'GA_page_views', 'HY_precio', 'PV_cantidad_imagenes',
                  'GA_exit_rate', 'GA_quincena_ini', 'GA_quincena_ult']], y,
             base_models = models2, ensemble_models = models3)

DecisionTreeRegressor5: 21.063599910030348
DecisionTreeRegressor10: 22.01110696831278
RandomForestRegressor20: 19.372475940478832
RandomForestRegressor100: 19.26256822222018
ExtraTreesRegressor50: 19.92827828170691
GradientBoostingRegressor30_md5: 19.946835169495973
XGB46: 19.309225502014158
XGB60: 19.94895050048829
------------------------- Ensemble Time -------------------------
KNeighborsRegressor5: 21.082019094108173
KNeighborsRegressor10: 20.09800783504063
KNeighborsRegressor15: 19.68056200099302
KNeighborsRegressor25: 18.992289615862127
KNeighborsRegressor15dist: 19.8575438176082
DecisionTreeRegressor5: 19.75
DecisionTreeRegressor10: 22.26581188110176
DecisionTreeRegressor20: 26.35
RandomForestRegressor20: 19.061197238625546
RandomForestRegressor40: 18.863888858686458
RandomForestRegressor50: 19.65385773634836
RandomForestRegressor100: 19.6358077222474
RandomForestRegressor150: 19.670201942488664
ExtraTreesRegressor10: 21.235944741935334
ExtraTreesRegressor50: 20.549132922981983


# Clustering 

In [62]:
from sklearn.cluster import KMeans

for i in [2,4,6,10]:
    print('-'*25+' {} '.format(i)+'-'*25)
    kmeans = KMeans(n_clusters=i, random_state=0).fit(X)
    X['PV_kmeans'] = kmeans.labels_
    
    EntrenarModelos(X, y, models, ['PV_idea_pca', 'PV_idea_pca2', 'HY_metros_totales', 'HY_num_terrazas',
           'HY_ascensor', 'HY_trastero', 'HY_precio_anterior', 'PV_ind_elasticidad','PV_precio_anterior', 
           'PV_longitud_distribucion'])

------------------------- 2 -------------------------
KNeighborsRegressor5: 25.28569953035789
KNeighborsRegressor10: 24.05007265581657
KNeighborsRegressor15: 23.740993570275634
KNeighborsRegressor25: 23.30775115037642
KNeighborsRegressor15dist: 23.46014218169929
DecisionTreeRegressor5: 19.750837066689805
DecisionTreeRegressor10: 20.693648336791966
DecisionTreeRegressor20: 26.760000000000034
RandomForestRegressor20: 19.172022758971586
RandomForestRegressor40: 19.23992434095395
RandomForestRegressor50: 18.521233012652772
RandomForestRegressor100: 18.620476591228808
RandomForestRegressor150: 18.620877540710538
ExtraTreesRegressor10: 19.338455769706428
ExtraTreesRegressor50: 18.516684899104504
ExtraTreesRegressor100: 18.772514044261307
ExtraTreesRegressor150: 18.748164452845483
GradientBoostingRegressor30: 20.163402550669566
GradientBoostingRegressor50: 19.858482103906873
GradientBoostingRegressor100: 19.80370243003101
GradientBoostingRegressor30_md5: 19.122257633282615
GradientBoostingReg

In [69]:
EntrenarModelos(X[np.exp(y)-1<=200][['HY_metros_utiles', 'GA_page_views', 'HY_precio', 'PV_cantidad_imagenes',
                  'GA_exit_rate', 'GA_quincena_ini', 'GA_quincena_ult']], y[np.exp(y)-1<=200], models, [])

KNeighborsRegressor5: 22.847764874669686
KNeighborsRegressor10: 22.065836022935432
KNeighborsRegressor15: 21.463244723813766
KNeighborsRegressor25: 21.73459097677236
KNeighborsRegressor15dist: 21.505503135942174
DecisionTreeRegressor5: 18.22678446080793
DecisionTreeRegressor10: 19.10487425310106
DecisionTreeRegressor20: 24.4733629623258
RandomForestRegressor20: 17.516599038626925
RandomForestRegressor40: 17.652760252882466
RandomForestRegressor50: 17.077362028923147
RandomForestRegressor100: 17.234448777305328
RandomForestRegressor150: 17.392189004063116
ExtraTreesRegressor10: 18.854979688146578
ExtraTreesRegressor50: 18.01307830581301
ExtraTreesRegressor100: 17.74103535233655
ExtraTreesRegressor150: 17.70387266247318
GradientBoostingRegressor30: 18.38401639450595
GradientBoostingRegressor50: 18.231492082094
GradientBoostingRegressor100: 18.02409065967376
GradientBoostingRegressor30_md5: 17.539776816582894
GradientBoostingRegressor50_md5: 17.446090625053937
GradientBoostingRegressor100

In [93]:
EntrenarModelos(X[np.exp(y)-1<=172][['HY_metros_utiles', 'GA_page_views', 'HY_precio', 'PV_cantidad_imagenes',
                  'GA_exit_rate', 'GA_quincena_ini', 'GA_quincena_ult']], y[np.exp(y)-1<=172], models, [])

KNeighborsRegressor5: 22.039799274538808
KNeighborsRegressor10: 21.17396790183258
KNeighborsRegressor15: 20.838028574996496
KNeighborsRegressor25: 20.75587204792587
KNeighborsRegressor15dist: 20.94536802283134
DecisionTreeRegressor5: 17.470494662045944
DecisionTreeRegressor10: 18.428009978198702
DecisionTreeRegressor20: 23.770000000000017
RandomForestRegressor20: 16.918607247292726
RandomForestRegressor40: 16.82919679618275
RandomForestRegressor50: 17.132772839492958
RandomForestRegressor100: 16.862261122426347
RandomForestRegressor150: 16.76267983805804
ExtraTreesRegressor10: 17.7792824013638
ExtraTreesRegressor50: 17.14800194934731
ExtraTreesRegressor100: 17.083850245078555
ExtraTreesRegressor150: 17.078946775482137
GradientBoostingRegressor30: 17.685462614744477
GradientBoostingRegressor50: 17.385016901052296
GradientBoostingRegressor100: 17.153620421072695
GradientBoostingRegressor30_md5: 16.893944057520628
GradientBoostingRegressor50_md5: 16.977104910944654
GradientBoostingRegress

In [94]:
X[np.exp(y)-1<=172].shape

(7827, 28)

In [75]:
EntrenarModelos(X[np.exp(y)-1<=200], y[np.exp(y)-1<=200], models, [])

KNeighborsRegressor5: 23.03302207778817
KNeighborsRegressor10: 21.251792561086393
KNeighborsRegressor15: 21.454677310810908
KNeighborsRegressor25: 21.26452965636928
KNeighborsRegressor15dist: 21.22251775352023
DecisionTreeRegressor5: 18.054235179784484
DecisionTreeRegressor10: 19.335189607170218
DecisionTreeRegressor20: 24.503505724148674
RandomForestRegressor20: 17.458912367143473
RandomForestRegressor40: 17.65314276772361
RandomForestRegressor50: 17.174167377108624
RandomForestRegressor100: 17.325746699523872
RandomForestRegressor150: 17.085890829080263
ExtraTreesRegressor10: 17.703134592902106
ExtraTreesRegressor50: 17.483516469479163
ExtraTreesRegressor100: 17.19423585567175
ExtraTreesRegressor150: 17.01482687807235
GradientBoostingRegressor30: 18.32805468901148
GradientBoostingRegressor50: 17.99676647307614
GradientBoostingRegressor100: 17.933807072325536
GradientBoostingRegressor30_md5: 17.671536501090493
GradientBoostingRegressor50_md5: 17.524663782441607
GradientBoostingRegress

# Algoritmo forward

In [74]:
def Entrenar(X,y,model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)
    
    model = model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    error = median_absolute_error(np.exp(y_test)-1, np.exp(y_pred)-1)
    
    return error

def EntrenarForward(X, y, model, ini_vars):
    '''
    X,y --> Nuestra data
    model --> un modelo
    ini_vars --> variables con las que comenzamos
    '''
    # Variable que indica si hemos terminado
    fin = False
    
    # Variables con las que estamos trabajando
    current_vars = ini_vars
    all_vars = X.columns
    possible_vars = np.setdiff1d(all_vars, current_vars)
    
    while not fin and len(possible_vars) > 0: # Lo que antes pase
        possible_vars = np.setdiff1d(all_vars, current_vars)
        
        if len(current_vars) == 0:
            # Si no tenemos variables, cuestro error es inf
            best_error = np.inf
        else:
            base_error = Entrenar(X[current_vars], y, model)
            best_error = base_error
        
        best_var = ''
        for var in possible_vars:
            var_error = Entrenar(X[current_vars+[var]],y,model)
            
            if var_error < best_error:
                best_error = var_error
                best_var = var
        
        print('Best var: {} --> {:.4f}'.format(best_var, best_error))
        # Si tenemos una best_var        
        if len(best_var) > 0:
            current_vars += [best_var]
        else: 
            fin = True
            
    print('Best vars:', current_vars)
    print('Best error:', best_error)
    
    return best_error

In [5]:
EntrenarForward(X, y, XGBRegressor(max_depth = 10, n_estimators=46, random_state=7), ['HY_precio'])

Best var: GA_exit_rate --> 20.4554
Best var: PV_pca2 --> 19.5107
Best var: PV_longitud_descripcion --> 19.0807
Best var: GA_page_views --> 18.5486
Best var: PV_cantidad_imagenes --> 18.2466
Best var: GA_quincena_ini --> 17.9110
Best var: PV_idea_pca --> 17.6864
Best var:  --> 17.6864
Best vars: ['HY_precio', 'GA_exit_rate', 'PV_pca2', 'PV_longitud_descripcion', 'GA_page_views', 'PV_cantidad_imagenes', 'GA_quincena_ini', 'PV_idea_pca']
Best error: 17.68644958496104


17.68644958496104

In [8]:
EntrenarForward(X, y, XGBRegressor(max_depth = 10, 
    n_estimators = 46, 
    reg_lambda = 0.8,
    learning_rate = 0.1,
    subsample = 0.5,
    colsample_bytree = 0.6,
    objective = 'reg:linear',
    random_state = 7),
                ['HY_precio'])

Best var: GA_exit_rate --> 19.9742
Best var: PV_pca1 --> 19.2516
Best var: GA_page_views --> 19.0631
Best var: GA_quincena_ini --> 18.7087
Best var: PV_cantidad_imagenes --> 18.6170
Best var: HY_metros_totales --> 18.2472
Best var:  --> 18.2472
Best vars: ['HY_precio', 'GA_exit_rate', 'PV_pca1', 'GA_page_views', 'GA_quincena_ini', 'PV_cantidad_imagenes', 'HY_metros_totales']
Best error: 18.24720764160149


18.24720764160149

In [77]:
EntrenarForward(X, y, XGBRegressor(max_depth = 10, 
    n_estimators = 46, 
    reg_lambda = 0.8,
    learning_rate = 0.1,
    subsample = 0.5,
    colsample_bytree = 0.6,
    objective = 'reg:linear',
    random_state = 7),
                ['HY_precio'])

Best var: GA_exit_rate --> 19.9742
Best var: PV_pca1 --> 19.2516
Best var: GA_page_views --> 19.0631
Best var: GA_quincena_ini --> 18.7087
Best var: PV_cantidad_imagenes --> 18.6170
Best var: HY_metros_totales --> 18.2472
Best var:  --> 18.2472
Best vars: ['HY_precio', 'GA_exit_rate', 'PV_pca1', 'GA_page_views', 'GA_quincena_ini', 'PV_cantidad_imagenes', 'HY_metros_totales']
Best error: 18.24720764160149


18.24720764160149

In [80]:
EntrenarForward(X_norm, y, XGBRegressor(max_depth = 10, 
    n_estimators = 46, 
    reg_lambda = 0.8,
    learning_rate = 0.1,
    subsample = 0.5,
    colsample_bytree = 0.6,
    objective = 'reg:linear',
    random_state = 7),
                [])

Best var: GA_exit_rate --> 19.4558
Best var: PV_pca2 --> 18.9989
Best var: PV_precio_anterior --> 18.9127
Best var: HY_num_terrazas --> 18.6914
Best var:  --> 18.6914
Best vars: ['GA_exit_rate', 'PV_pca2', 'PV_precio_anterior', 'HY_num_terrazas']
Best error: 18.691410522460913


18.691410522460913

In [81]:
EntrenarForward(X, y, XGBRegressor(max_d = 9,
                                        min_child = 7,
                                        subs = 1.0,
                                        col_sample = 0.5,
                                        n_est = 80,
                                        learn_rate = 0.05 ,
                                        rg_alpha = 0.4,
                                        rg_lambda = 0.5,
                                        random_state = 7),
                [])

Best var: GA_exit_rate --> 19.8759
Best var: PV_pca2 --> 19.0018
Best var: PV_cantidad_imagenes --> 18.6689
Best var: HY_ascensor --> 18.5837
Best var:  --> 18.5837
Best vars: ['GA_exit_rate', 'PV_pca2', 'PV_cantidad_imagenes', 'HY_ascensor']
Best error: 18.583722534179728


18.583722534179728

In [86]:
EntrenarForward(X_norm, y, XGBRegressor(max_d = 9,
                                        min_child = 7,
                                        subs = 1.0,
                                        col_sample = 0.5,
                                        n_est = 80,
                                        learn_rate = 0.05 ,
                                        rg_alpha = 0.4,
                                        rg_lambda = 0.5,
                                        random_state = 7),
                [])

Best var: GA_exit_rate --> 19.8759
Best var: PV_pca2 --> 19.0018
Best var: PV_cantidad_imagenes --> 18.6689
Best var: HY_ascensor --> 18.5837
Best var:  --> 18.5837
Best vars: ['GA_exit_rate', 'PV_pca2', 'PV_cantidad_imagenes', 'HY_ascensor']
Best error: 18.583722534179728


18.583722534179728

In [93]:
models = {
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'DecisionTreeRegressor10':tree.DecisionTreeRegressor(max_depth = 10),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=5, n_estimators = 20, random_state=0),
    'RandomForestRegressor40':RandomForestRegressor(max_depth=3, n_estimators = 40, random_state=0),
    'RandomForestRegressor50':RandomForestRegressor(max_depth=10, n_estimators = 50, random_state=0),
    'RandomForestRegressor100':RandomForestRegressor(max_depth=10, n_estimators = 100, random_state=0),
    'RandomForestRegressor150':RandomForestRegressor(max_depth=10, n_estimators = 150, random_state=0),
    'ExtraTreesRegressor10':ExtraTreesRegressor(n_estimators=10,random_state=0),
    'ExtraTreesRegressor50':ExtraTreesRegressor(n_estimators=50,random_state=0),
    'ExtraTreesRegressor100':ExtraTreesRegressor(n_estimators=100, random_state=0),
    'ExtraTreesRegressor150':ExtraTreesRegressor(n_estimators=150, random_state=0),
    'GradientBoostingRegressor30_md5':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0, loss='ls'),
    'GradientBoostingRegressor50_md5':GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=5, random_state=0, loss='ls'),
    'GradientBoostingRegressor100_md5':GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=0, loss='ls'),
    'GradientBoostingRegressor30_md10':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=10, random_state=0, loss='ls'),
    'GradientBoostingRegressor50_md10':GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=10, random_state=0, loss='ls'),
    'GradientBoostingRegressor100_md10':GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=10, random_state=0, loss='ls'),
    'XGB25':XGBRegressor(max_depth = 10, n_estimators=25, random_state=7),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    'XGB60':XGBRegressor(max_depth = 10, n_estimators=60, random_state=7),
    'XGB100':XGBRegressor(max_depth = 10, n_estimators=100, random_state=7),
    'XGB40_md15':XGBRegressor(max_depth = 15, n_estimators=50, random_state=7)
    }

In [94]:
errores = []
for name, model in models.items():
    #try:
    print('*'*25+' {} '.format(name)+'*'*25)
    error = EntrenarForward(X, y, model, ['HY_precio'])
    errores.append(error)

list1, list2 = zip(*sorted(zip(errores, list(models.keys()))))
print('Best model:', list2[0],'--->', list1[0])

************************* DecisionTreeRegressor5 *************************
Best var: GA_page_views --> 20.3559
Best var: GA_exit_rate --> 19.0029
Best var: IDEA_pc_1990_99 --> 18.9813
Best var: HY_trastero --> 18.9791
Best var: GA_quincena_ult --> 18.9457
Best var: PV_pca2 --> 18.8873
Best var: GA_mean_bounce --> 18.8873
Best var: HY_ascensor --> 18.8873
Best var:  --> 18.8873
Best vars: ['HY_precio', 'GA_page_views', 'GA_exit_rate', 'IDEA_pc_1990_99', 'HY_trastero', 'GA_quincena_ult', 'PV_pca2', 'GA_mean_bounce', 'HY_ascensor']
Best error: 18.887303633375527
************************* DecisionTreeRegressor10 *************************
Best var: GA_exit_rate --> 21.2007
Best var: PV_longitud_distribucion --> 20.7848
Best var: HY_num_terrazas --> 20.7848
Best var:  --> 20.7771
Best vars: ['HY_precio', 'GA_exit_rate', 'PV_longitud_distribucion', 'HY_num_terrazas']
Best error: 20.777084847169178
************************* RandomForestRegressor20 *************************
Best var: GA_page_vi

Best var: GA_page_views --> 19.1198
Best var: HY_metros_utiles --> 18.4707
Best var: PV_ind_elasticidad --> 18.3096
Best var: PV_pca3 --> 18.2652
Best var: IDEA_pc_1980_89 --> 18.0938
Best var:  --> 18.0938
Best vars: ['HY_precio', 'GA_page_views', 'HY_metros_utiles', 'PV_ind_elasticidad', 'PV_pca3', 'IDEA_pc_1980_89']
Best error: 18.093776626586916
************************* XGB46 *************************
Best var: GA_exit_rate --> 20.4554
Best var: PV_pca2 --> 19.5107
Best var: PV_longitud_descripcion --> 19.0807
Best var: GA_page_views --> 18.5486
Best var: PV_cantidad_imagenes --> 18.2466
Best var: GA_quincena_ini --> 17.9110
Best var: PV_idea_pca --> 17.6864
Best var:  --> 17.6864
Best vars: ['HY_precio', 'GA_exit_rate', 'PV_pca2', 'PV_longitud_descripcion', 'GA_page_views', 'PV_cantidad_imagenes', 'GA_quincena_ini', 'PV_idea_pca']
Best error: 17.68644958496104
************************* XGB60 *************************
Best var: GA_exit_rate --> 20.6233
Best var: PV_pca3 --> 19.743

In [96]:
list1, list2 = zip(*sorted(zip(errores, list(models.keys()))))
print('Best model:', list2[0],'--->', list1[0])

Best model: XGB46 ---> 17.68644958496104


In [103]:
models2 = {}

for i in range(10,155,20):
    models2['KNeighborsRegressor{}'.format(i)] = neighbors.KNeighborsRegressor(i, weights='distance')
    
errores2 = []
for name, model in models2.items():
    #try:
    print('*'*25+' {} '.format(name)+'*'*25)
    error = EntrenarForward(X, y, model, ['HY_precio'])
    errores2.append(error)
    
list1, list2 = zip(*sorted(zip(errores2, list(models2.keys()))))
print('Best model:', list2[0],'--->', list1[0])

************************* KNeighborsRegressor10 *************************
Best var: GA_page_views --> 24.0929
Best var: GA_quincena_ini --> 23.1816
Best var: PV_ind_elasticidad --> 23.0608
Best var: PV_kmeans --> 23.0588
Best var: IDEA_pc_2000_10 --> 23.0275
Best var: IDEA_pc_1970_79 --> 23.0274
Best var: IDEA_pc_1990_99 --> 23.0274
Best var:  --> 23.0274
Best vars: ['HY_precio', 'GA_page_views', 'GA_quincena_ini', 'PV_ind_elasticidad', 'PV_kmeans', 'IDEA_pc_2000_10', 'IDEA_pc_1970_79', 'IDEA_pc_1990_99']
Best error: 23.02743565931774
************************* KNeighborsRegressor30 *************************
Best var: PV_idea_pca --> 22.9001
Best var: HY_precio_anterior --> 22.4481
Best var: GA_exit_rate --> 22.2789
Best var: PV_cantidad_imagenes --> 22.2467
Best var: GA_quincena_ini --> 22.2352
Best var: PV_longitud_descripcion --> 22.2195
Best var: PV_pca2 --> 22.2195
Best var: PV_precio_anterior --> 22.2195
Best var: HY_trastero --> 22.2195
Best var:  --> 22.2195
Best vars: ['HY_prec

# Neural network

In [15]:
from sklearn.neural_network import MLPRegressor

X_norm = X.copy()
X_norm['PV_precio_anterior'] = X_norm['PV_precio_anterior'].apply(np.sum)

for var in X_norm.columns:
    X_norm[var] = (X_norm[var] - X_norm[var].min())/(X_norm[var].max() - X_norm[var].min())

X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.3, random_state=7)


mlp = MLPRegressor(hidden_layer_sizes = (X_norm.shape[1], 20, 20), max_iter = 1250, random_state=0)

mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

median_absolute_error(np.exp(y_test)-1, np.exp(y_pred)-1)

21.08579306284118

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_norm.drop(['PV_idea_pca'], axis = 1), y, test_size=0.3, random_state=7)


mlp = MLPRegressor(hidden_layer_sizes = (X_norm.shape[1],20, 20), max_iter = 1250, random_state=0)

mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

median_absolute_error(np.exp(y_test)-1, np.exp(y_pred)-1)

20.684910053600355

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X_norm.drop(['PV_idea_pca'], axis = 1), y, test_size=0.3, random_state=7)


mlp = MLPRegressor(hidden_layer_sizes = (20, 20), max_iter = 10000, random_state=0)

mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

median_absolute_error(np.exp(y_test)-1, np.exp(y_pred)-1)

19.61167966626285

In [107]:
models2 = {
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'DecisionTreeRegressor10':tree.DecisionTreeRegressor(max_depth = 10),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=5, n_estimators = 20, random_state=0),
    'RandomForestRegressor100':RandomForestRegressor(max_depth=10, n_estimators = 100, random_state=0),
    'ExtraTreesRegressor50':ExtraTreesRegressor(n_estimators=50,random_state=0),
    'GradientBoostingRegressor30_md5':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0, loss='ls'),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    'XGB60':XGBRegressor(max_depth = 10, n_estimators=60, random_state=7),
    'nn250': MLPRegressor(hidden_layer_sizes = (X_norm.shape[1], 30, 20), max_iter = 250),
    'nn1250': MLPRegressor(hidden_layer_sizes = (X_norm.shape[1], 30, 20), max_iter = 1250),
    }
models3 = {
    'DecisionTreeRegressor2':tree.DecisionTreeRegressor(max_depth = 2),
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'RandomForestRegressor10_md2':RandomForestRegressor(max_depth=2, n_estimators = 10, random_state=0),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=5, n_estimators = 20, random_state=0),
    'RandomForestRegressor100':RandomForestRegressor(max_depth=10, n_estimators = 100, random_state=0),
    'ExtraTreesRegressor10':ExtraTreesRegressor(n_estimators=10,random_state=0),
    'ExtraTreesRegressor50':ExtraTreesRegressor(n_estimators=50,random_state=0),
    'GradientBoostingRegressor30_md5':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    'XGB60':XGBRegressor(max_depth = 10, n_estimators=60, random_state=7),
    'nn20': MLPRegressor(hidden_layer_sizes = (20, 20), max_iter = 250, random_state=0),
    'nn10': MLPRegressor(hidden_layer_sizes = (10, 10), max_iter = 250, random_state=0),
    'nn20_10': MLPRegressor(hidden_layer_sizes = (20, 10), max_iter = 250, random_state=0),
    }

FitEnsembles(X_norm[['HY_precio', 'GA_exit_rate', 'PV_pca2', 'PV_longitud_descripcion', 
                'GA_page_views', 'PV_cantidad_imagenes', 'GA_quincena_ini', 'PV_idea_pca']], y,
             base_models = models2, ensemble_models = models3)

DecisionTreeRegressor5: 20.7408125732859
DecisionTreeRegressor10: 22.76637136023087
RandomForestRegressor20: 19.37295818019001
RandomForestRegressor100: 19.274800775039246
ExtraTreesRegressor50: 19.70540462712999
GradientBoostingRegressor30_md5: 19.329757443666548
XGB46: 19.378790817260736
XGB60: 19.70291030883788




nn250: 22.16388414528361
nn1250: 19.79379090364247
------------------------- Ensemble Time -------------------------
DecisionTreeRegressor2: 19.82707506402739
DecisionTreeRegressor5: 20.369154781279487
RandomForestRegressor10_md2: 18.853723324359773
RandomForestRegressor20: 18.951559826891277
RandomForestRegressor100: 19.251920933183342
ExtraTreesRegressor10: 20.576541832096698
ExtraTreesRegressor50: 19.560026681357165
GradientBoostingRegressor30_md5: 19.134232552717698
XGB46: 19.853542976379394
XGB60: 20.35949661254888
nn20: 19.14771009093324




nn10: 18.811937715852117
nn20_10: 19.167607236994737


In [108]:
models2 = {
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'DecisionTreeRegressor10':tree.DecisionTreeRegressor(max_depth = 10),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=5, n_estimators = 20, random_state=0),
    'RandomForestRegressor100':RandomForestRegressor(max_depth=10, n_estimators = 100, random_state=0),
    'ExtraTreesRegressor50':ExtraTreesRegressor(n_estimators=50,random_state=0),
    'GradientBoostingRegressor30_md5':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0, loss='ls'),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    'XGB60':XGBRegressor(max_depth = 10, n_estimators=60, random_state=7),
    'nn250': MLPRegressor(hidden_layer_sizes = (X_norm.shape[1], 30, 20), max_iter = 250),
    'nn1250': MLPRegressor(hidden_layer_sizes = (X_norm.shape[1], 30, 20), max_iter = 1250),
    }
models3 = {
    'DecisionTreeRegressor2':tree.DecisionTreeRegressor(max_depth = 2),
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'RandomForestRegressor10_md2':RandomForestRegressor(max_depth=2, n_estimators = 10, random_state=0),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=5, n_estimators = 20, random_state=0),
    'RandomForestRegressor100':RandomForestRegressor(max_depth=10, n_estimators = 100, random_state=0),
    'ExtraTreesRegressor10':ExtraTreesRegressor(n_estimators=10,random_state=0),
    'ExtraTreesRegressor50':ExtraTreesRegressor(n_estimators=50,random_state=0),
    'GradientBoostingRegressor30_md5':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    'XGB60':XGBRegressor(max_depth = 10, n_estimators=60, random_state=7),
    'nn20': MLPRegressor(hidden_layer_sizes = (20, 20), max_iter = 250),
    'nn10': MLPRegressor(hidden_layer_sizes = (10, 10), max_iter = 250),
    'nn20_10': MLPRegressor(hidden_layer_sizes = (20, 10), max_iter = 250),
    }

FitEnsembles(X[['HY_precio', 'GA_exit_rate', 'PV_pca2', 'PV_longitud_descripcion', 
                'GA_page_views', 'PV_cantidad_imagenes', 'GA_quincena_ini', 'PV_idea_pca']], y,
             base_models = models2, ensemble_models = models3)

DecisionTreeRegressor5: 20.74081257328566
DecisionTreeRegressor10: 22.84212493082875
RandomForestRegressor20: 19.37295818019001
RandomForestRegressor100: 19.27734319222118
ExtraTreesRegressor50: 19.70540462712999
GradientBoostingRegressor30_md5: 19.329757443666548
XGB46: 19.44871681213377
XGB60: 19.71019577026368




nn250: 115.86614716345832
nn1250: 86.15780881508857
------------------------- Ensemble Time -------------------------
DecisionTreeRegressor2: 19.827075064027362
DecisionTreeRegressor5: 20.457182287274385
RandomForestRegressor10_md2: 18.9262305377024
RandomForestRegressor20: 19.070659166105422
RandomForestRegressor100: 19.6142712108636
ExtraTreesRegressor10: 21.47731879345462
ExtraTreesRegressor50: 20.305930263762324
GradientBoostingRegressor30_md5: 19.317487501614092
XGB46: 20.076716766357407
XGB60: 20.618359375000015
nn20: 20.867092899278482




nn10: 20.044461177632613
nn20_10: 19.69320788334054


In [122]:
models2 = {
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=10, n_estimators = 20, random_state=0),
    'ExtraTreesRegressor50':ExtraTreesRegressor(n_estimators=50,random_state=0),
    'GradientBoostingRegressor30_md5':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0, loss='ls'),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    'nn250': MLPRegressor(hidden_layer_sizes = (30, 20), max_iter = 500),
    }
models3 = {
    'DecisionTreeRegressor2':tree.DecisionTreeRegressor(max_depth = 2),
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'RandomForestRegressor10_md2':RandomForestRegressor(max_depth=2, n_estimators = 10, random_state=0),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=5, n_estimators = 20, random_state=0),
    'RandomForestRegressor100':RandomForestRegressor(max_depth=10, n_estimators = 100, random_state=0),
    'ExtraTreesRegressor10':ExtraTreesRegressor(n_estimators=10,random_state=0),
    'ExtraTreesRegressor50':ExtraTreesRegressor(n_estimators=50,random_state=0),
    'GradientBoostingRegressor30_md5':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0),
    'XGB15':XGBRegressor(max_depth = 10, n_estimators=15, random_state=7),
    'XGB30':XGBRegressor(max_depth = 10, n_estimators=30, random_state=7),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    'XGB60':XGBRegressor(max_depth = 10, n_estimators=60, random_state=7),
    'nn20': MLPRegressor(hidden_layer_sizes = (20, 20), max_iter = 250),
    'nn10': MLPRegressor(hidden_layer_sizes = (10, 10), max_iter = 250),
    'nn20_10': MLPRegressor(hidden_layer_sizes = (20, 10), max_iter = 250),
    }

FitEnsembles(X[['HY_precio', 'GA_exit_rate', 'PV_pca2', 'PV_longitud_descripcion', 
                'GA_page_views', 'PV_cantidad_imagenes', 'GA_quincena_ini', 'PV_idea_pca']], y,
             base_models = models2, ensemble_models = models3)

DecisionTreeRegressor5: 20.7408125732861
RandomForestRegressor20: 19.55495022967462
ExtraTreesRegressor50: 19.70540462712999
GradientBoostingRegressor30_md5: 19.329757443666548
XGB46: 19.44871681213377
nn250: 67.7249999925248
------------------------- Ensemble Time -------------------------
DecisionTreeRegressor2: 19.97334996579884
DecisionTreeRegressor5: 19.752758681964856
RandomForestRegressor10_md2: 18.988389886828408
RandomForestRegressor20: 19.09491260528273
RandomForestRegressor100: 19.05518337297692
ExtraTreesRegressor10: 21.34307364141214
ExtraTreesRegressor50: 20.484818874927065
GradientBoostingRegressor30_md5: 19.37748004285087
XGB15: 27.738711643218988
XGB30: 18.768319168090816
XGB46: 19.82226776123052
XGB60: 20.386732978820817
nn20: 19.23867134466152




nn10: 19.50427516476138
nn20_10: 18.950444181205626


# Mezclar Forward y Ensemble

In [20]:
def FitEnsembles(X,y,base_models,ensemble_models):
    '''
    X,y --> nuestra data
    base_models --> Los modelos de los que se nutrirá nuestro ensemble
    ensemble_models --> Modelos que son la capa final del ensemble
    '''
    
    X_train_ensemble, X_test_ensemble, y_train_ensemble, y_test_ensemble = train_test_split(X, y, test_size=0.4, random_state=7)
    X_train, X_test, y_train, y_test = train_test_split(X_train_ensemble, y_train_ensemble, test_size=0.3, random_state=7)

    # Dataframe donde almacenaremos los resultados de nuestros modelos
    pred_ensemble_models = pd.DataFrame(data = np.zeros((X_test.shape[0],len(base_models))), 
                                        columns = list(base_models.keys()))
    # Dataframe donde almacenaremos los resultados de nuestros modelos
    pred_ensemble_models_test = pd.DataFrame(data = np.zeros((X_test_ensemble.shape[0],len(base_models))), 
                                    columns = list(base_models.keys()))
    
    # Procedemos a entrenar los base_models
    for name, model in base_models.items():
        base_models[name] = model.fit(X_train, y_train)
        y_test_predict = base_models[name].predict(X_test)
        # Almacenamos la predicción de cada modelo
        pred_ensemble_models[name] = y_test_predict
        pred_ensemble_models_test[name] = base_models[name].predict(X_test_ensemble)
        
        print(name,': ', median_absolute_error(np.exp(y_test_ensemble)-1, np.exp(pred_ensemble_models_test[name])-1), sep = '')
    
    print('-'*25+' Ensemble Time '+'-'*25)
    # Predict con el ensemble
    for name, model in ensemble_models.items():
        ensemble_models[name] = ensemble_models[name].fit(pred_ensemble_models, y_test)
        
        y_final_pred = ensemble_models[name].predict(pred_ensemble_models_test)
        print(name,': ', median_absolute_error(np.exp(y_test_ensemble)-1, np.exp(y_final_pred)-1), sep = '')
    
    # Devolvemos los df con los que entrenamos el ensemble 
    return pred_ensemble_models, pred_ensemble_models_test, y_test, y_test_ensemble


models2 = {
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=10, n_estimators = 20, random_state=0),
    'ExtraTreesRegressor50':ExtraTreesRegressor(n_estimators=50,random_state=0),
    'GradientBoostingRegressor30_md5':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0, loss='ls'),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    'nn250': MLPRegressor(hidden_layer_sizes = (30, 20), max_iter = 500),
    }
models3 = {
    'DecisionTreeRegressor2':tree.DecisionTreeRegressor(max_depth = 2),
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'RandomForestRegressor10_md2':RandomForestRegressor(max_depth=2, n_estimators = 10, random_state=0),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=5, n_estimators = 20, random_state=0),
    'XGB15':XGBRegressor(max_depth = 10, n_estimators=15, random_state=7),
    'XGB30':XGBRegressor(max_depth = 10, n_estimators=30, random_state=7),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    }

X_train_ensemble, X_test_ensemble, y_train_ensemble, y_test_ensemble = FitEnsembles(X.drop(['PV_idea_pca'], axis = 1),y,models2,models3)
errores = []
for name, model in models3.items():
    #try:
    print('*'*25+' {} '.format(name)+'*'*25)
    error = EntrenarForward(X_train_ensemble, y_train_ensemble, model, ['XGB46'])
    errores.append(error)

DecisionTreeRegressor5: 21.25391903041519
RandomForestRegressor20: 19.75468022879491
ExtraTreesRegressor50: 19.98785532323138
GradientBoostingRegressor30_md5: 19.687520697512582
XGB46: 19.215244140625003
nn250: 149.98646619881055
------------------------- Ensemble Time -------------------------
DecisionTreeRegressor2: 19.948260376261025
DecisionTreeRegressor5: 19.824308244576898
RandomForestRegressor10_md2: 19.099547831268765
RandomForestRegressor20: 18.99717835593423
XGB15: 27.9524559020996
XGB30: 19.206618118286137
XGB46: 20.099474716186496
************************* DecisionTreeRegressor2 *************************
Best var: GradientBoostingRegressor30_md5 --> 19.6769
Best var: DecisionTreeRegressor5 --> 19.6769
Best var:  --> 19.6769
Best vars: ['XGB46', 'GradientBoostingRegressor30_md5', 'DecisionTreeRegressor5']
Best error: 19.676861761167288
************************* DecisionTreeRegressor5 *************************
Best var: GradientBoostingRegressor30_md5 --> 19.9885
Best var: De

NameError: name 'models' is not defined

In [29]:
def FitEnsembles(X,y,base_models,ensemble_models):
    '''
    X,y --> nuestra data
    base_models --> Los modelos de los que se nutrirá nuestro ensemble
    ensemble_models --> Modelos que son la capa final del ensemble
    '''
    
    X_train_ensemble, X_test_ensemble, y_train_ensemble, y_test_ensemble = train_test_split(X, y, test_size=0.4, random_state=7)
    X_train, X_test, y_train, y_test = train_test_split(X_train_ensemble, y_train_ensemble, test_size=0.3, random_state=7)

    # Dataframe donde almacenaremos los resultados de nuestros modelos
    pred_ensemble_models = pd.DataFrame(data = np.zeros((X_test.shape[0],len(base_models))), 
                                        columns = list(base_models.keys()))
    # Dataframe donde almacenaremos los resultados de nuestros modelos
    pred_ensemble_models_test = pd.DataFrame(data = np.zeros((X_test_ensemble.shape[0],len(base_models))), 
                                    columns = list(base_models.keys()))
    
    # Procedemos a entrenar los base_models
    for name, model in base_models.items():
        base_models[name] = model.fit(X_train, y_train)
        y_test_predict = base_models[name].predict(X_test)
        # Almacenamos la predicción de cada modelo
        pred_ensemble_models[name] = y_test_predict
        pred_ensemble_models_test[name] = base_models[name].predict(X_test_ensemble)
        
        print(name,': ', median_absolute_error(np.exp(y_test_ensemble)-1, np.exp(pred_ensemble_models_test[name])-1), sep = '')
    
    print('-'*25+' Ensemble Time '+'-'*25)
    # Predict con el ensemble
    for name, model in ensemble_models.items():
        ensemble_models[name] = ensemble_models[name].fit(pred_ensemble_models, y_test)
        
        y_final_pred = ensemble_models[name].predict(pred_ensemble_models_test)
        print(name,': ', median_absolute_error(np.exp(y_test_ensemble)-1, np.exp(y_final_pred)-1), sep = '')
    
    # Devolvemos los df con los que entrenamos el ensemble 
    return pred_ensemble_models, pred_ensemble_models_test, y_test, y_test_ensemble


models2 = {
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=10, n_estimators = 20, random_state=0),
    'ExtraTreesRegressor50':ExtraTreesRegressor(n_estimators=50,random_state=0),
    'GradientBoostingRegressor30_md5':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0, loss='ls'),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    'nn250': MLPRegressor(hidden_layer_sizes = (30, 20), max_iter = 500),
    }
models3 = {
    'DecisionTreeRegressor2':tree.DecisionTreeRegressor(max_depth = 2),
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'RandomForestRegressor10_md2':RandomForestRegressor(max_depth=2, n_estimators = 10, random_state=0),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=5, n_estimators = 20, random_state=0),
    'XGB15':XGBRegressor(max_depth = 10, n_estimators=15, random_state=7),
    'XGB30':XGBRegressor(max_depth = 10, n_estimators=30, random_state=7),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    }

X_train_ensemble, X_test_ensemble, y_train_ensemble, y_test_ensemble = FitEnsembles(X_norm.drop(['PV_idea_pca'], axis = 1),y,models2,models3)
errores = []
for name, model in models3.items():
    #try:
    print('*'*25+' {} '.format(name)+'*'*25)
    error = EntrenarForward(X_train_ensemble, y_train_ensemble, model, ['XGB46'])
    errores.append(error)

DecisionTreeRegressor5: 21.25391903041517
RandomForestRegressor20: 19.75468022879491
ExtraTreesRegressor50: 19.98785532323138
GradientBoostingRegressor30_md5: 19.687520697512582
XGB46: 19.25421409606934
nn250: 20.411276825466455
------------------------- Ensemble Time -------------------------
DecisionTreeRegressor2: 19.94826037626116
DecisionTreeRegressor5: 19.636297634679536
RandomForestRegressor10_md2: 19.090839909539106
RandomForestRegressor20: 18.88948569582935
XGB15: 28.020206909179677
XGB30: 19.0473624420166
XGB46: 20.152078590393057
************************* DecisionTreeRegressor2 *************************
Best var: GradientBoostingRegressor30_md5 --> 19.6769
Best var: DecisionTreeRegressor5 --> 19.6769
Best var: nn250 --> 19.6769
Best var:  --> 19.6769
Best vars: ['XGB46', 'GradientBoostingRegressor30_md5', 'DecisionTreeRegressor5', 'nn250']
Best error: 19.676861761167345
************************* DecisionTreeRegressor5 *************************
Best var: GradientBoostingRegre

# Keras

In [32]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

In [52]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.4, random_state=7)

#estimator.fit(X_train, y_train)
model = Sequential()
model.add(Dense(13, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
# Compile model
#model.compile(loss='mean_squared_error', optimizer='adam')
#
#estimator = KerasRegressor(build_fn = model, epochs=100, batch_size=5, verbose=0)

model.compile(loss='mean_squared_error', optimizer='adam')
results = model.fit(X_train.values, y_train.values, batch_size = 30, epochs=200, verbose=0)

#print("Results: {:.2f} ({:.2f}) MSE".format(results.mean(), results.std()))

y_pred = model.predict(X_test.values)

median_absolute_error(np.exp(y_test)-1, np.exp(y_pred)-1)

18.384207229614272

In [65]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

X_train, X_test, y_train, y_test = train_test_split(X_norm[['HY_precio', 'GA_exit_rate', 'PV_pca2', 'PV_longitud_descripcion', 
                                                            'GA_page_views', 'PV_cantidad_imagenes', 'GA_quincena_ini', 'PV_idea_pca']], y, 
                                                    test_size=0.3, random_state=7)

#estimator.fit(X_train, y_train)
model = Sequential()
model.add(Dense(13, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
# Compile model
#model.compile(loss='mean_squared_error', optimizer='adam')
#
#estimator = KerasRegressor(build_fn = model, epochs=100, batch_size=5, verbose=0)

model.compile(loss='mean_squared_error', optimizer='adam')
results = model.fit(X_train.values, y_train.values, batch_size = 30, epochs=150, verbose=0)

#print("Results: {:.2f} ({:.2f}) MSE".format(results.mean(), results.std()))

y_pred = model.predict(X_test.values)

median_absolute_error(np.exp(y_test)-1, np.exp(y_pred)-1)

18.46817947387695

In [69]:
for i in [50, 100, 150, 200]:
    for j in [10, 15, 30, 50]:
        X_train, X_test, y_train, y_test = train_test_split(X_norm[['HY_precio', 'GA_exit_rate', 'PV_pca2', 'PV_longitud_descripcion', 
                                                            'GA_page_views', 'PV_cantidad_imagenes', 'GA_quincena_ini', 'PV_idea_pca']], y, 
                                                    test_size=0.3, random_state=7)

        #estimator.fit(X_train, y_train)
        model = Sequential()
        model.add(Dense(13, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))
        model.add(Dense(1, kernel_initializer='normal'))
        # Compile model
        #model.compile(loss='mean_squared_error', optimizer='adam')
        #
        #estimator = KerasRegressor(build_fn = model, epochs=100, batch_size=5, verbose=0)
        
        model.compile(loss='mean_squared_error', optimizer='adam')
        results = model.fit(X_train.values, y_train.values, batch_size = j, epochs=i, verbose=0)
        
        y_pred = model.predict(X_test.values)
        
        print('{} eps, {} bs --> {:.4f}'.format(i,j,median_absolute_error(np.exp(y_test)-1, np.exp(y_pred)-1)))

50 eps, 10 bs --> 23.3948
50 eps, 15 bs --> 19.9649
50 eps, 30 bs --> 19.3209
50 eps, 50 bs --> 23.0016
100 eps, 10 bs --> 21.3169
100 eps, 15 bs --> 21.5905
100 eps, 30 bs --> 19.3479
100 eps, 50 bs --> 19.6670
150 eps, 10 bs --> 22.3069
150 eps, 15 bs --> 18.9892
150 eps, 30 bs --> 21.3619
150 eps, 50 bs --> 19.8038
200 eps, 10 bs --> 19.8631
200 eps, 15 bs --> 18.7639
200 eps, 30 bs --> 21.5385
200 eps, 50 bs --> 18.6925


In [71]:
for i in [200, 300, 400, 500]:
    for j in [10, 15, 20]:
        X_train, X_test, y_train, y_test = train_test_split(X_norm[['HY_precio', 'GA_exit_rate', 'PV_pca2', 'PV_longitud_descripcion', 
                                                            'GA_page_views', 'PV_cantidad_imagenes', 'GA_quincena_ini', 'PV_idea_pca']], y, 
                                                    test_size=0.3, random_state=7)

        #estimator.fit(X_train, y_train)
        model = Sequential()
        model.add(Dense(j, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))
        model.add(Dense(1, kernel_initializer='normal'))
        # Compile model
        #model.compile(loss='mean_squared_error', optimizer='adam')
        #
        #estimator = KerasRegressor(build_fn = model, epochs=100, batch_size=5, verbose=0)
        
        model.compile(loss='mean_squared_error', optimizer='adam')
        results = model.fit(X_train.values, y_train.values, batch_size = 50, epochs=i, verbose=0)
        
        y_pred = model.predict(X_test.values)
        
        print('{} eps, {} lay --> {:.4f}'.format(i,j,median_absolute_error(np.exp(y_test)-1, np.exp(y_pred)-1)))

200 eps, 10 lay --> 19.2715
200 eps, 15 lay --> 18.9075
200 eps, 20 lay --> 18.6282
300 eps, 10 lay --> 19.7295
300 eps, 15 lay --> 18.9537
300 eps, 20 lay --> 21.7513
400 eps, 10 lay --> 23.0959
400 eps, 15 lay --> 19.3344
400 eps, 20 lay --> 18.0148
500 eps, 10 lay --> 22.5386
500 eps, 15 lay --> 18.2996
500 eps, 20 lay --> 18.9575


# Todo junto:

In [None]:
def FitEnsembles(X,y,base_models,ensemble_models):
    '''
    X,y --> nuestra data
    base_models --> Los modelos de los que se nutrirá nuestro ensemble
    ensemble_models --> Modelos que son la capa final del ensemble
    '''
    
    X_train_ensemble, X_test_ensemble, y_train_ensemble, y_test_ensemble = train_test_split(X, y, test_size=0.4, random_state=7)
    X_train, X_test, y_train, y_test = train_test_split(X_train_ensemble, y_train_ensemble, test_size=0.3, random_state=7)

    # Dataframe donde almacenaremos los resultados de nuestros modelos
    pred_ensemble_models = pd.DataFrame(data = np.zeros((X_test.shape[0],len(base_models))), 
                                        columns = list(base_models.keys()))
    # Dataframe donde almacenaremos los resultados de nuestros modelos
    pred_ensemble_models_test = pd.DataFrame(data = np.zeros((X_test_ensemble.shape[0],len(base_models))), 
                                    columns = list(base_models.keys()))
    
    # Procedemos a entrenar los base_models
    for name, model in base_models.items():
        base_models[name] = model.fit(X_train, y_train)
        y_test_predict = base_models[name].predict(X_test)
        # Almacenamos la predicción de cada modelo
        pred_ensemble_models[name] = y_test_predict
        pred_ensemble_models_test[name] = base_models[name].predict(X_test_ensemble)
        
        print(name,': ', median_absolute_error(np.exp(y_test_ensemble)-1, np.exp(pred_ensemble_models_test[name])-1), sep = '')
    
    print('-'*25+' Ensemble Time '+'-'*25)
    # Predict con el ensemble
    for name, model in ensemble_models.items():
        ensemble_models[name] = ensemble_models[name].fit(pred_ensemble_models, y_test)
        
        y_final_pred = ensemble_models[name].predict(pred_ensemble_models_test)
        print(name,': ', median_absolute_error(np.exp(y_test_ensemble)-1, np.exp(y_final_pred)-1), sep = '')
    
    # Devolvemos los df con los que entrenamos el ensemble 
    return pred_ensemble_models, pred_ensemble_models_test, y_test, y_test_ensemble


models2 = {
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=10, n_estimators = 20, random_state=0),
    'ExtraTreesRegressor50':ExtraTreesRegressor(n_estimators=50,random_state=0),
    'GradientBoostingRegressor30_md5':GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, max_depth=5, random_state=0, loss='ls'),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    'nn250': MLPRegressor(hidden_layer_sizes = (30, 20), max_iter = 500),
    }
models3 = {
    'DecisionTreeRegressor2':tree.DecisionTreeRegressor(max_depth = 2),
    'DecisionTreeRegressor5':tree.DecisionTreeRegressor(max_depth = 5),
    'RandomForestRegressor10_md2':RandomForestRegressor(max_depth=2, n_estimators = 10, random_state=0),
    'RandomForestRegressor20':RandomForestRegressor(max_depth=5, n_estimators = 20, random_state=0),
    'XGB15':XGBRegressor(max_depth = 10, n_estimators=15, random_state=7),
    'XGB30':XGBRegressor(max_depth = 10, n_estimators=30, random_state=7),
    'XGB46':XGBRegressor(max_depth = 10, n_estimators=46, random_state=7),
    }

X_train_ensemble, X_test_ensemble, y_train_ensemble, y_test_ensemble = FitEnsembles(X_norm.drop(['PV_idea_pca'], axis = 1),y,models2,models3)
errores = []
for name, model in models3.items():
    #try:
    print('*'*25+' {} '.format(name)+'*'*25)
    error = EntrenarForward(X_train_ensemble, y_train_ensemble, model, ['XGB46'])
    errores.append(error)