# MLBox – House prices (`SalePrice`)

#### Goal :

- Create an ML model using MLBox for the house-price dataset loaded below (target column `SalePrice`)
- Get the RMSE over the predictions of this model

#### Imports

In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics
from math import sqrt
from mlbox.preprocessing import Reader
from mlbox.preprocessing import Drift_thresholder
from mlbox.optimisation import make_scorer
from mlbox.optimisation import Optimiser
from mlbox.prediction import Predictor


reading csv : train.csv ...
cleaning data ...
CPU time: 0.6825635433197021 seconds

reading csv : test.csv ...
cleaning data ...
CPU time: 0.45384645462036133 seconds

> Number of common features : 80

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 43
> Number of numerical features: 37
> Number of training samples : 1460
> Number of test samples : 1459

> Top sparse features (% missing values on train set):
PoolQC         99.5
MiscFeature    96.3
Alley          93.8
Fence          80.8
FireplaceQu    47.3
dtype: float64

> Task : regression
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

computing drifts ...
CPU time: 3.251035213470459 seconds

> Top 10 

  +str(self.to_path)+"/joblib'. Please clear it regularly.")


Paths to the train set and the test set.

In [None]:
# CSV files handed to the reader: the train file first, then the test file.
paths = [
    "../../Data/California_house/train.csv",
    "../../Data/California_house/test.csv",
]

Name of the feature to predict.
This column should only be present in the train set.

In [None]:
# Name of the column to predict; passed to the reader together with `paths`.
target_name: str = "SalePrice"

Reading and cleaning all files.
Declare a reader for csv files.

In [None]:
# Reader configured for comma-separated files.
rd = Reader(sep=",")

`train_test_split` returns a dictionary containing three entries:
- dict["train"] contains the training samples without the target column
- dict["test"] contains the testing samples without the target column
- dict["target"] contains the target column for the training samples.

In [None]:
# Read and clean the files listed in `paths`, separating out `target_name`.
data = rd.train_test_split(
    paths,
    target_name,
)

Removing the drifting variables

In [None]:
# Drift filter used to remove the drifting variables from the dataset.
dft = Drift_thresholder()
# NOTE: `data` is rebound to the filtered result, so this cell is not
# idempotent -- re-running it feeds already-filtered data back into the filter.
data = dft.fit_transform(data)

Tuning

In [None]:
def _mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Computes ``100 * sum(|y_true - y_pred| / y_true) / len(y_true)``.
    The division uses the raw ``y_true`` values, so zeros in ``y_true``
    are not guarded against.
    """
    relative_errors = np.abs(y_true - y_pred) / y_true
    return 100 * np.sum(relative_errors) / len(y_true)


# A lower error is better, hence greater_is_better=False.
mape = make_scorer(_mean_absolute_percentage_error,
                   greater_is_better=False,
                   needs_proba=False)

Declare an optimiser

In [None]:
# Optimiser scored with 3-fold cross-validation.
# The scoring string is the scikit-learn scorer name "neg_mean_squared_error":
# the bare "mean_squared_error" is not a valid scoring string, and the run log
# reports neg_mean_squared_error as the metric in effect. Scores are therefore
# negated MSE values (closer to zero is better).
opt = Optimiser(scoring="neg_mean_squared_error", n_folds=3)

Space of hyperparameters

In [None]:
# Hyper-parameter search space. The key prefix selects the pipeline step
# (the run log labels them NA encoder, CA encoder, feature selector, estimator).
space = {
    # NA encoder: single candidate value for the numerical strategy.
    'ne__numerical_strategy': {
        "search": "choice",
        "space": [0],
    },
    # Categorical encoder: one of three encoding strategies.
    'ce__strategy': {
        "search": "choice",
        "space": ["label_encoding", "random_projection", "entity_embedding"],
    },
    # Feature selector: threshold drawn uniformly between the two bounds.
    'fs__threshold': {
        "search": "uniform",
        "space": [0.01, 0.3],
    },
    # Estimator: candidate tree depths.
    'est__max_depth': {
        "search": "choice",
        "space": [3, 4, 5, 6, 7],
    },
}

Optimises hyper-parameters of the whole Pipeline

In [2]:
# Run the hyper-parameter search over `space` (third argument: 40, matching the
# 0/40 trial counter in the log), then score the best configuration found.
best = opt.optimise(space, data, 40)
final_score = opt.evaluate(best, data)
print("Final results : ", final_score)

##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}
>>> CA ENCODER :{'strategy': 'entity_embedding'}      
  0%|          | 0/40 [00:00<?, ?trial/s, best loss=?]




>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.26606117093345866}
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 4, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 0.9, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'nthread': -1, 'seed': 0}
  0%|          | 0/40 [00:00<?, ?trial/s, best loss=?]

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -900921299.7172343
VARIANCE : 351178789.0767834 (fold 1 = -721565509.653147, fold 2 = -1391677040.4027362, fold 3 = -589521349.09582)
CPU time: 7.061063766479492 seconds                   
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'entity_embedding'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.1480372523094022}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 6, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': Tr

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -929784024.3913746                       
VARIANCE : 359521066.6101933 (fold 1 = -742074778.6912899, fold 2 = -1432853780.5317507, fold 3 = -614423513.9510834)
CPU time: 7.217243909835815 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'label_encoding'}                                 
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.29711275773087137}     
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 7, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -946277053.9861931                       
VARIANCE : 356419356.9826308 (fold 1 = -736344872.3220425, fold 2 = -1448103947.0829484, fold 3 = -654382342.5535882)
CPU time: 3.087646722793579 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'random_projection'}                              
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.050325991019995894}    
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 6, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -954204472.4954271                       
VARIANCE : 383246483.3441557 (fold 1 = -782966593.7989023, fold 2 = -1485160354.1336021, fold 3 = -594486469.5537766)
CPU time: 3.944675922393799 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'random_projection'}                              
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.07469221436306847}     
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -874419386.3908612                       
VARIANCE : 330204400.4533249 (fold 1 = -697455324.019692, fold 2 = -1337154286.0890722, fold 3 = -588648549.0638192)
CPU time: 3.1102139949798584 seconds                                           
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'label_encoding'}                                 
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.15740787646485827}     
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 5, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None,

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -961708912.098619                        
VARIANCE : 345039728.906723 (fold 1 = -807653193.6355202, fold 2 = -1439709025.3486369, fold 3 = -637764517.3116999)
CPU time: 2.602216958999634 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'random_projection'}                              
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.028310101807272482}    
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 5, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None,

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -916187019.8744954                       
VARIANCE : 392826589.9215118 (fold 1 = -725496476.0481617, fold 2 = -1463413838.208246, fold 3 = -559650745.3670782)
CPU time: 3.5917434692382812 seconds                                           
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'random_projection'}                              
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.2677340464828649}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 5, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None,

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -897512046.3394762                       
VARIANCE : 344213510.95665497 (fold 1 = -679284911.0329605, fold 2 = -1383463907.892909, fold 3 = -629787320.0925592)
CPU time: 3.510580062866211 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'entity_embedding'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.13524280588388077}     
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 6, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -929784024.3913746                       
VARIANCE : 359521066.6101933 (fold 1 = -742074778.6912899, fold 2 = -1432853780.5317507, fold 3 = -614423513.9510834)
CPU time: 7.537013053894043 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'random_projection'}                              
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.04947548732205739}     
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 5, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -916187019.8744954                       
VARIANCE : 392826589.9215118 (fold 1 = -725496476.0481617, fold 2 = -1463413838.208246, fold 3 = -559650745.3670782)
CPU time: 3.6943068504333496 seconds                                           
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'entity_embedding'}                                
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.2847986386021459}       
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 4, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': No

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -900921299.7172343                        
VARIANCE : 351178789.0767834 (fold 1 = -721565509.653147, fold 2 = -1391677040.4027362, fold 3 = -589521349.09582)
CPU time: 6.094707489013672 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.25497399104906887}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 6, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': No

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -902426854.8672428                        
VARIANCE : 325806186.53757685 (fold 1 = -717991374.46762, fold 2 = -1360311344.9316785, fold 3 = -628977845.2024298)
CPU time: 4.012228965759277 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.2589815062579321}       
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 4, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -897577328.7536517                        
VARIANCE : 344443411.6038728 (fold 1 = -669875554.4804608, fold 2 = -1384356925.1506138, fold 3 = -638499506.6298808)
CPU time: 3.2228384017944336 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'entity_embedding'}                                
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.14418297349119114}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 6, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state':

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -929784024.3913746                        
VARIANCE : 359521066.6101933 (fold 1 = -742074778.6912899, fold 2 = -1432853780.5317507, fold 3 = -614423513.9510834)
CPU time: 6.910539150238037 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'label_encoding'}                                  
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.011238900907248293}     
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state':

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -930948453.9165632                        
VARIANCE : 346560277.219052 (fold 1 = -751869205.2029922, fold 2 = -1415588072.7422888, fold 3 = -625388083.8044087)
CPU time: 2.394495725631714 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'label_encoding'}                                  
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.2674677442263298}       
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 5, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -907848119.0853157                        
VARIANCE : 370271771.7185044 (fold 1 = -660477732.8425618, fold 2 = -1431230317.955952, fold 3 = -631836306.4574333)
CPU time: 2.726121664047241 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.12334825059700354}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 5, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -919276760.6136769                        
VARIANCE : 387673670.20621836 (fold 1 = -705280955.9251897, fold 2 = -1463413838.208246, fold 3 = -589135487.7075949)
CPU time: 4.600215196609497 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.06185459301887354}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state':

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -874419386.3908612                        
VARIANCE : 330204400.4533249 (fold 1 = -697455324.019692, fold 2 = -1337154286.0890722, fold 3 = -588648549.0638192)
CPU time: 3.4635932445526123 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'entity_embedding'}                                
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.09127676922579481}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 7, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -928663172.455616                         
VARIANCE : 359776672.0740406 (fold 1 = -736839967.0900067, fold 2 = -1432694578.707734, fold 3 = -616454971.5691069)
CPU time: 7.56917667388916 seconds                                              
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.0594759138027328}       
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 4, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -890525246.4956702                        
VARIANCE : 372619810.93857557 (fold 1 = -672115534.6526834, fold 2 = -1415050681.7114568, fold 3 = -584409523.1228703)
CPU time: 3.656399965286255 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.18420457354294428}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state'

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -917910606.360064                         
VARIANCE : 348026620.0829641 (fold 1 = -753337293.4593374, fold 2 = -1401906748.5587943, fold 3 = -598487777.0620605)
CPU time: 3.0743813514709473 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.09357361025472845}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state':

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -874419386.3908612                        
VARIANCE : 330204400.4533249 (fold 1 = -697455324.019692, fold 2 = -1337154286.0890722, fold 3 = -588648549.0638192)
CPU time: 3.100966691970825 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.10271650679151925}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -864638297.3940634                        
VARIANCE : 335690464.5193856 (fold 1 = -668112057.0292984, fold 2 = -1337154286.0890722, fold 3 = -588648549.0638192)
CPU time: 4.221695899963379 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.18403902223462454}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state':

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -917910606.360064                         
VARIANCE : 348026620.0829641 (fold 1 = -753337293.4593374, fold 2 = -1401906748.5587943, fold 3 = -598487777.0620605)
CPU time: 4.055310487747192 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.10261056775241012}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state':

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -864638297.3940634                        
VARIANCE : 335690464.5193856 (fold 1 = -668112057.0292984, fold 2 = -1337154286.0890722, fold 3 = -588648549.0638192)
CPU time: 4.2435383796691895 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.10866269706225014}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state':

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -864638297.3940634                        
VARIANCE : 335690464.5193856 (fold 1 = -668112057.0292984, fold 2 = -1337154286.0890722, fold 3 = -588648549.0638192)
CPU time: 3.7854788303375244 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'label_encoding'}                                  
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.2239180498372774}       
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state':

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -915231660.497424                         
VARIANCE : 315946675.98933494 (fold 1 = -761051649.0946233, fold 2 = -1355508733.9283056, fold 3 = -629134598.4693428)
CPU time: 3.173470973968506 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.17636545964340095}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state'

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -913074882.8965921                        
VARIANCE : 355163505.13186413 (fold 1 = -743989842.7802062, fold 2 = -1407213984.9387634, fold 3 = -588020820.9708073)
CPU time: 4.14545750617981 seconds                                              
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.11713146865158974}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 7, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state'

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -943266536.8510228                        
VARIANCE : 388412677.4061792 (fold 1 = -737838407.9632446, fold 2 = -1487167721.2916372, fold 3 = -604793481.2981869)
CPU time: 6.339411020278931 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.212059542334262}        
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state':

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -892901248.0329529                        
VARIANCE : 340257955.1263336 (fold 1 = -702527904.1844956, fold 2 = -1370817206.1427293, fold 3 = -605358633.7716339)
CPU time: 3.0586273670196533 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'entity_embedding'}                                
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.028065320209961164}     
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state':

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -880882880.8986775                        
VARIANCE : 301072630.0578903 (fold 1 = -724285648.8936479, fold 2 = -1302073714.735361, fold 3 = -616289279.0670233)
CPU time: 6.75621771812439 seconds                                              
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'label_encoding'}                                  
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.16282239566997672}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 7, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -958044167.9685785                        
VARIANCE : 370060326.4127969 (fold 1 = -773581666.6705997, fold 2 = -1474418268.3607748, fold 3 = -626132568.874361)
CPU time: 3.5847654342651367 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.07870015307245311}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -874419386.3908612                        
VARIANCE : 330204400.4533249 (fold 1 = -697455324.019692, fold 2 = -1337154286.0890722, fold 3 = -588648549.0638192)
CPU time: 3.574638843536377 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.12453292872159985}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 4, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -922830350.6026272                        
VARIANCE : 370549184.0450439 (fold 1 = -691510923.7520396, fold 2 = -1445710567.8901508, fold 3 = -631269560.1656907)
CPU time: 3.755495309829712 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'label_encoding'}                                  
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.11276366100973129}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state':

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -924266115.9668522                        
VARIANCE : 337359868.14320004 (fold 1 = -735245987.3309923, fold 2 = -1398145478.1704416, fold 3 = -639406882.3991227)
CPU time: 2.5076904296875 seconds                                               
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'entity_embedding'}                                
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.20631755256958473}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 7, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state'

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -928663172.455616                         
VARIANCE : 359776672.0740406 (fold 1 = -736839967.0900067, fold 2 = -1432694578.707734, fold 3 = -616454971.5691069)
CPU time: 7.731536865234375 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.036613030622680384}     
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -874419386.3908612                        
VARIANCE : 330204400.4533249 (fold 1 = -697455324.019692, fold 2 = -1337154286.0890722, fold 3 = -588648549.0638192)
CPU time: 3.506225109100342 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.010163572651046754}     
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 6, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -954204472.4954271                        
VARIANCE : 383246483.3441557 (fold 1 = -782966593.7989023, fold 2 = -1485160354.1336021, fold 3 = -594486469.5537766)
CPU time: 4.929996490478516 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'random_projection'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.08213538751934721}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state':

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -874419386.3908612                        
VARIANCE : 330204400.4533249 (fold 1 = -697455324.019692, fold 2 = -1337154286.0890722, fold 3 = -588648549.0638192)
CPU time: 3.8954994678497314 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}     
>>> CA ENCODER :{'strategy': 'label_encoding'}                                  
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.13537083446910095}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 4, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 

  positive)

  positive)

  positive)



MEAN SCORE : neg_mean_squared_error = -941211715.9882464                        
VARIANCE : 358309396.5112269 (fold 1 = -733572359.6411499, fold 2 = -1445334852.4458594, fold 3 = -644727935.8777298)
CPU time: 3.1373825073242188 seconds                                            
100%|██████████| 40/40 [02:54<00:00,  4.36s/trial, best loss: 864638297.3940634]


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ BEST HYPER-PARAMETERS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

{'ce__strategy': 'random_projection', 'est__max_depth': 3, 'fs__threshold': 0.10271650679151925, 'ne__numerical_strategy': 0}

##################################################### testing hyper-parameters... #############

  positive)
  positive)
  positive)



MEAN SCORE : neg_mean_squared_error = -864638297.3940634
VARIANCE : 335690464.5193856 (fold 1 = -668112057.0292984, fold 2 = -1337154286.0890722, fold 3 = -588648549.0638192)
CPU time: 3.86971378326416 seconds

Final results :  -864638297.3940634


Make a prediction

In [None]:
# Fit a predictor with the selected hyper-parameters (`best`) on the dataset
# dictionary (`data`) and predict on its test split.
# Explicit import instead of `import *` so the origin of `Predictor` is visible.
from mlbox.prediction import Predictor

pred = Predictor()
# NOTE(review): presumably this call writes save/SalePrice_predictions.csv,
# which the following cell reads back — confirm against the MLBox docs.
pred.fit_predict(best, data)

Getting the predictions and targets

In [4]:
# Load the saved predictions and the held-out targets from disk.
y_pred = pd.read_csv("save/SalePrice_predictions.csv")
# The predictions file exposes the column `SalePrice_predicted` (see the
# summary printed below this cell); `AveragePrice_predicted` does not exist
# in it and raised AttributeError.
predictions = y_pred["SalePrice_predicted"]
# NOTE(review): assumes y_test.csv is row-aligned with the predictions file — confirm.
y_test = pd.read_csv("../../Data/California_house/processed/y_test.csv")

Unnamed: 0.1,Unnamed: 0,SalePrice_predicted
count,1459.0,1459.0
mean,729.0,179022.867991
std,421.321334,77879.341567
min,0.0,43400.770114
25%,364.5,128148.813415
50%,729.0,158642.079861
75%,1093.5,211934.41503
max,1458.0,519868.288239


Calculating RMSE

In [5]:
# Root-mean-squared error of the predictions against the held-out targets.
from math import sqrt
from sklearn.metrics import mean_squared_error

# Mean squared error first, then its square root for the reported RMSE.
mse = mean_squared_error(y_test, predictions)
print("rmse score:", sqrt(mse))

In [11]:
# RMSE of the test-set predictions.
# `sqrt` and `sklearn.metrics` are already imported by the notebook's imports
# cell (In [1]), so they are not re-imported here. The value is kept in `rmse`
# so it can be reused instead of being recomputed.
rmse = sqrt(sklearn.metrics.mean_squared_error(y_test, predictions))
print("rmse score:", rmse)

rmse score: 110545.05648529054


## Residual Plot

In [16]:
import matplotlib.pyplot as plt

# Residual plot: residual (actual - predicted) against the predicted value.
# The previous version plotted `score`, which is not a per-sample residual.
# NOTE(review): assumes y_test holds a single target column aligned
# row-for-row with `predictions` — confirm against y_test.csv.
y_hat = predictions.to_numpy()
residuals = y_test.to_numpy().ravel() - y_hat

plt.scatter(y_hat, residuals, c="grey", label="Testing Data")
plt.legend()
# Zero-error reference line spanning the x-range actually plotted (the predictions).
plt.hlines(y=0, xmin=y_hat.min(), xmax=y_hat.max())
plt.xlabel("Predicted SalePrice")
plt.ylabel("Residual (actual - predicted)")
plt.title("Residual Plot")
plt.show()