In [None]:
#Classification
from mlbox.preprocessing import Reader
from mlbox.preprocessing import Drift_thresholder
from mlbox.optimisation import Optimiser
from mlbox.prediction import Predictor
import csv 


# Paths to the train set and the test set.
paths = ["../Data/titanic/train.csv","../Data/titanic/test.csv"]
# Name of the feature to predict.
# This columns should only be present in the train set.
target_name = "Survived"

# Reading and cleaning all files
# Declare a reader for csv files
rd = Reader(sep=',')
# Return a dictionnary containing three entries
# dict["train"] contains training samples withtout target columns
# dict["test"] contains testing elements withtout target columns
# dict["target"] contains target columns for training samples.
data = rd.train_test_split(paths, target_name)

dft = Drift_thresholder()
data = dft.fit_transform(data)

# Tuning
# Declare an optimiser. Scoring possibilities for classification lie in :
# {"accuracy", "roc_auc", "f1", "neg_log_loss", "precision", "recall"}
opt = Optimiser(scoring='accuracy', n_folds=3)
opt.evaluate(None, data)

# Space of hyperparameters
# The keys must respect the following syntax : "enc__param".
#   "enc" = "ne" for na encoder
#   "enc" = "ce" for categorical encoder
#   "enc" = "fs" for feature selector [OPTIONAL]
#   "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL]
#   "enc" = "est" for the final estimator
#   "param" : a correct associated parameter for each step.
#   Ex: "max_depth" for "enc"="est", ...
# The values must respect the syntax: {"search":strategy,"space":list}
#   "strategy" = "choice" or "uniform". Default = "choice"
#   list : a list of values to be tested if strategy="choice".
#   Else, list = [value_min, value_max].
# Available strategies for ne_numerical_strategy are either an integer, a float
#   or in {'mean', 'median', "most_frequent"}
# Available strategies for ce_strategy are:
#   {"label_encoding", "dummification", "random_projection", entity_embedding"}
space = {'ne__numerical_strategy': {"search": "choice", "space": [0]},
         'ce__strategy': {"search": "choice",
                          "space": ["label_encoding",
                                    "random_projection",
                                    "entity_embedding"]},
         'fs__threshold': {"search": "uniform",
                           "space": [0.01, 0.3]},
         'est__max_depth': {"search": "choice",
                            "space": [3, 4, 5, 6, 7]}

         }

# Optimises hyper-parameters of the whole Pipeline with a given scoring
# function. Algorithm used to optimize : Tree Parzen Estimator.
#
# IMPORTANT : Try to avoid dependent parameters and to set one feature
# selection strategy and one estimator strategy at a time.
best = opt.optimise(space, data, 15)

# Make prediction and save the results in save folder.
prd = Predictor()
prd.fit_predict(best, data)


reading csv : high_diamond_ranked_10min.csv1.csv ...
cleaning data ...
CPU time: 0.3347346782684326 seconds

reading csv : high_diamond_ranked_10min.csv2.csv ...
cleaning data ...
CPU time: 0.11321568489074707 seconds

You have no test dataset !

> Number of common features : 39

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 0
> Number of numerical features: 39
> Number of training samples : 9879
> Number of test samples : 0

> You have no missing values on train set...

> Task : classification
0.0    4949
1.0    4930
Name: blueWins, dtype: int64

encoding target ...

You have no test dataset...
No parameters set. Default configuration is tested

##################################################### testing hyper-parameters... #####################################################

>>> NA ENCODER :{'numerical_strategy':

  +str(self.to_path)+"/joblib'. Please clear it regularly.")



MEAN SCORE : accuracy = 0.7162668286263791
VARIANCE : 0.008467866131537325 (fold 1 = 0.7139386577588824, fold 2 = 0.7072578196173702, fold 3 = 0.7276040085028849)
CPU time: 3.362366199493408 seconds

##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}                                            
>>> CA ENCODER :{'strategy': 'random_projection'}                                                                      
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.22889813189704142}                                             
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 5, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 

CPU time: 3.9055869579315186 seconds                                                                                   
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}                                            
>>> CA ENCODER :{'strategy': 'entity_embedding'}                                                                       
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.14278606264224958}                                             
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 5, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0,



CPU time: 1.6390135288238525 seconds


In [15]:
#Regression
import numpy as np
import pandas as pd
import sklearn.metrics
from math import sqrt
from mlbox.preprocessing import Reader
from mlbox.preprocessing import Drift_thresholder
from mlbox.optimisation import make_scorer
from mlbox.optimisation import Optimiser
from mlbox.prediction import Predictor

# Paths to the train set and the test set.
paths = ["../Data/California_house/train.csv", "../Data/California_house/test.csv"]
# Name of the feature to predict.
# This columns should only be present in the train set.
target_name = "SalePrice"

# Reading and cleaning all files
# Declare a reader for csv files
rd = Reader(sep=',')
# Return a dictionnary containing three entries
# dict["train"] contains training samples withtout target columns
# dict["test"] contains testing elements withtout target columns
# dict["target"] contains target columns for training samples.
data = rd.train_test_split(paths, target_name)

dft = Drift_thresholder()
data = dft.fit_transform(data)

# Tuning
mape = make_scorer(lambda y_true,
                   y_pred: 100*np.sum(
                                      np.abs(y_true-y_pred)/y_true
                                      )/len(y_true),
                   greater_is_better=False,
                   needs_proba=False)
# Declare an optimiser. You can declare your own score
# as presented here or use one in
# {"neg_mean_absolute_error", "neg_mean_squared_error", "neg_mean_squared_log_error", "neg_median_absolute_error","r2"}
opt = Optimiser(scoring="mean_squared_error", n_folds=3)
print("OUIIII" ,opt.evaluate(None, data))

# Space of hyperparameters
# The keys must respect the following syntax : "enc__param".
#   "enc" = "ne" for na encoder
#   "enc" = "ce" for categorical encoder
#   "enc" = "fs" for feature selector [OPTIONAL]
#   "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL]
#   "enc" = "est" for the final estimator
#   "param" : a correct associated parameter for each step.
#   Ex: "max_depth" for "enc"="est", ...
# The values must respect the syntax: {"search":strategy,"space":list}
#   "strategy" = "choice" or "uniform". Default = "choice"
#   list : a list of values to be tested if strategy="choice".
#   Else, list = [value_min, value_max].
# Available strategies for ne_numerical_strategy are either an integer, a float
#   or in {'mean', 'median', "most_frequent"}
# Available strategies for ce_strategy are:
#   {"label_encoding", "dummification", "random_projection", entity_embedding"}
space = {
        'ne__numerical_strategy': {"search": "choice",
                                   "space": [0]},
        'ce__strategy': {"search": "choice",
                         "space": ["label_encoding",
                                   "random_projection",
                                   "entity_embedding"]},
        'fs__threshold': {"search": "uniform",
                          "space": [0.01, 0.3]},
        'est__max_depth': {"search": "choice",
                           "space": [3, 4, 5, 6, 7]}

        }

# Optimises hyper-parameters of the whole Pipeline with a given scoring
# function. Algorithm used to optimize : Tree Parzen Estimator.
#
# IMPORTANT : Try to avoid dependent parameters and to set one feature
# selection strategy and one estimator strategy at a time.
best = opt.optimise(space, data, 15)
print("oui : ",opt.optimisation.Optimiser("mean_squared_error").optimise(space, data, 15)))
print(best)




reading csv : train.csv ...
cleaning data ...
CPU time: 0.6609208583831787 seconds

reading csv : test.csv ...
cleaning data ...
CPU time: 0.5561227798461914 seconds

> Number of common features : 80

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 43
> Number of numerical features: 37
> Number of training samples : 1460
> Number of test samples : 1459

> Top sparse features (% missing values on train set):
PoolQC         99.5
MiscFeature    96.3
Alley          93.8
Fence          80.8
FireplaceQu    47.3
dtype: float64

> Task : regression
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

computing drifts ...
CPU time: 4.514478921890259 seconds

> Top 10 d

  +str(self.to_path)+"/joblib'. Please clear it regularly.")



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.724280495125964
VARIANCE : 0.8209723498000103 (fold 1 = -10.46930111453782, fold 2 = -10.122939172213853, fold 3 = -8.580601198626216)
CPU time: 10.2386474609375 seconds

##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}
>>> CA ENCODER :{'strategy': 'random_projection'}     
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.05172919388966048}
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 0.9, 's

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.16678475900378
VARIANCE : 0.8417502919612839 (fold 1 = -10.04577078627961, fold 2 = -9.42252767935932, fold 3 = -8.03205581137241)
CPU time: 4.288841009140015 seconds                   
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}   
>>> CA ENCODER :{'strategy': 'random_projection'}                             
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.2505993630769116}     
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambd

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.776082866600623
VARIANCE : 0.7516477608352751 (fold 1 = -10.47311964228327, fold 2 = -10.122594215218452, fold 3 = -8.732534742300148)
CPU time: 4.583108186721802 seconds                                           
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}   
>>> CA ENCODER :{'strategy': 'entity_embedding'}                              
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.23337067946461196}    
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 6, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.291406026526055
VARIANCE : 0.7083677015222534 (fold 1 = -9.819690877823804, fold 2 = -9.764395568544344, fold 3 = -8.290131633210015)
CPU time: 11.250384330749512 seconds                                          
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}   
>>> CA ENCODER :{'strategy': 'label_encoding'}                                
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.08964778899592642}    
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 4, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, '

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.52563752483587
VARIANCE : 0.7807159274768404 (fold 1 = -10.049406840477472, fold 2 = -10.105492326181789, fold 3 = -8.42201340784835)
CPU time: 5.466446161270142 seconds                                           
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}   
>>> CA ENCODER :{'strategy': 'random_projection'}                             
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.14349204460034615}    
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 7, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, '

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.618391243024625
VARIANCE : 0.8095396182204544 (fold 1 = -10.268823623461811, fold 2 = -10.109100833169597, fold 3 = -8.477249272442469)
CPU time: 7.311812162399292 seconds                                           
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}   
>>> CA ENCODER :{'strategy': 'label_encoding'}                                
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.20292900230097088}    
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 7, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None,

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.900488804722508
VARIANCE : 0.7775651375657084 (fold 1 = -10.486579256087197, fold 2 = -10.413225684730481, fold 3 = -8.801661473349847)
CPU time: 4.468336343765259 seconds                                           
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}   
>>> CA ENCODER :{'strategy': 'random_projection'}                             
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.21069630821156415}    
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 4, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None,

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.636978624867487
VARIANCE : 0.7768116198081974 (fold 1 = -10.398918206131897, fold 2 = -9.941385004583006, fold 3 = -8.570632663887556)
CPU time: 4.361413240432739 seconds                                           
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}   
>>> CA ENCODER :{'strategy': 'label_encoding'}                                
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.031485025338856554}   
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 6, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.4660029699925
VARIANCE : 0.7066664330333843 (fold 1 = -10.074897278352553, fold 2 = -9.847851642915652, fold 3 = -8.475259988709292)
CPU time: 4.726389408111572 seconds                                           
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}   
>>> CA ENCODER :{'strategy': 'random_projection'}                             
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.2918933925041057}     
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 6, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'r

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.937416594497549
VARIANCE : 0.5684315713303486 (fold 1 = -10.578637795708385, fold 2 = -10.036684792196992, fold 3 = -9.196927195587271)
CPU time: 6.10563063621521 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}   
>>> CA ENCODER :{'strategy': 'random_projection'}                             
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.18543968823843288}    
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 7, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None,

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.667853269039943
VARIANCE : 0.9530074670860331 (fold 1 = -10.357424386818751, fold 2 = -10.325915457132387, fold 3 = -8.320219963168691)
CPU time: 7.382337331771851 seconds                                           
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'label_encoding'}                                 
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.05912640920756293}     
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 3, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': No

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.35323422801509
VARIANCE : 0.8512341584675703 (fold 1 = -10.261450323408946, fold 2 = -9.583419269471955, fold 3 = -8.214833091164367)
CPU time: 3.946617603302002 seconds                                            
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'label_encoding'}                                 
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.17679926090749812}     
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 5, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': Non

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.552104951482155
VARIANCE : 0.7620571993226769 (fold 1 = -10.324820807651003, fold 2 = -9.816346298511231, fold 3 = -8.515147748284226)
CPU time: 4.32455039024353 seconds                                             
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'entity_embedding'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.2892114931488397}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 5, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': No

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.250951463299302
VARIANCE : 0.6990484697913713 (fold 1 = -9.76886851767626, fold 2 = -9.721255980742596, fold 3 = -8.262729891479045)
CPU time: 9.8928701877594 seconds                                              
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'entity_embedding'}                               
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.045470207038264356}    
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 7, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.356842165606485
VARIANCE : 0.7687203093975364 (fold 1 = -9.924057662762314, fold 2 = -9.87641329247293, fold 3 = -8.270055541584211)
CPU time: 10.259782314300537 seconds                                           
##################################################### testing hyper-parameters... #####################################################
>>> NA ENCODER :{'numerical_strategy': 0, 'categorical_strategy': '<NULL>'}    
>>> CA ENCODER :{'strategy': 'label_encoding'}                                 
>>> FEATURE SELECTOR :{'strategy': 'l1', 'threshold': 0.2567090768846798}      
>>> ESTIMATOR :{'strategy': 'LightGBM', 'max_depth': 6, 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None

  positive)

  positive)

  positive)



MEAN SCORE : make_scorer(<lambda>, greater_is_better=False) = -9.926518770381298
VARIANCE : 0.6215658598602214 (fold 1 = -10.519801832360304, fold 2 = -10.191595659503042, fold 3 = -9.068158819280548)
CPU time: 4.732445001602173 seconds                                            
100%|██████████| 15/15 [01:33<00:00,  6.25s/trial, best loss: 9.16678475900378]


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ BEST HYPER-PARAMETERS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

{'ce__strategy': 'random_projection', 'est__max_depth': 3, 'fs__threshold': 0.05172919388966048, 'ne__numerical_strategy': 0}


AttributeError: 'str' object has no attribute 'keys'

In [19]:
opt = Optimiser(scoring="mean_squared_error", n_folds=3)
print("OUIIII" ,opt.evaluate(None, data))
sqrt(978096668)

  +str(self.to_path)+"/joblib'. Please clear it regularly.")


No parameters set. Default configuration is tested

##################################################### testing hyper-parameters... #####################################################

>>> NA ENCODER :{'numerical_strategy': 'mean', 'categorical_strategy': '<NULL>'}

>>> CA ENCODER :{'strategy': 'label_encoding'}

>>> ESTIMATOR :{'strategy': 'LightGBM', 'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 0.8, 'importance_type': 'split', 'learning_rate': 0.05, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 500, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 0.9, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'nthread': -1, 'seed': 0}


MEAN SCORE : neg_mean_squared_error = -978096668.7284681
VARIANCE : 363401763.5154683 (fold 1 = -819178020.6825377, fold 2 = -1480817035.214201, fold 3 = -634294950.2886657)
CPU time:

31274.53705492697

In [11]:
california_data = pd.read_csv("../Data/California_house/train.csv")
X = rd.train_test_split(paths, target_name)
y = california_data.SalePrice
# Make prediction and save the results in save folder.
prd = Predictor()
prd = prd.fit_predict(best, X)
sqrt(sklearn.metrics.mean_squared_error(y, prd.SalePrice_predicted))



reading csv : train.csv ...
cleaning data ...
CPU time: 0.5330097675323486 seconds

reading csv : test.csv ...
cleaning data ...
CPU time: 0.5509545803070068 seconds

> Number of common features : 80

gathering and crunching for train and test datasets ...
reindexing for train and test datasets ...
dropping training duplicates ...
dropping constant variables on training set ...

> Number of categorical features: 43
> Number of numerical features: 37
> Number of training samples : 1460
> Number of test samples : 1459

> Top sparse features (% missing values on train set):
PoolQC         99.5
MiscFeature    96.3
Alley          93.8
Fence          80.8
FireplaceQu    47.3
dtype: float64

> Task : regression
count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

fitting the pipeline ...


If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
  positive)


CPU time: 35.263222217559814 seconds

> Feature importances dumped into directory : save

predicting...
CPU time: 0.9136080741882324 seconds

> Overview on predictions : 

   SalePrice_predicted
0        123846.655010
1        163911.554538
2        186024.245125
3        189191.083244
4        188858.664274
5        178767.105389
6        169939.227833
7        164569.258979
8        183016.955618
9        126221.026997

dumping predictions into directory : save ...


TypeError: Expected sequence or array-like, got <class 'mlbox.prediction.predictor.Predictor'>

In [13]:
sqrt(sklearn.metrics.mean_squared_error(data, prd["SalePrice_predicted"]))

TypeError: 'Predictor' object is not subscriptable

In [None]:
#Regression
import numpy as np
import pandas as pd
import sklearn.metrics
from math import sqrt
from mlbox.preprocessing import Reader
from mlbox.preprocessing import Drift_thresholder
from mlbox.optimisation import make_scorer
from mlbox.optimisation import Optimiser
from mlbox.prediction import Predictor

# Paths to the train set and the test set.
paths = ["../Data/avocado_prices/avocado.csv"]
# Name of the feature to predict.
# This columns should only be present in the train set.
target_name = "AveragePrice"

# Reading and cleaning all files
# Declare a reader for csv files
rd = Reader(sep=',')
# Return a dictionnary containing three entries
# dict["train"] contains training samples withtout target columns
# dict["test"] contains testing elements withtout target columns
# dict["target"] contains target columns for training samples.
data = rd.train_test_split(paths, target_name)

dft = Drift_thresholder()
data = dft.fit_transform(data)

# Tuning
mape = make_scorer(lambda y_true,
                   y_pred: 100*np.sum(
                                      np.abs(y_true-y_pred)/y_true
                                      )/len(y_true),
                   greater_is_better=False,
                   needs_proba=False)
# Declare an optimiser. You can declare your own score
# as presented here or use one in
# {"neg_mean_absolute_error", "neg_mean_squared_error", "neg_mean_squared_log_error", "neg_median_absolute_error","r2"}
opt = Optimiser(scoring="mean_squared_error", n_folds=3)
print("OUIIII" ,opt.evaluate(None, data))

# Space of hyperparameters
# The keys must respect the following syntax : "enc__param".
#   "enc" = "ne" for na encoder
#   "enc" = "ce" for categorical encoder
#   "enc" = "fs" for feature selector [OPTIONAL]
#   "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL]
#   "enc" = "est" for the final estimator
#   "param" : a correct associated parameter for each step.
#   Ex: "max_depth" for "enc"="est", ...
# The values must respect the syntax: {"search":strategy,"space":list}
#   "strategy" = "choice" or "uniform". Default = "choice"
#   list : a list of values to be tested if strategy="choice".
#   Else, list = [value_min, value_max].
# Available strategies for ne_numerical_strategy are either an integer, a float
#   or in {'mean', 'median', "most_frequent"}
# Available strategies for ce_strategy are:
#   {"label_encoding", "dummification", "random_projection", entity_embedding"}
space = {
        'ne__numerical_strategy': {"search": "choice",
                                   "space": [0]},
        'ce__strategy': {"search": "choice",
                         "space": ["label_encoding",
                                   "random_projection",
                                   "entity_embedding"]},
        'fs__threshold': {"search": "uniform",
                          "space": [0.01, 0.3]},
        'est__max_depth': {"search": "choice",
                           "space": [3, 4, 5, 6, 7]}

        }

# Optimises hyper-parameters of the whole Pipeline with a given scoring
# function. Algorithm used to optimize : Tree Parzen Estimator.
#
# IMPORTANT : Try to avoid dependent parameters and to set one feature
# selection strategy and one estimator strategy at a time.
best = opt.optimise(space, data, 15)
print(best)

In [None]:
best = Optimiser(scoring="mean_squared_error", n_folds=3)
print("OUIIII" ,opt.evaluate(None, data))

In [None]:
fil = "Data/lol/high_diamond_ranked_10min.csv"
csvfilename = open(fil, 'r').readlines()
#store header values
header = csvfilename[0] 
#remove header from list
csvfilename.pop(0) 
file = 1
#Number of lines to be written in new file
record_per_file = 8000

for j in range(len(csvfilename)):
    if j % record_per_file == 0:
        write_file = csvfilename[j:j+record_per_file]
                #adding header at the start of the write_file
        write_file.insert(0, header)
        #write in file
        open(str(fil)+ str(file) + '.csv', 'w+').writelines(write_file)
        file+=1