In [13]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

from sklearn.model_selection import GridSearchCV


In [16]:
diamonds = pd.read_csv('./data/diamonds.csv')

In [17]:
diamonds["size"] = diamonds["x"] * diamonds["y"] * diamonds["z"]

In [18]:
diamonds = diamonds.drop('x', axis=1).drop('y', axis=1).drop('z', axis=1)

In [19]:
diamonds['cut'] = diamonds['cut'].map({'Fair': 1, 
                                       'Good': 2, 
                                       'Very Good': 3, 
                                       'Premium': 4, 
                                       'Ideal': 5})

In [20]:
diamonds['color'] = diamonds['color'].map({'J': 1,
                                          'I': 2,
                                          'H': 3, 
                                          'G': 4,
                                          'F': 5, 
                                          'E': 6,
                                          'D': 7})

In [21]:
diamonds['continent'] = diamonds['city'].map({'Dubai': 'Asia',
                                                'Kimberly': 'Africa',
                                                'Las Vegas': 'America',
                                                'Tel Aviv': 'Asia',
                                                'Amsterdam': 'Europe',
                                                'Zurich': 'Europe',
                                                'Antwerp': 'Europe',
                                                'Madrid': 'Europe',
                                                'Paris': 'Europe',
                                                'Surat': 'Asia',
                                                'Luxembourg': 'Europe',
                                                'London': 'Europe',
                                                'New York City': 'America'})

In [22]:
diamonds = diamonds.drop('city', axis=1)

In [23]:
diamonds['continent'] = diamonds['continent'].map({'Africa': 1,
                                          'Europe': 2,
                                          'America': 3, 
                                          'Asia': 4})

In [24]:
diamonds['clarity'] = diamonds['clarity'].map({'I1': 1,
                                          'SI1': 2,
                                          'SI2': 3, 
                                          'VS1': 4,
                                          'VS2': 5, 
                                          'VVS1': 6,
                                          'VVS2': 7,
                                                'IF': 8})

In [25]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   price      40455 non-null  int64  
 1   carat      40455 non-null  float64
 2   depth      40455 non-null  float64
 3   table      40455 non-null  float64
 4   cut        40455 non-null  int64  
 5   color      40455 non-null  int64  
 6   clarity    40455 non-null  int64  
 7   size       40455 non-null  float64
 8   continent  40455 non-null  int64  
dtypes: float64(4), int64(5)
memory usage: 2.8 MB


In [26]:
X = diamonds.drop('price', axis=1)
y = diamonds['price']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
Invalid parameter 'base_estimator' for estimator BaggingRegressor(). Valid parameters are: ['bootstrap', 'bootstrap_features', 'estimator', 'max_features', 'max_samples', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].

In [30]:
param_grid = {
    'estimator': [DecisionTreeRegressor(), RandomForestRegressor()],
    'n_estimators': [100, 300, 500],
    'max_samples': [0.8, 1.0, 1.5],
    'max_features': [0.8, 1.0, 1.5],
    'bootstrap': [True, False],
    'bootstrap_features': [True, False],
    'oob_score': [True, False],
}


In [31]:
model = BaggingRegressor()

In [32]:
model.get_params()

{'bootstrap': True,
 'bootstrap_features': False,
 'estimator': None,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [33]:
grid_search = GridSearchCV(estimator=model, 
                           param_grid=param_grid, 
                           scoring='neg_mean_squared_error', 
                           cv=5,
                           verbose=3,
                           n_jobs=-1)


In [None]:
grid_search.fit(X_train,y_train)    

print('\n')
print('Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', -grid_search.best_score_, '\n')

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
