In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor


from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import RobustScaler

In [2]:
diamonds = pd.read_csv('./data/diamonds.csv')

In [3]:
diamonds["size"] = diamonds["x"] * diamonds["y"] * diamonds["z"]

In [4]:
diamonds = diamonds.drop('x', axis=1).drop('y', axis=1).drop('z', axis=1)

In [5]:
diamonds['cut'] = diamonds['cut'].map({'Fair': 0, 
                                       'Good': 1, 
                                       'Very Good': 2, 
                                       'Premium': 3, 
                                       'Ideal': 4})

In [6]:
diamonds['color'] = diamonds['color'].map({'J': 0,
                                          'I': 1,
                                          'H': 2, 
                                          'G': 3,
                                          'F': 4, 
                                          'E': 5,
                                          'D': 6})

In [7]:
diamonds['continent'] = diamonds['city'].map({'Dubai': 'Asia',
                                                'Kimberly': 'Africa',
                                                'Las Vegas': 'America',
                                                'Tel Aviv': 'Asia',
                                                'Amsterdam': 'Europe',
                                                'Zurich': 'Europe',
                                                'Antwerp': 'Europe',
                                                'Madrid': 'Europe',
                                                'Paris': 'Europe',
                                                'Surat': 'Asia',
                                                'Luxembourg': 'Europe',
                                                'London': 'Europe',
                                                'New York City': 'America'})

In [8]:
diamonds = diamonds.drop('city', axis=1)

In [9]:
diamonds['clarity'] = diamonds['clarity'].replace(['IF'], ['VVSI']).replace(['VVS1'], ['VVSI']).replace(['VVS2'], ['VVSI'])
diamonds['clarity'] = diamonds['clarity'].replace(['VS1'], ['VSI']).replace(['VS2'], ['VSI'])
diamonds['clarity'] = diamonds['clarity'].replace(['SI1'], ['SI']).replace(['SI2'], ['SI'])

In [10]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   price      40455 non-null  int64  
 1   carat      40455 non-null  float64
 2   depth      40455 non-null  float64
 3   table      40455 non-null  float64
 4   cut        40455 non-null  int64  
 5   color      40455 non-null  int64  
 6   clarity    40455 non-null  object 
 7   size       40455 non-null  float64
 8   continent  40455 non-null  object 
dtypes: float64(4), int64(3), object(2)
memory usage: 2.8+ MB


In [11]:
diamonds_dummy = pd.get_dummies(diamonds, drop_first=True, dtype=float)

In [12]:
X = diamonds_dummy.drop('price', axis=1)
y = diamonds_dummy['price']

In [15]:
robust_scaler = RobustScaler()

## Cross-validation

In [16]:
model = RandomForestRegressor(random_state = 42)

In [17]:
scores = cross_val_score(model, 
                         X, 
                         y, 
                         scoring='neg_root_mean_squared_error', 
                         cv=5,
                         n_jobs=-1)

In [18]:
print(type(model), '\n')
print(scores, '\n')
print(np.mean(-scores), '\n')

<class 'sklearn.ensemble._forest.RandomForestRegressor'> 

[-675.9513868  -653.03835311 -671.58230241 -684.45653388 -651.2100234 ] 

667.2477199207655 



In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_rmse = RandomForestRegressor(random_state = 42)


model_rmse.fit(X_train, y_train)
y_pred = model_rmse.predict(X_test)

hyperparameters = model_rmse.get_params()

rmse = mean_squared_error(y_test, y_pred)**0.5

print(type(model_rmse), '\n')
print(rmse, '\n')

<class 'sklearn.ensemble._forest.RandomForestRegressor'> 

667.0051113006967 



## GridSearchCV()

In [20]:
param_grid = {'n_estimators': [100, 200, 300],  # Number of trees in the forest.
              'max_depth': [None, 3, 10],  # Maximum depth of the trees.
              'min_samples_split': [2, 10],  # Minimum number of samples required to split an internal node.
              'min_samples_leaf': [1, 4],  # Minimum number of samples required to be at a leaf node.
              'max_features': [None, 'sqrt', 'log2']  # Number of features to consider when looking for the best split.
              }

In [21]:
grid_search = GridSearchCV(model,
                           param_grid,
                           cv=5,
                           verbose=3,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1)

In [22]:
grid_search.fit(X,y)     # Creo que hay que hacerlo sobre X_train e y_train

print('\n')
print('Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', -grid_search.best_score_, '\n')

Fitting 5 folds for each of 108 candidates, totalling 540 fits


Best hyperparameters:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300} 

Best score:  664.8881203852768 



In [23]:
grid_search.fit(X_train, y_train)     # Creo que hay que hacerlo sobre X_train e y_train

print('\n')
print('Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', -grid_search.best_score_, '\n')

Fitting 5 folds for each of 108 candidates, totalling 540 fits


Best hyperparameters:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300} 

Best score:  678.082891992378 



In [24]:
print(grid_search.best_estimator_)

RandomForestRegressor(max_features=None, min_samples_split=10, n_estimators=300,
                      random_state=42)


In [25]:
grid_search.cv_results_

{'mean_fit_time': array([22.58884873, 45.57221031, 70.29710279, 16.9924808 , 34.21158757,
        51.05044374, 15.7138135 , 31.46161227, 47.2771028 , 15.35834384,
        30.68522353, 46.67408042,  8.99487228, 18.27811475, 27.56674685,
         5.69955745, 11.51235614, 17.70863905,  5.26867161, 11.89698925,
        17.04384856,  5.42352052, 10.43379421, 15.48490434,  9.31982188,
        18.45241122, 27.62583094,  5.69808407, 11.47160425, 17.310113  ,
         5.33924084, 10.76986604, 16.03149772,  5.16658182, 10.39827933,
        15.37518249,  3.79178534,  7.46744833, 11.10693307,  3.67452512,
         7.49551592, 11.82571502,  3.92557497,  7.51928248, 11.08240337,
         3.63784423,  7.36831903, 11.21454124,  1.297963  ,  2.69722881,
         4.06028295,  1.39476342,  2.664608  ,  4.14851384,  1.41722755,
         2.73753605,  4.21138873,  1.39276137,  2.71743522,  4.14589944,
         1.37238765,  2.82709103,  4.10452552,  1.31151814,  2.76302252,
         4.09095507,  1.33812399, 

## Second GridSearchCV

In [26]:
param_grid_2 = {'n_estimators': [100, 350, 500],  # Number of trees in the forest.
              'max_depth': [None, 3, 10],  # Maximum depth of the trees.
              'min_samples_split': [2, 10],  # Minimum number of samples required to split an internal node.
              'min_samples_leaf': [1, 4],  # Minimum number of samples required to be at a leaf node.
              'max_features': [None, 'sqrt', 'log2']  # Number of features to consider when looking for the best split.
              }

In [27]:
grid_search_2 = GridSearchCV(model,
                           param_grid_2,
                           cv=5,
                           verbose=3,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1)

In [28]:
grid_search_2.fit(X_train, y_train)     # Creo que hay que hacerlo sobre X_train e y_train

print('\n')
print('Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', -grid_search.best_score_, '\n')

Fitting 5 folds for each of 108 candidates, totalling 540 fits


Best hyperparameters:  {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300} 

Best score:  678.082891992378 



# GridSearchCV ExtraTreesRegressor

In [2]:
model = ExtraTreesRegressor

In [None]:
gsc = GridSearchCV(
    estimator = model, 
    param_grid={
        'n_estimator': range(50,120,150),
        'max_features': range(50,120,150), 
        'min_samples_leaf': range(20,50,5), 
        'min_samples_split': range(15,36,5),
    },
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=3)