In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

# Description

A continuación, voy a aplicar el modelo RandomForest, SIN escalar. 
Sin embargo, vamos a aplicar cambios significativos en el dataset de diamonds original: 
- Calcular el tamaño como la multiplicación de las columnas x, y, z.
- Eliminamos posteriormente estas columnas x, y, z.
- Agrupación de la columna de colores donde J es el peor y D es el mejor, y label enconding. 
- La columna de cut, en lugar de hacer un one hot encoding, haremos un label enconding donde 0 sea Fair y 4 sea Ideal. 
- Elimino la columna de city.  
- Elimino la columna de depth.
- La columna de clarity aplico label enconding. 




In [2]:
diamonds = pd.read_csv('./data/diamonds.csv')

In [3]:
diamonds.head()

Unnamed: 0,price,carat,city,depth,table,x,y,z,cut,color,clarity
0,4268,1.21,Dubai,62.4,58.0,6.83,6.79,4.25,Premium,J,VS2
1,505,0.32,Kimberly,63.0,57.0,4.35,4.38,2.75,Very Good,H,VS2
2,2686,0.71,Las Vegas,65.5,55.0,5.62,5.53,3.65,Fair,G,VS1
3,738,0.41,Kimberly,63.8,56.0,4.68,4.72,3.0,Good,D,SI1
4,4882,1.02,Dubai,60.5,59.0,6.55,6.51,3.95,Ideal,G,SI1


In [4]:
diamonds["size"] = diamonds["x"] * diamonds["y"] * diamonds["z"]

In [5]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 12 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   city     40455 non-null  object 
 3   depth    40455 non-null  float64
 4   table    40455 non-null  float64
 5   x        40455 non-null  float64
 6   y        40455 non-null  float64
 7   z        40455 non-null  float64
 8   cut      40455 non-null  object 
 9   color    40455 non-null  object 
 10  clarity  40455 non-null  object 
 11  size     40455 non-null  float64
dtypes: float64(7), int64(1), object(4)
memory usage: 3.7+ MB


In [6]:
diamonds = diamonds.drop('x', axis=1).drop('y', axis=1).drop('z', axis=1)

In [7]:
diamonds['cut'].value_counts()

cut
Ideal        16220
Premium      10260
Very Good     9095
Good          3663
Fair          1217
Name: count, dtype: int64

In [8]:
diamonds['cut'] = diamonds['cut'].map({'Fair': 0, 
                                       'Good': 1, 
                                       'Very Good': 2, 
                                       'Premium': 3, 
                                       'Ideal': 4})

In [9]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   city     40455 non-null  object 
 3   depth    40455 non-null  float64
 4   table    40455 non-null  float64
 5   cut      40455 non-null  int64  
 6   color    40455 non-null  object 
 7   clarity  40455 non-null  object 
 8   size     40455 non-null  float64
dtypes: float64(4), int64(2), object(3)
memory usage: 2.8+ MB


In [14]:
diamonds['color'].value_counts()

color
3    8455
5    7325
4    7177
2    6277
6    5049
1    4032
0    2140
Name: count, dtype: int64

In [15]:
diamonds['color'] = diamonds['color'].map({'J': 0,
                                          'I': 1,
                                          'H': 2, 
                                          'G': 3,
                                          'F': 4, 
                                          'E': 5,
                                          'D': 6})


In [14]:
diamonds = diamonds.drop('city', axis=1)

In [16]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   depth    40455 non-null  float64
 3   table    40455 non-null  float64
 4   cut      40455 non-null  int64  
 5   color    0 non-null      float64
 6   clarity  40455 non-null  object 
 7   size     40455 non-null  float64
dtypes: float64(5), int64(2), object(1)
memory usage: 2.5+ MB


In [19]:
diamonds['clarity'].value_counts()

clarity
SI1     9749
VS2     9248
SI2     6929
VS1     6066
VVS2    3799
VVS1    2774
IF      1327
I1       563
Name: count, dtype: int64

In [20]:
diamonds['clarity'] = diamonds['clarity'].map({'I1': 0,
                                          'SI1': 1,
                                          'SI2': 2, 
                                          'VS2': 3,
                                          'VS1': 4, 
                                          'VVS2': 5,
                                          'VVS1': 6,
                                              'IF': 7})

In [21]:
diamonds['clarity'].value_counts()

clarity
1    9749
3    9248
2    6929
4    6066
5    3799
6    2774
7    1327
0     563
Name: count, dtype: int64

In [22]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   depth    40455 non-null  float64
 3   table    40455 non-null  float64
 4   cut      40455 non-null  int64  
 5   color    0 non-null      float64
 6   clarity  40455 non-null  int64  
 7   size     40455 non-null  float64
dtypes: float64(5), int64(3)
memory usage: 2.5 MB


## ⚠️⚠️ MODEL = RANDOMFOREST CON PARÁMETROS GRIDSEARCHCV A 300

In [23]:
#model = RandomForestRegressor(max_features=None, min_samples_split=10, n_estimators=300,
                      random_state=42)
model = LinearR()

In [24]:
hyperparameters = model.get_params()
print(hyperparameters)

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 300, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [26]:
X = diamonds.drop('price', axis=1)
y = diamonds['price']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
model.fit(X_train, y_train)

In [29]:
predict_RF_Grid = model.predict(X_test)

In [30]:
check_RF_Grid = pd.DataFrame({'Ground truth':y_test, 'Predictions':predict_RF_Grid, 'Diff':y_test-predict_RF_Grid})
check_RF_Grid

Unnamed: 0,Ground truth,Predictions,Diff
17775,2970,2680.932996,289.067004
13506,3004,2111.663789,892.336211
4325,838,816.894285,21.105715
37870,6468,5644.128975,823.871025
21321,633,609.142294,23.857706
...,...,...,...
3781,4764,4710.070660,53.929340
26959,756,592.144180,163.855820
15529,2690,2739.731652,-49.731652
36333,3992,4173.451707,-181.451707


In [31]:
# RMSE calculation

rmse_RF_Grid = mean_squared_error(y_test, predict_RF_Grid, squared=False)
rmse_RF_Grid



959.442743187279

# 🚫🚫 FUNCIONA MUCHO PEOR

# DESCRIPTION: nuevos cambios

A continuación, voy a aplicar el modelo RandomForest, SIN escalar. 
- NO calculamos size y mantenemos las columnas x, y, z. 
- Eliminamos la columna de city.
- Eliminamos la columna de depth. 
- La columna de clarity le aplico label enconding de forma totalmente contraria a la anterior. 
- Agrupación de la columna de colores donde J es el peor y D es el mejor, y label enconding. 
- La columna de cut, en lugar de hacer un one hot encoding, haremos un label enconding donde 0 sea Fair y 4 sea Ideal.

In [32]:
diamonds_2 = pd.read_csv('./data/diamonds.csv')

In [34]:
diamonds_2 = diamonds_2.drop('city', axis=1)

In [35]:
diamonds_2 = diamonds_2.drop('depth', axis=1)

In [36]:
diamonds_2['clarity'] = diamonds_2['clarity'].map({'I1': 7,
                                          'SI1': 6,
                                          'SI2': 5, 
                                          'VS2': 4,
                                          'VS1': 3, 
                                          'VVS2': 2,
                                          'VVS1': 1,
                                              'IF': 0})

In [37]:
diamonds_2['cut'] = diamonds_2['cut'].map({'Fair': 0, 
                                       'Good': 1, 
                                       'Very Good': 2, 
                                       'Premium': 3, 
                                       'Ideal': 4})

In [38]:
diamonds_2['color'] = diamonds_2['color'].map({'J': 0,
                                          'I': 1,
                                          'H': 2, 
                                          'G': 3,
                                          'F': 4, 
                                          'E': 5,
                                          'D': 6})

In [39]:
diamonds_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   price    40455 non-null  int64  
 1   carat    40455 non-null  float64
 2   table    40455 non-null  float64
 3   x        40455 non-null  float64
 4   y        40455 non-null  float64
 5   z        40455 non-null  float64
 6   cut      40455 non-null  int64  
 7   color    40455 non-null  int64  
 8   clarity  40455 non-null  int64  
dtypes: float64(5), int64(4)
memory usage: 2.8 MB


In [40]:
diamonds_2.describe()

Unnamed: 0,price,carat,table,x,y,z,cut,color,clarity
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,3928.444469,0.797706,57.446133,5.729392,5.732819,3.537154,2.904783,3.400766,4.020319
std,3992.416147,0.475544,2.233535,1.124453,1.14665,0.697062,1.117876,1.70126,1.711039
min,326.0,0.2,43.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,945.0,0.4,56.0,4.71,4.72,2.91,2.0,2.0,3.0
50%,2397.0,0.7,57.0,5.69,5.71,3.52,3.0,3.0,4.0
75%,5331.0,1.04,59.0,6.54,6.54,4.035,4.0,5.0,6.0
max,18823.0,4.5,95.0,10.23,58.9,8.06,4.0,6.0,7.0


In [41]:
X_2 = diamonds_2.drop('price', axis=1)
y_2 = diamonds_2['price']