In [3]:
import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer

from mlxtend.plotting import plot_decision_regions

if str(os.getcwdb()[-3:]).split("'")[1] != 'src':
    os.chdir(os.path.dirname(os.getcwdb()))

from utils.modeling import *
from utils.functions import *


In [4]:
df_diamonds = pd.read_csv(r'data\processed\diamonds_training.csv', index_col='id')
df_predict = pd.read_csv(r'data\processed\diamonds_testing.csv', index_col='id')


# Consideraciones
- Se localiza el tema en Kaggle: https://www.kaggle.com/datasets/shivam2503/diamonds

- Se empieza a trabajar con ese "dataset" (ver los archivos marcados como "UNUSED" y "no competition")

- Se detecta que existe una competición, si bien ya ha terminado: https://www.kaggle.com/competitions/diamonds-part-datamad0122/overview

- Se elige trabajar con los archivos de la competición, cuyas únicas diferencias es que hay un "train" y un "test", y que la variable "target" está escalada

- El "dataset" final es un listado de diamantes con sus características, y el objetivo es predecir el precio

- Se comparará lo obtenido con los resultados de la competición

# EDA
- Los pasos de esta primera parte se detallan de forma más pormenorizada, paso a paso, en el "notebook" titulado "EDA_diamonds"

- En ese "notebook" se hacen dos cosas:
1) Modificaciones esenciales; se liquidan duplicados, se cambia el nombre de las columnas y se pasan las categóricas a numéricas, tanto del "train" como del "test".

2) Modificaciones opcionales; se detectan y ponen a prueba las posibles modifiaciones que llevar a cabo con el "dataframe" de entrenamiento con tal de mejorar el resultado de los modelos. Los resortes de dichos cambios se guardan en forma de funciones (cuando son exclusivos de este proyecto) o clases (cuando es razonable guardarlos para análisis futuros), que se irán llamando a continuación según convenga.

# Modelaje: selección de cambios
- Se importan los "dataframes" con las modificaciones esenciales

- Se van intercalando modificiaciones opcionales y diversos modelos hasta dar con el mejor resultado

- Los modelos se prueban en este "notebook" para mayor comodidad, pero se ejecutan sin detallarse en "train.py", desde donde se guardan en la carpeta "model"

- Estas son las modificaciones que se van intercalando:

---------- Cambios opcionales (probados) ---------- 

1) Borrado de "outliers" extremadamente altos ("depth (percentage)", "table (percentage)", "width (millimeters)", "depth (millimeters)").

2) Borrado de filas que tienen 0 en todas las variables de tamaño ("lenght (millimeters)", "width (millimeters)" y "depth (millimeters)").

3) Borrado de los "outliers" compartidos moderadamente altos ("depth (percentage)" y "table (percentage)").

4) Asignación del valor con 0 restante en "lenght" al "width" correspondiente ("lenght (millimeters)").

5) Asignación del valor con 0 restante de "depth (millimeters)" a partir de una operación con el "lenght", el "width" y el "depth (percentage)" correspondientes ("depth (millimeters)").

6) Asignación del "outlier" restante del "lenght" al "width" correspondiente ("lenght (millimeters)").

7) Uso del logaritmo ("weight (carat)", "lenght (millimeters)", "width (millimeters)" y "depth (millimeters)").

8) Imputación al siguiente valor más alto ("weight (carat)").

9) Imputación a los valores máximos y mínimos del "boxplot" ("depth (percentage)" y "table (percentage)").

10) Neutralización de "outliers" con un modelo "ridge" ("depth (millimeters)").

11) Escalado "MinMax".

---------- Cambios apuntados (no probados) ----------

1) Sustitución de valores existentes por valores calculados ("depth (percentage)").

2) Descarte de las columnas con altísima correlación ("weight (carat)", "lenght (millimeters)", "width (millimeters)" y "depth (millimeters)").

3) Imputación de los valores máximos de "clarity quality" al que está un punto por debajo ("clarity quality").


## Ronda 1: sin cambios
- Para la primera fase, se prueban todos los modelos sin hacer ninguna modificación adicional

- En esta primera ronda están más detallados los usos de la clase "Regression", que hereda de "Model", para que sirva como ejemplo

- Como era de esperar, los resultados no son demasiado buenos, pero ganan los modelos "de árboles", ya que no se ven afectados por los valores atípicos

- Dado que en la competición se valora la "rsme", esa es la métrica que más se tendrá en cuenta. Puede verse el podio aquí: https://www.kaggle.com/competitions/diamonds-part-datamad0122/leaderboard

In [5]:
# Lo primero es decirle a la clase con qué modelos se va a trabajar a lo largo de todo el proceso
Regression.add_models(['LinearRegression',
                        'Ridge',
                        'DecisionTreeRegressor',
                        'KNeighborsRegressor',
                        'RandomForestRegressor',
                        'SVR',
                        'XGBRegressor'
                        ]
                        )


In [4]:
# Se crea la instancia de la clase "Regression" con la columna "price" como "target"
round_1 = Regression(df_diamonds, 'price')


In [5]:
# Se separa el "dataframe" con los parámetros por defecto. Se guardan las porciones por si acaso
X_train, X_test, y_train, y_test = round_1.split_dataframe()


In [6]:
# Se solicitan 10 "folds", del cual se usará el mejor para comparar los modelos y ver cuál llega más lejos
# Se coge el mejor, y no la media de las métricas, porque de 10 cortes es probable que salgan números de "cortes malos" dispares para los diferentes modelos
# Si fuera el caso, la comparación con las medias seria injusta
# En cambio, con el mejor, es muy probable que de 10 cortes al menos uno de ellos saque el máximo partido a cada modelo
# Así, se comparan en mayor igualdad de condiciones
# Como la "target" es de regresión, la instancia seleccionará automáticamente "KFold" en lugar de "StratifiedFold"
# Se establece un "random_state" para los modelos que lo requieren, que siempre será el mismo
round_1_dict = round_1.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                                    ['RandomForestRegressor', 'random_state=43'],
                                                    ['XGBRegressor', 'random_state=43']
                                                ],
                                    kfolds_num=10
                                    )


-- Regression: using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.88 sec(s). Total time: 0.88
Starting Ridge:
- Ridge done in 0.41 sec(s). Total time: 1.29
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 15.74 sec(s). Total time: 17.03
Starting SVR:
- SVR done in 351.86 sec(s). Total time: 368.89
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.64 sec(s). Total time: 371.53
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 229.65 sec(s). Total time: 601.19
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 31.02 sec(s). Total time: 632.21


In [7]:
# Los resultados, así como el modelo entrenado, pueden visualizarse en un diccionario
round_1_dict


In [8]:
# Acto seguido, se miran las métricas
round_1_metrics = round_1.evaluate_metrics()

round_1_metrics


{'LinearRegression': {'test': array([8.069, 9.093, 8.297, ..., 9.234, 8.818, 8.368]),
  'prediction': array([8.1751789 , 8.94534424, 7.94609508, ..., 9.13303572, 8.78005708,
         8.22465851]),
  'model': LinearRegression(),
  'metrics': {'rmse': 0.222037823248969,
   'mse': 0.0493007949531404,
   'mae': 0.12289980638635319,
   'r2_score': 0.9529090797126748,
   'mape': 0.01587291534328809}},
 'Ridge': {'test': array([8.069, 9.093, 8.297, ..., 9.234, 8.818, 8.368]),
  'prediction': array([8.17485364, 8.94499768, 7.94593798, ..., 9.13327901, 8.78003269,
         8.22426296]),
  'model': Ridge(),
  'metrics': {'rmse': 0.2219231703319744,
   'mse': 0.04924989353019452,
   'mae': 0.1229616970889257,
   'r2_score': 0.9529576995138916,
   'mape': 0.015880870501013333}},
 'KNeighborsRegressor': {'test': array([8.069, 9.093, 8.297, ..., 9.234, 8.818, 8.368]),
  'prediction': array([8.2228, 9.072 , 8.0682, ..., 9.09  , 8.7678, 8.706 ]),
  'model': KNeighborsRegressor(),
  'metrics': {'rmse':

In [9]:
# Para una mejor visualización, se ponen en un "dataframe"
# Las predicciones no son muy buenas, si bien el r2_score es alto para todos los casos
round_1.create_dataframe()


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.222038,0.221923,0.183865,0.209064,0.130142,0.098101,0.094787,XGBRegressor: random_state=43,LinearRegression
mse,0.049301,0.04925,0.033806,0.043708,0.016937,0.009624,0.008985,XGBRegressor: random_state=43,LinearRegression
mae,0.1229,0.122962,0.135156,0.127477,0.088698,0.067123,0.066666,XGBRegressor: random_state=43,KNeighborsRegressor
r2_score,0.952909,0.952958,0.967709,0.958252,0.983822,0.990808,0.991418,XGBRegressor: random_state=43,SVR
mape,0.015873,0.015881,0.017806,0.016523,0.011417,0.008684,0.008602,XGBRegressor: random_state=43,KNeighborsRegressor


## Ronda 2: escalado
- Se repite la ronda 1, pero esta vez se escalan las variables

- Exceptuando "Ridge", el escalado "Standard" mejora más que "MinMax" los resultados de los modelos "no de árboles", que solo empeoran. La regresión lineal no se ve afectada en ningún caso

In [10]:
# Se pone a prueba con el mismo proceso que en la ronda anterior, pero esta vez se aplica un escalado MinMax
# Hay una ligera mejora en los modelos "no de árboles"
df_diamonds_2 = df_diamonds.copy()

round_2 = Regression(df_diamonds_2, 'price')
round_2.split_dataframe(scaler='MinMaxScaler')
round_2.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_2.evaluate_metrics()
round_2.create_dataframe()


-- Regression (MinMaxScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.24 sec(s). Total time: 0.24
Starting Ridge:
- Ridge done in 0.17 sec(s). Total time: 0.41
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 3.38 sec(s). Total time: 3.8
Starting SVR:
- SVR done in 285.42 sec(s). Total time: 289.22
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.7 sec(s). Total time: 291.92
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 245.17 sec(s). Total time: 537.09
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 10.62 sec(s). Total time: 547.7


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.222038,0.217754,0.165586,0.136532,0.130527,0.098128,0.094788,XGBRegressor: random_state=43,LinearRegression
mse,0.049301,0.047417,0.027419,0.018641,0.017037,0.009629,0.008985,XGBRegressor: random_state=43,LinearRegression
mae,0.1229,0.127961,0.115148,0.093104,0.088925,0.067186,0.066664,XGBRegressor: random_state=43,Ridge
r2_score,0.952909,0.954709,0.97381,0.982195,0.983726,0.990803,0.991418,XGBRegressor: random_state=43,LinearRegression
mape,0.015873,0.016526,0.015207,0.012264,0.011447,0.008692,0.008602,XGBRegressor: random_state=43,Ridge


In [11]:
# Comprobamos si la situación mejora con "StandardScaler". En efecto, es así (salvo para Ridge, que empeora)
df_diamonds_2b = df_diamonds.copy()

round_2b = Regression(df_diamonds_2b, 'price')
round_2b.split_dataframe(scaler='StandardScaler')
round_2b.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_2b.evaluate_metrics()
round_2b.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.29 sec(s). Total time: 0.29
Starting Ridge:
- Ridge done in 0.14 sec(s). Total time: 0.43
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 4.56 sec(s). Total time: 4.99
Starting SVR:
- SVR done in 314.16 sec(s). Total time: 319.15
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.76 sec(s). Total time: 321.91
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 235.85 sec(s). Total time: 557.76
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 10.85 sec(s). Total time: 568.6


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.222038,0.221959,0.14442,0.110864,0.130774,0.098193,0.094777,XGBRegressor: random_state=43,LinearRegression
mse,0.049301,0.049266,0.020857,0.012291,0.017102,0.009642,0.008983,XGBRegressor: random_state=43,LinearRegression
mae,0.1229,0.122939,0.10546,0.083091,0.089279,0.067219,0.066659,XGBRegressor: random_state=43,Ridge
r2_score,0.952909,0.952942,0.980078,0.98826,0.983665,0.99079,0.99142,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.015873,0.015878,0.013921,0.010905,0.011496,0.008697,0.008601,XGBRegressor: random_state=43,Ridge


## Ronda 3: borrado (+ escalado)
- Se borran todos los "outliers" extremadamente altos ("depth (percentage)", "table (percentage)", "width (millimeters)", "depth (millimeters)")

- Se borran las filas que tienen el mismo cero en estas tres columnas: "lenght (millimeters)", "width (millimeters)" y "depth (millimeters)"

- Se borran los "outliers" moderadamente altos compartidos de "depth (percentage)" y "table (percentage)"

- El error mejora en todos los modelos en distintas medidas

In [12]:
# Se aplican los borrados, esta vez con una función a medida ya que son cambios exclusivos de este proyecto
df_diamonds_3 = df_diamonds.copy()

df_diamonds_3 = remove_all(df_diamonds_3)

print(f'Deleted rows: {len(df_diamonds) - len(df_diamonds_3)}')


Deleted rows: 20


In [13]:
# Se escala y se prueban los modelos
round_3 = Regression(df_diamonds_3, 'price')
round_3.split_dataframe(scaler='StandardScaler')
round_3.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_3.evaluate_metrics()
round_3.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.28 sec(s). Total time: 0.28
Starting Ridge:
- Ridge done in 0.14 sec(s). Total time: 0.42
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 4.73 sec(s). Total time: 5.15
Starting SVR:
- SVR done in 339.47 sec(s). Total time: 344.62
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 3.08 sec(s). Total time: 347.7
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 271.57 sec(s). Total time: 619.26
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 17.66 sec(s). Total time: 636.92


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.145078,0.145065,0.139798,0.105232,0.126619,0.092584,0.090504,XGBRegressor: random_state=43,LinearRegression
mse,0.021048,0.021044,0.019543,0.011074,0.016032,0.008572,0.008191,XGBRegressor: random_state=43,LinearRegression
mae,0.112841,0.112834,0.105272,0.081063,0.088104,0.066677,0.065687,XGBRegressor: random_state=43,LinearRegression
r2_score,0.979612,0.979616,0.98107,0.989273,0.984471,0.991697,0.992066,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.014806,0.014805,0.013886,0.010641,0.011362,0.008625,0.008474,XGBRegressor: random_state=43,LinearRegression


## Ronda 4: asignación (+ borrado y escalado)
- Los cambios de esta ronda se aplican a partir de dos hechos probados durante el EDA: 

1) El "lenght" y el "width" son generalmente casi idénticos, ya que los diamantes son semicirculares.

2) El "depth (percentage)" se obtiene (según el autor del "dataset") de dividir "depth (millimeters)" por la media de "lenght" y "width".

- Se asigna el 0 restante de "lenght" al "width" correspondiente ("lenght (millimeters)")

- Se asigna el 0 restante de "depth (millimeters)" a partir de la operación mencionada

- Se asigna el "outlier" restante del "lenght" al "width" correspondiente ("lenght (millimeters)")

- Mejoran todos menos "DecisionTree" en pequeña medida 

In [14]:
# Se aplican el borrado y la asignación
df_diamonds_4 = df_diamonds.copy()

df_diamonds_4 = remove_all(df_diamonds_4)
df_diamonds_4 = assign_values(df_diamonds_4)


In [15]:
# Se escala y se prueban los modelos
round_4 = Regression(df_diamonds_4, 'price')
round_4.split_dataframe(scaler='StandardScaler')
round_4.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_4.evaluate_metrics()
round_4.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.24 sec(s). Total time: 0.25
Starting Ridge:
- Ridge done in 0.17 sec(s). Total time: 0.41
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 6.01 sec(s). Total time: 6.42
Starting SVR:
- SVR done in 326.11 sec(s). Total time: 332.53
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 3.22 sec(s). Total time: 335.74
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 279.61 sec(s). Total time: 615.36
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 27.84 sec(s). Total time: 643.2


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.144352,0.144347,0.139518,0.105131,0.126641,0.092534,0.09037,XGBRegressor: random_state=43,LinearRegression
mse,0.020838,0.020836,0.019465,0.011053,0.016038,0.008563,0.008167,XGBRegressor: random_state=43,LinearRegression
mae,0.112161,0.112159,0.105166,0.081034,0.088464,0.066706,0.065605,XGBRegressor: random_state=43,LinearRegression
r2_score,0.979816,0.979817,0.981145,0.989294,0.984465,0.991706,0.992089,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.014685,0.014685,0.013874,0.010637,0.011401,0.008629,0.008473,XGBRegressor: random_state=43,LinearRegression


## Ronda 5: logaritmo (+ asignación, borrado y escalado)
- Se aplica el logaritmo a las columnas "weight (carat)", "lenght (millimeters)", "width (millimeters)" y "depth (millimeters)"

- Mejoran "KNeighbors",  "SVR" y "DecisionTree" (este último, muy poco). "XGBRegressor" se queda igual. El resto, empeoran

In [16]:
# Se hacen los retoques
df_diamonds_5 = df_diamonds.copy()

df_diamonds_5 = remove_all(df_diamonds_5)
df_diamonds_5 = assign_values(df_diamonds_5)

df_diamonds_5[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']] = np.log(df_diamonds_5[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']])


In [17]:
# Se escala y se prueban los modelos
round_5 = Regression(df_diamonds_5, 'price')
round_5.split_dataframe(scaler='StandardScaler')
round_5.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_5.evaluate_metrics()
round_5.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.25 sec(s). Total time: 0.25
Starting Ridge:
- Ridge done in 0.18 sec(s). Total time: 0.43
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 6.49 sec(s). Total time: 6.91
Starting SVR:
- SVR done in 363.74 sec(s). Total time: 370.66
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.61 sec(s). Total time: 373.27
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 232.83 sec(s). Total time: 606.1
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 13.75 sec(s). Total time: 619.85


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.144715,0.144684,0.137727,0.103949,0.126594,0.092607,0.09037,XGBRegressor: random_state=43,LinearRegression
mse,0.020942,0.020933,0.018969,0.010805,0.016026,0.008576,0.008167,XGBRegressor: random_state=43,LinearRegression
mae,0.112273,0.112272,0.104003,0.079632,0.088505,0.066736,0.065606,XGBRegressor: random_state=43,LinearRegression
r2_score,0.979715,0.979723,0.981626,0.989533,0.984477,0.991693,0.992089,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.014703,0.014703,0.013615,0.010411,0.011404,0.008633,0.008473,XGBRegressor: random_state=43,Ridge


## Ronda 6: imputaciones "boxplot" y valor más alto (+ logaritmo, asignación, borrado y escalado)
- Se imputan al siguiente valor más alto de "weight", y al máximo y al mínimo "depth (percentage)" y table

- Se prueban por separado

- Todos menos los "de árboles" mejoran con la imputación al "boxplot", pero ninguno con la del valor máximo, con lo que se obviará completamente


In [43]:
# Se hacen los retoques pertinentes
df_diamonds_6a = df_diamonds.copy()

df_diamonds_6a = remove_all(df_diamonds_6a)
df_diamonds_6a = assign_values(df_diamonds_6a)

df_diamonds_6a[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']] = np.log(df_diamonds_6a[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']])

df_diamonds_6b = df_diamonds_6a.copy()

df_diamonds_6a = impute_next_higher(df_diamonds_6a)


In [44]:
# Se lleva a cabo la prueba
round_6a = Regression(df_diamonds_6a, 'price')
round_6a.split_dataframe(scaler='StandardScaler')
round_6a.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_6a.evaluate_metrics()
round_6a.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.39 sec(s). Total time: 0.39
Starting Ridge:
- Ridge done in 0.17 sec(s). Total time: 0.56
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 5.65 sec(s). Total time: 6.21
Starting SVR:
- SVR done in 292.21 sec(s). Total time: 298.43
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.66 sec(s). Total time: 301.09
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 227.26 sec(s). Total time: 528.35
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 10.4 sec(s). Total time: 538.74


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.144734,0.144699,0.137727,0.103957,0.126678,0.092642,0.09037,XGBRegressor: random_state=43,LinearRegression
mse,0.020948,0.020938,0.018969,0.010807,0.016047,0.008583,0.008167,XGBRegressor: random_state=43,LinearRegression
mae,0.112266,0.112263,0.104003,0.079636,0.088566,0.066756,0.065606,XGBRegressor: random_state=43,LinearRegression
r2_score,0.979709,0.979719,0.981626,0.989532,0.984456,0.991687,0.992089,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.014703,0.014702,0.013615,0.010412,0.01141,0.008636,0.008473,XGBRegressor: random_state=43,LinearRegression


In [45]:
df_diamonds_6b = impute_boxplot_min_max(df_diamonds_6b, ['depth (percentage)', 'table (percentage)'])


In [46]:
# Se lleva a cabo la prueba
round_6b = Regression(df_diamonds_6b, 'price')
round_6b.split_dataframe(scaler='StandardScaler')
round_6b.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_6b.evaluate_metrics()
round_6b.create_dataframe()

-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.25 sec(s). Total time: 0.25
Starting Ridge:
- Ridge done in 0.14 sec(s). Total time: 0.39
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 4.47 sec(s). Total time: 4.86
Starting SVR:
- SVR done in 284.1 sec(s). Total time: 288.95
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.73 sec(s). Total time: 291.69
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 223.98 sec(s). Total time: 515.66
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 10.68 sec(s). Total time: 526.34


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.144535,0.144513,0.136072,0.103508,0.127979,0.092796,0.090042,XGBRegressor: random_state=43,LinearRegression
mse,0.02089,0.020884,0.018516,0.010714,0.016379,0.008611,0.008108,XGBRegressor: random_state=43,LinearRegression
mae,0.112202,0.112207,0.103214,0.079396,0.088895,0.066812,0.065601,XGBRegressor: random_state=43,Ridge
r2_score,0.979765,0.979771,0.982065,0.989622,0.984135,0.991659,0.992147,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.014692,0.014693,0.013516,0.010383,0.01146,0.008642,0.008463,XGBRegressor: random_state=43,Ridge


## Ronda 7: imputaciones "ridge" (+ imputaciones "boxplot", logaritmo, asignación, borrado y escalado)
- Se imputan los "outliers" restantes de "depth (millimeters)" aplicando un modelo "Ridge" a "weight (carat)", "lenght (millimeters)" y "width (millimeters)", con las que tiene una altísima correlación

- Solo mejora "DecisionTree"

In [47]:
# Se hacen los cambios
df_diamonds_7 = df_diamonds.copy()

df_diamonds_7 = remove_all(df_diamonds_7)
df_diamonds_7 = assign_values(df_diamonds_7)

df_diamonds_7[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']] = np.log(df_diamonds_7[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']])

df_diamonds_7 = impute_boxplot_min_max(df_diamonds_7, ['depth (percentage)', 'table (percentage)'])

df_diamonds_7 = apply_ridge(df_diamonds_7)


In [48]:
# Se prueban los modelos
round_7 = Regression(df_diamonds_7, 'price')
round_7.split_dataframe(scaler='StandardScaler')
round_7.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_7.evaluate_metrics()
round_7.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.33 sec(s). Total time: 0.33
Starting Ridge:
- Ridge done in 0.16 sec(s). Total time: 0.49
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 5.21 sec(s). Total time: 5.7
Starting SVR:
- SVR done in 311.89 sec(s). Total time: 317.59
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.6 sec(s). Total time: 320.18
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 219.71 sec(s). Total time: 539.89
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 11.32 sec(s). Total time: 551.21


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.144535,0.144514,0.136075,0.103527,0.126511,0.092819,0.09089,XGBRegressor: random_state=43,LinearRegression
mse,0.02089,0.020884,0.018516,0.010718,0.016005,0.008615,0.008261,XGBRegressor: random_state=43,LinearRegression
mae,0.112203,0.112209,0.103214,0.0794,0.088143,0.066837,0.065868,XGBRegressor: random_state=43,Ridge
r2_score,0.979765,0.979771,0.982064,0.989618,0.984497,0.991655,0.991998,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.014692,0.014693,0.013516,0.010383,0.011364,0.008646,0.008505,XGBRegressor: random_state=43,Ridge


## Ronda 8: sustitución (+ borrado parcial y asignación)
- Se sustituyen los valores de "depth (percentage)" por los resultados reales que da el cálculo a partir de las columnas correspondientes

- Como se ha visto en EDA, el resultado serán muchos nuevos "outliers". Por tanto, se aplicará el cambio en el "dataframe" original sin otras modificaciones para comprobar si supone una mejora

- La única modificación adicional indispensable es el tratamiento de los 0 en esas columnas

- Para comprobar si realmente hay mejora, se aplican los modelos dos veces: una solo con las modificaciones de borrado parcial y asignación, y otra con la sustitución

- Mejoran "SVR" y "RandomForest"

In [22]:
# Se aplican los cambios
df_diamonds_8a = df_diamonds.copy()

df_diamonds_8a = assign_values(df_diamonds_8a)

df_diamonds_8a = remove_all(df_diamonds_8a, zeros_only=True)

df_diamonds_8b = df_diamonds_8a.copy()

df_diamonds_8b['depth (percentage)'] = (df_diamonds_8b['depth (millimeters)'] / ((df_diamonds_8b['lenght (millimeters)']+df_diamonds_8b['width (millimeters)']) / 2)) * 100


In [23]:
# Se ponen a prueba solo el borrado parcial y la asignación
round_8a = Regression(df_diamonds_8a, 'price')
round_8a.split_dataframe()
round_8a.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_8a.evaluate_metrics()
round_8a.create_dataframe()


-- Regression: using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.31 sec(s). Total time: 0.31
Starting Ridge:
- Ridge done in 0.17 sec(s). Total time: 0.48
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 6.99 sec(s). Total time: 7.47
Starting SVR:
- SVR done in 408.32 sec(s). Total time: 415.79
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 3.09 sec(s). Total time: 418.87
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 287.86 sec(s). Total time: 706.73
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 14.35 sec(s). Total time: 721.08


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.154359,0.154318,0.178104,0.161348,0.12791,0.092733,0.09112,XGBRegressor: random_state=43,KNeighborsRegressor
mse,0.023827,0.023814,0.031721,0.026033,0.016361,0.008599,0.008303,XGBRegressor: random_state=43,KNeighborsRegressor
mae,0.115958,0.11595,0.133578,0.12324,0.088054,0.066104,0.065954,XGBRegressor: random_state=43,KNeighborsRegressor
r2_score,0.97703,0.977042,0.96942,0.974903,0.984227,0.99171,0.991996,XGBRegressor: random_state=43,SVR
mape,0.015024,0.015023,0.017619,0.016011,0.011365,0.00856,0.008513,XGBRegressor: random_state=43,KNeighborsRegressor


In [24]:
# Se pone a prueba la sustitución
round_8b = Regression(df_diamonds_8b, 'price')
round_8b.split_dataframe()
round_8b.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_8b.evaluate_metrics()
round_8b.create_dataframe()


-- Regression: using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.39 sec(s). Total time: 0.39
Starting Ridge:
- Ridge done in 0.19 sec(s). Total time: 0.58
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 6.19 sec(s). Total time: 6.78
Starting SVR:
- SVR done in 465.98 sec(s). Total time: 472.75
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 3.64 sec(s). Total time: 476.4
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 327.77 sec(s). Total time: 804.17
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 18.2 sec(s). Total time: 822.36


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.155832,0.156025,0.178175,0.160862,0.128324,0.092704,0.091325,XGBRegressor: random_state=43,KNeighborsRegressor
mse,0.024284,0.024344,0.031746,0.025877,0.016467,0.008594,0.00834,XGBRegressor: random_state=43,KNeighborsRegressor
mae,0.1198,0.119931,0.133753,0.123338,0.088686,0.066106,0.066187,RandomForestRegressor: random_state=43,KNeighborsRegressor
r2_score,0.97659,0.976532,0.969395,0.975054,0.984125,0.991715,0.99196,XGBRegressor: random_state=43,SVR
mape,0.015685,0.015691,0.017646,0.016023,0.011451,0.008564,0.008551,XGBRegressor: random_state=43,KNeighborsRegressor


## Ronda 9: descarte
- Se quitan directamente las columnas con altísima correlación, cercana a 1 ("weight", "lenght", "width" y "depth (millimeters)")

- Se utiliza el "dataframe" sin cambios ni escalado para el contraste

- Todos empeoran respecto a la ronda 1

- Con el descarte, en su lugar, de las de correlación poco relevante, mejoran "Kneighbors" y "SVR"

In [16]:
# Se hace la prueba
df_diamonds_9a = df_diamonds.copy()

df_diamonds_9a = df_diamonds_9a.drop(columns=['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)'])

round_9a = Regression(df_diamonds_9a, 'price')
round_9a.split_dataframe()
round_9a.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_9a.evaluate_metrics()
round_9a.create_dataframe()


-- Regression: using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.29 sec(s). Total time: 0.29
Starting Ridge:
- Ridge done in 0.12 sec(s). Total time: 0.41
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 2.37 sec(s). Total time: 2.78
Starting SVR:
- SVR done in 633.18 sec(s). Total time: 635.96
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 1.54 sec(s). Total time: 637.5
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 102.29 sec(s). Total time: 739.78
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 8.89 sec(s). Total time: 748.67


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.979343,0.979343,1.027563,0.986855,1.187047,1.025191,0.957208,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mse,0.959112,0.959112,1.055885,0.973882,1.40908,1.051016,0.916246,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mae,0.814501,0.814502,0.827875,0.80344,0.911736,0.816076,0.787496,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
r2_score,0.083879,0.08388,-0.008556,0.069771,-0.345919,-0.003905,0.124824,XGBRegressor: random_state=43,RandomForestRegressor: random_state=43
mape,0.107044,0.107044,0.108093,0.105367,0.1189,0.106677,0.103121,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43


In [17]:
# Para sacar más partido a esta ronda, se prueba también si se eliminan las columnas con una correlación próxima a 0
df_diamonds_9b = df_diamonds.copy()

df_diamonds_9b = df_diamonds_9b.drop(columns=['cut quality', 'depth (percentage)'])

round_9b = Regression(df_diamonds_9b, 'price')
round_9b.split_dataframe()
round_9b.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_9b.evaluate_metrics()
round_9b.create_dataframe()


-- Regression: using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.28 sec(s). Total time: 0.28
Starting Ridge:
- Ridge done in 0.16 sec(s). Total time: 0.45
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 2.58 sec(s). Total time: 3.03
Starting SVR:
- SVR done in 339.54 sec(s). Total time: 342.57
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.35 sec(s). Total time: 344.92
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 192.48 sec(s). Total time: 537.4
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 9.3 sec(s). Total time: 546.7


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.225223,0.225136,0.145655,0.207183,0.133478,0.099942,0.095847,XGBRegressor: random_state=43,LinearRegression
mse,0.050725,0.050686,0.021215,0.042925,0.017816,0.009988,0.009187,XGBRegressor: random_state=43,LinearRegression
mae,0.131828,0.131875,0.102912,0.119493,0.091243,0.068562,0.067723,XGBRegressor: random_state=43,Ridge
r2_score,0.951548,0.951586,0.979736,0.958999,0.982982,0.990459,0.991225,XGBRegressor: random_state=43,SVR
mape,0.017005,0.017011,0.01358,0.015578,0.011747,0.008862,0.00874,XGBRegressor: random_state=43,Ridge


## Ronda 10: imputaciones "clarity quality"
- Se imputan los valores de 7 "clarity quality" al 6. Esto se hace porque se ha visto en el EDA que las variables relacionadas con el tamaño ('weight (carat)', 'lenght (millimeters)', 'width (millimeters)' y 'depth (millimeters)') dejan de disminuir a partir del 6

- Se utiliza el "dataframe" sin cambios ni escalado para el contraste

- Mejoran "LinearRegression", "Ridge" y "KNeighbors" comparados con la ronda 1

In [27]:
# Se comprueba
df_diamonds_10 = df_diamonds.copy()

df_diamonds_10.loc[df_diamonds_10['clarity quality'] == 7, 'clarity quality'] = 6

round_10 = Regression(df_diamonds_10, 'price')
round_10.split_dataframe()
round_10.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_10.evaluate_metrics()
round_10.create_dataframe()


-- Regression: using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.4 sec(s). Total time: 0.4
Starting Ridge:
- Ridge done in 0.17 sec(s). Total time: 0.57
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 8.32 sec(s). Total time: 8.89
Starting SVR:
- SVR done in 529.65 sec(s). Total time: 538.54
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 3.34 sec(s). Total time: 541.88
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 382.31 sec(s). Total time: 924.18
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 50.09 sec(s). Total time: 974.28


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.221734,0.221619,0.183265,0.209357,0.130545,0.099294,0.094895,XGBRegressor: random_state=43,LinearRegression
mse,0.049166,0.049115,0.033586,0.04383,0.017042,0.009859,0.009005,XGBRegressor: random_state=43,LinearRegression
mae,0.122548,0.122607,0.135284,0.127694,0.08999,0.068342,0.067275,XGBRegressor: random_state=43,KNeighborsRegressor
r2_score,0.953038,0.953086,0.967919,0.958134,0.983722,0.990583,0.991399,XGBRegressor: random_state=43,SVR
mape,0.015838,0.015845,0.017839,0.016556,0.011599,0.008856,0.008688,XGBRegressor: random_state=43,KNeighborsRegressor


# Modelaje: selección de modelos

- Cada uno de los modelos mejora con los siguientes cambios:

---------- LinearRegression (mejor: 0.144352, ronda 4) ----------

 · Borrado

 · Asignación

 · Imputaciones "boxplot"
 
 · Imputaciones "clarity quality"

---------- Ridge (mejor: 0.144347, ronda 4) ----------

· Escalado "MinMax"

· Borrado

· Asignación

· Imputaciones "boxplot"

· Imputaciones "clarity quality"

---------- KNeighborsRegressor (mejor: 0.136072, ronda 6) ----------

· Escalado "Standard"

· Borrado

· Asignación

· Logaritmo

· Imputaciones "boxplot"

· Descarte correlación ínfima

· Imputaciones "clarity quality"

---------- SVR (mejor: 0.103508, ronda 6) ----------

· Escalado "Standard"

· Borrado

· Asignación

· Logaritmo

· Imputaciones "boxplot"

· Sustitución

· Descarte correlación ínfima

---------- DecisionTree (mejor: 0.126511, ronda 7) ----------

· Borrado

· Logaritmo

· Imputaciones "ridge"

---------- RandomForest (mejor: 0.092534, ronda 4) ----------

· Borrado

· Asignación

· Sustitución

---------- XGBRegressor (mejor: 0.090042, ronda 6) ----------

· Esclado "Standard"

· Borrado

· Asignación

· Imputaciones "boxplot"


## "Ridge" y "LinearRegression"

- Cambios: escalado "MinMax", borrado, asignación, imputaciones "boxplot" e imputaciones "clarity quality"

- Como mejoran con los mismos cambios y "LinearRegression" no se ve afectada por los escalados, se prueban juntos

- Los mejores resultados para estos modelos no bajan de 0.14

- Con cambios e hiperparámetros, "LinearRegression" ha mejorado su "rmse" de 0.222038 a 0.143639

- Con cambios e hiperparámetros, "Ridge" ha mejorado su "rmse" de 0.221923 a 0.144060


In [6]:
# Se crea el "scorer"
rmse = make_scorer(calculate_rmse, greater_is_better=False)


In [25]:
# Se aplican todos los cambios del listado en común
df_ridge_linear = df_diamonds.copy()

df_ridge_linear = remove_all(df_ridge_linear)

df_ridge_linear = assign_values(df_ridge_linear)

df_ridge_linear = impute_boxplot_min_max(df_ridge_linear, ['depth (percentage)', 'table (percentage)'])

df_ridge_linear.loc[df_ridge_linear['clarity quality'] == 7, 'clarity quality'] = 6


In [26]:
# Se prueban en un "dataframe" conjunto
ridge_linear = Regression(df_ridge_linear, 'price')
X_train, X_test, y_train, y_test = ridge_linear.split_dataframe(scaler='MinMaxScaler')
ridge_linear.apply_models(selected_list=['LinearRegression', 'Ridge'],
                        kfolds_num=10
                    )
ridge_linear.evaluate_metrics()
ridge_linear.create_dataframe()


-- Regression (MinMaxScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.34 sec(s). Total time: 0.34
Starting Ridge:
- Ridge done in 0.18 sec(s). Total time: 0.52


Unnamed: 0,LinearRegression,Ridge,BEST,WORST
rmse,0.143636,0.14415,LinearRegression,Ridge
mse,0.020631,0.020779,LinearRegression,Ridge
mae,0.111701,0.112225,LinearRegression,Ridge
r2_score,0.980016,0.979873,LinearRegression,Ridge
mape,0.014615,0.014684,LinearRegression,Ridge


In [27]:
# Se busca la mejor media de 5 "folds" para la regresión lineal modificando los hiperparámetros
# Se confirma que el mejor modelo para regresión lineal es el que tiene los valores por defecto
model = LinearRegression()

params = {'fit_intercept': [True, False],
            'positive': [True, False],
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END fit_intercept=True, positive=True;, score=-0.184 total time=   0.0s
[CV 2/5] END fit_intercept=True, positive=True;, score=-0.187 total time=   0.0s
[CV 3/5] END fit_intercept=True, positive=True;, score=-0.177 total time=   0.0s
[CV 4/5] END fit_intercept=True, positive=True;, score=-0.184 total time=   0.0s
[CV 5/5] END fit_intercept=True, positive=True;, score=-0.174 total time=   0.0s
[CV 1/5] END fit_intercept=True, positive=False;, score=-0.149 total time=   0.0s
[CV 2/5] END fit_intercept=True, positive=False;, score=-0.149 total time=   0.0s
[CV 3/5] END fit_intercept=True, positive=False;, score=-0.143 total time=   0.0s
[CV 4/5] END fit_intercept=True, positive=False;, score=-0.148 total time=   0.0s
[CV 5/5] END fit_intercept=True, positive=False;, score=-0.144 total time=   0.0s
[CV 1/5] END fit_intercept=False, positive=True;, score=-0.361 total time=   0.0s
[CV 2/5] END fit_intercept=False, positive=

In [28]:
# Mejor puntuación para regresión lineal
mean_squared_error(y_test, y_pred, squared=False)


0.14363969914101388

In [29]:
# Se intenta mejorar el "ridge" modificando los hiperparámetros
# Se obtiene el mejor modelo para "ridge"
model = Ridge()

params = {'fit_intercept': [True, False],
            'alpha': np.linspace(1, 100, num=100),
            'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'],
            'positive': [True, False]

            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


Fitting 5 folds for each of 2800 candidates, totalling 14000 fits
[CV 1/5] END alpha=1.0, fit_intercept=True, positive=True, solver=svd;, score=nan total time=   0.0s
[CV 2/5] END alpha=1.0, fit_intercept=True, positive=True, solver=svd;, score=nan total time=   0.0s
[CV 3/5] END alpha=1.0, fit_intercept=True, positive=True, solver=svd;, score=nan total time=   0.0s
[CV 4/5] END alpha=1.0, fit_intercept=True, positive=True, solver=svd;, score=nan total time=   0.0s
[CV 5/5] END alpha=1.0, fit_intercept=True, positive=True, solver=svd;, score=nan total time=   0.0s
[CV 1/5] END alpha=1.0, fit_intercept=True, positive=True, solver=cholesky;, score=nan total time=   0.0s
[CV 2/5] END alpha=1.0, fit_intercept=True, positive=True, solver=cholesky;, score=nan total time=   0.0s
[CV 3/5] END alpha=1.0, fit_intercept=True, positive=True, solver=cholesky;, score=nan total time=   0.0s
[CV 4/5] END alpha=1.0, fit_intercept=True, positive=True, solver=cholesky;, score=nan total time=   0.0s
[CV 5

7000 fits failed out of a total of 14000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1000 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Roger\AppData\Local\miniconda3\envs\data_analytics\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Roger\AppData\Local\miniconda3\envs\data_analytics\lib\site-packages\sklearn\linear_model\_ridge.py", line 1134, in fit
    return super().fit(X, y, sample_weight=sample_weight)
  File "c:\Users\Roger\AppData\Local\miniconda3\envs\data_analytics\lib\site-packages\sklearn\linear_model\_ridge.py", line 832, in fit
    raise ValueError(
ValueError: solver='s

In [30]:
# Mejor puntuación para "ridge"
mean_squared_error(y_test, y_pred, squared=False)

0.1440602916434586

## PENDIENTE "KNeighborsRegressor"

- Cambios: escalado "Standard", borrado, asignación, logaritmo, imputaciones "boxplot", descarte correlación ínfima e imputaciones "clarity quality"

- Como no se puede aplicar el logaritmo si hay ceros, se hace una asignación parcial que no incluye los "outliers"

- Con cambios e hiperparámetros, k vecinos ha mejorado su "rmse" de 0.183865 a 


In [31]:
df_neighbors = df_diamonds.copy()

df_neighbors = remove_all(df_neighbors)

df_neighbors = assign_values(df_neighbors)

df_neighbors[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']] = np.log(df_neighbors[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']])

df_neighbors = impute_boxplot_min_max(df_neighbors, ['depth (percentage)', 'table (percentage)'])

df_neighbors = df_neighbors.drop(columns=['cut quality', 'depth (percentage)'])

df_neighbors.loc[df_neighbors['clarity quality'] == 7, 'clarity quality'] = 6


In [32]:
neighbors = Regression(df_neighbors, 'price')
X_train, X_test, y_train, y_test = neighbors.split_dataframe(scaler='StandardScaler')
neighbors.apply_models(selected_list=['KNeighborsRegressor'],
                        kfolds_num=10
                    )
neighbors.evaluate_metrics()
neighbors.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 2.57 sec(s). Total time: 2.57


Unnamed: 0,KNeighborsRegressor,BEST,WORST
mae,0.09324,KNeighborsRegressor,KNeighborsRegressor
mape,0.012333,KNeighborsRegressor,KNeighborsRegressor
mse,0.015091,KNeighborsRegressor,KNeighborsRegressor
r2_score,0.985383,KNeighborsRegressor,KNeighborsRegressor
rmse,0.122844,KNeighborsRegressor,KNeighborsRegressor


In [18]:
# Se usa el "grid"
model = KNeighborsRegressor()

params = {'n_neighbors': range(5, 21),
            'algorithm': ['ball_tree', 'kd_tree', 'brute'],
            'leaf_size': range(20, 41),
            'metric': ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan'],
            'n_jobs': [-1],
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


Fitting 5 folds for each of 5040 candidates, totalling 25200 fits
[CV 1/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=5;, score=-0.124 total time=   0.4s
[CV 2/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=5;, score=-0.122 total time=   0.4s
[CV 3/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=5;, score=-0.124 total time=   0.3s
[CV 4/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=5;, score=-0.122 total time=   0.3s
[CV 5/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=5;, score=-0.123 total time=   0.3s
[CV 1/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=6;, score=-0.123 total time=   0.2s
[CV 2/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=6;, score=-0.121 total time=   0.3s
[CV 3/5] END algorithm=ball_tree, leaf_size=20, metric=city

KeyboardInterrupt: 

In [None]:
# Mejor puntuación para k vecinos
mean_squared_error(y_test, y_pred, squared=False)


## "SVR"

- Cambios: escalado "Standard", borrado, asignación, logaritmo, imputaciones "boxplot", sustitución y descarte correlación ínfima

- Con cambios e hiperparámetros, "SVR" ha mejorado su "rmse" de 0.209064 a


In [6]:
df_svr = df_diamonds.copy()

df_svr = remove_all(df_svr)

df_svr = assign_values(df_svr)

df_svr[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']] = np.log(df_svr[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']])

df_svr = impute_boxplot_min_max(df_svr, ['depth (percentage)', 'table (percentage)'])

df_svr['depth (percentage)'] = (df_svr['depth (millimeters)'] / ((df_svr['lenght (millimeters)']+df_svr['width (millimeters)']) / 2)) * 100

df_svr = df_svr.drop(columns=['cut quality', 'depth (percentage)'])


In [7]:
svr = Regression(df_svr, 'price')
X_train, X_test, y_train, y_test = svr.split_dataframe(scaler='StandardScaler')
svr.apply_models(selected_list=['SVR'],
                        kfolds_num=10
                    )
svr.evaluate_metrics()
svr.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting SVR:
- SVR done in 607.84 sec(s). Total time: 607.84


Unnamed: 0,SVR,BEST,WORST
mae,0.767242,SVR,SVR
mape,0.099959,SVR,SVR
mse,0.936805,SVR,SVR
r2_score,0.092578,SVR,SVR
rmse,0.967887,SVR,SVR


In [9]:
# Como es lento, se empieza por probar solo los diferentes "kernels"
model = SVR()

params = {'kernel': ['linear', 'poly', 'rbf']
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4,
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ....................kernel=linear;, score=-0.994 total time=  53.1s
[CV 2/5] END ....................kernel=linear;, score=-0.995 total time=  48.8s
[CV 3/5] END ....................kernel=linear;, score=-0.965 total time= 1.1min
[CV 4/5] END ....................kernel=linear;, score=-0.974 total time=  58.9s
[CV 5/5] END ....................kernel=linear;, score=-0.975 total time=  46.5s
[CV 1/5] END ......................kernel=poly;, score=-0.995 total time= 1.8min
[CV 2/5] END ......................kernel=poly;, score=-0.999 total time=19.9min
[CV 3/5] END ......................kernel=poly;, score=-0.974 total time=23.9min
[CV 4/5] END ......................kernel=poly;, score=-0.982 total time=18.9min
[CV 5/5] END ......................kernel=poly;, score=-0.985 total time= 1.4min
[CV 1/5] END .......................kernel=rbf;, score=-0.977 total time=  51.0s
[CV 2/5] END .......................kernel=rbf;, 

In [10]:
# "rbf" es el mejor y es bastante rápido. "poly" es tan lento que es inviable iterar con él
# Se prueba con varios "gamma" superiores al estándar
model = SVR()

params = {'kernel': ['rbf'],
            'gamma': range(10, 41),
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4,
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


Fitting 5 folds for each of 31 candidates, totalling 155 fits
[CV 1/5] END .............gamma=10, kernel=rbf;, score=-1.037 total time= 1.4min
[CV 2/5] END .............gamma=10, kernel=rbf;, score=-1.044 total time= 1.4min
[CV 3/5] END .............gamma=10, kernel=rbf;, score=-1.018 total time= 1.4min
[CV 4/5] END .............gamma=10, kernel=rbf;, score=-1.015 total time= 1.4min
[CV 5/5] END .............gamma=10, kernel=rbf;, score=-1.014 total time= 1.4min
[CV 1/5] END .............gamma=11, kernel=rbf;, score=-1.038 total time= 1.4min
[CV 2/5] END .............gamma=11, kernel=rbf;, score=-1.046 total time= 1.4min
[CV 3/5] END .............gamma=11, kernel=rbf;, score=-1.019 total time= 1.4min
[CV 4/5] END .............gamma=11, kernel=rbf;, score=-1.016 total time= 1.4min
[CV 5/5] END .............gamma=11, kernel=rbf;, score=-1.015 total time= 1.4min
[CV 1/5] END .............gamma=12, kernel=rbf;, score=-1.039 total time= 1.4min
[CV 2/5] END .............gamma=12, kernel=rbf;

In [13]:
# Se prueba, también, con varios gamma inferiores al estándar. Este es el mejor "SVR" obtenido
model = SVR()

params = {'kernel': ['rbf'],
            'gamma': [0.0001, 0.0003, 0.0005, 0.0007, 0.001, 0.003, 0.005, 0.007, 0.01, 0.03, 0.05, 0.07, 0.1, 0.3, 0.5, 0.7, 1, 3 , 5, 7],
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4,
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END .........gamma=0.0001, kernel=rbf;, score=-0.986 total time=  53.3s
[CV 2/5] END .........gamma=0.0001, kernel=rbf;, score=-0.988 total time=  53.7s
[CV 3/5] END .........gamma=0.0001, kernel=rbf;, score=-0.959 total time=  56.0s
[CV 4/5] END .........gamma=0.0001, kernel=rbf;, score=-0.968 total time=  56.0s
[CV 5/5] END .........gamma=0.0001, kernel=rbf;, score=-0.969 total time=  55.1s
[CV 1/5] END .........gamma=0.0003, kernel=rbf;, score=-0.990 total time=  54.6s
[CV 2/5] END .........gamma=0.0003, kernel=rbf;, score=-0.992 total time=  54.6s
[CV 3/5] END .........gamma=0.0003, kernel=rbf;, score=-0.962 total time=  54.2s
[CV 4/5] END .........gamma=0.0003, kernel=rbf;, score=-0.971 total time=  54.7s
[CV 5/5] END .........gamma=0.0003, kernel=rbf;, score=-0.972 total time=  54.5s
[CV 1/5] END .........gamma=0.0005, kernel=rbf;, score=-0.991 total time=  54.3s
[CV 2/5] END .........gamma=0.0005, kernel=rbf;

In [None]:
# Mejor puntuación para "SVR"
mean_squared_error(y_test, y_pred, squared=False)


## "DecisionTree"

- Cambios: borrado, logaritmo, imputaciones "ridge" y descarte correlación ínfima

- Como hay que aplicar el logaritmo y la asignación no mejora los resultados, se usa una asignación parcial que imputa los valores 0 pero no afecta a los "outliers"

- Con cambios e hiperparámetros, "DecisionTree" ha mejorado su "rmse" de 0.130142 a


In [6]:
df_tree = df_diamonds.copy()

df_tree = remove_all(df_tree)

df_tree = assign_values(df_tree, outlier=False)

df_tree[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']] = np.log(df_tree[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']])

df_tree = apply_ridge(df_tree)

df_tree = df_tree.drop(columns=['cut quality', 'depth (percentage)'])


In [None]:
tree = Regression(df_tree, 'price')
X_train, X_test, y_train, y_test = tree.split_dataframe()
tree.apply_models(selected_list=['DecisionTreeRegressor'],
                    params_list=[['DecisionTreeRegressor', 'random_state=43']],
                        kfolds_num=10
                    )
tree.evaluate_metrics()
tree.create_dataframe()


In [None]:
# Se usa el "grid"
# En una segunda vuelta se miran los hiperparámentros "max_depth" y "min_samples_split", que están relacionados
model = DecisionTreeRegressor()

params = {'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            'splitter': ['best', 'random'],
            'max_features': ['auto', 'sqrt', 'log2'],
            'min_samples_leaf': range(1, 11)
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


In [None]:
# Mejor puntuación para "DecisionTree"
mean_squared_error(y_test, y_pred, squared=False)

## "RandomForest"

- Cambios: borrado, asignación y sustitución

- Con cambios e hiperparámetros, "RandomForest" ha mejorado su "rmse" de 0.098101 a


In [None]:
df_forest = df_diamonds.copy()

df_forest = remove_all(df_forest)

df_forest = assign_values(df_forest)

df_forest['depth (percentage)'] = (df_forest['depth (millimeters)'] / ((df_forest['lenght (millimeters)']+df_forest['width (millimeters)']) / 2)) * 100


In [None]:
forest = Regression(df_forest, 'price')
X_train, X_test, y_train, y_test = forest.split_dataframe()
forest.apply_models(selected_list=['RandomForestRegressor'],
                    params_list=[['RandomForestRegressor', 'random_state=43']],
                        kfolds_num=10
                    )
forest.evaluate_metrics()
forest.create_dataframe()


In [None]:
# En una segunda vuelta se miran los hiperparámentros "max_depth" y "min_samples_split", que están relacionados
# En una tercera vuelta se prueba con "boolstrap=True"
model = RandomForestRegressor()

params = {'criterion': ['gini', 'entropy', 'log_loss'],
            'max_features': ['sqrt', 'log2', None],
            'min_samples_leaf': range(1, 11),
            'oob_score': [True, False]
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


In [None]:
# Mejor puntuación para "RandomForest"
mean_squared_error(y_test, y_pred, squared=False)

## "XGBRegressor"

- Cambios: escalado "standard", borrado, asignación e imputaciones "boxplot"

- Con cambios e hiperparámetros, "XGBRegressor" ha mejorado su "rmse" de 0.094787 a


In [None]:
df_xgb = df_diamonds.copy()

df_xgb = remove_all(df_xgb)

df_xgb = assign_values(df_xgb)

df_xgb = impute_boxplot_min_max(df_xgb, ['depth (percentage)', 'table (percentage)'])


In [None]:
forest = Regression(df_forest, 'price')
X_train, X_test, y_train, y_test = forest.split_dataframe()
forest.apply_models(selected_list=['XGBRegressor'],
                    params_list=[['XGBRegressor', 'random_state=43']],
                        kfolds_num=10
                    )
forest.evaluate_metrics()
forest.create_dataframe()


In [None]:
model = XGBRegressor()

params = {
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


In [None]:
# Mejor puntuación para "XGBRegressor"
mean_squared_error(y_test, y_pred, squared=False)