In [11]:
import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import make_scorer

from mlxtend.plotting import plot_decision_regions

if str(os.getcwdb()[-3:]).split("'")[1] != 'src':
    os.chdir(os.path.dirname(os.getcwdb()))

from utils.modeling import *
from utils.functions import *


In [12]:
df_diamonds = pd.read_csv(r'data\processed\diamonds_training.csv', index_col='id')
df_predict = pd.read_csv(r'data\processed\diamonds_testing.csv', index_col='id')


# Consideraciones
- Se localiza el tema en Kaggle: https://www.kaggle.com/datasets/shivam2503/diamonds

- Se empieza a trabajar con ese "dataset" (ver los archivos marcados como "UNUSED")

- Se detecta que existe una competición, si bien ya ha terminado: https://www.kaggle.com/competitions/diamonds-part-datamad0122/overview

- Se elige trabajar con los archivos de la competición, cuyas únicas diferencias es que hay un "train" y un "test", y que la variable "target" está escalada

- El "dataset" final es un listado de diamantes con sus características, y el objetivo es predecir el precio

- Se comparará lo obtenido con los resultados de la competición

# EDA
- Los pasos de esta primera parte se detallan de forma más pormenorizada, paso a paso, en el "notebook" titulado "EDA"

- En ese "notebook" se hacen dos cosas:
1) Modificaciones esenciales (limpieza); se liquidan duplicados, se cambia el nombre de las columnas y se pasan las categóricas a numéricas, tanto del "train" como del "test".

2) Modificaciones opcionales (feature engineering); se detectan y ponen a prueba las posibles modifiaciones que llevar a cabo con el "dataframe" de entrenamiento con tal de mejorar el resultado de los modelos. Los resortes de dichos cambios se guardan en forma de funciones (cuando son exclusivos de este proyecto) o clases (cuando es razonable guardarlos para análisis futuros), que se irán llamando a continuación según convenga.

# Modelaje: selección de cambios
- Se importan los "dataframes" con las modificaciones esenciales

- Se van intercalando modificiaciones opcionales y diversos modelos hasta dar con el mejor resultado

- Los modelos se prueban en este "notebook" para mayor comodidad, pero se ejecutan sin detallarse en "train.py", desde donde se guardan en la carpeta "model"

- Estas son las modificaciones que se van intercalando:

---------- Cambios opcionales (probados) ---------- 

1) Borrado de "outliers" extremadamente altos ("depth (percentage)", "table (percentage)", "width (millimeters)", "depth (millimeters)").

2) Borrado de filas que tienen 0 en todas las variables de tamaño ("lenght (millimeters)", "width (millimeters)" y "depth (millimeters)").

3) Borrado de los "outliers" compartidos moderadamente altos ("depth (percentage)" y "table (percentage)").

4) Asignación del valor con 0 restante en "lenght" al "width" correspondiente ("lenght (millimeters)").

5) Asignación del valor con 0 restante de "depth (millimeters)" a partir de una operación con el "lenght", el "width" y el "depth (percentage)" correspondientes ("depth (millimeters)").

6) Asignación del "outlier" restante del "lenght" al "width" correspondiente ("lenght (millimeters)").

7) Uso del logaritmo ("weight (carat)", "lenght (millimeters)", "width (millimeters)" y "depth (millimeters)").

8) Imputación al siguiente valor más alto ("weight (carat)").

9) Imputación a los valores máximos y mínimos del "boxplot" ("depth (percentage)" y "table (percentage)").

10) Neutralización de "outliers" con un modelo "ridge" ("depth (millimeters)").

11) Escalado "MinMax".

---------- Cambios apuntados (no probados) ----------

1) Sustitución de valores existentes por valores calculados ("depth (percentage)").

2) Descarte de las columnas con altísima correlación ("weight (carat)", "lenght (millimeters)", "width (millimeters)" y "depth (millimeters)").

3) Imputación de los valores máximos de "clarity quality" al que está un punto por debajo ("clarity quality").


## Ronda 1: sin cambios
- Para la primera fase, se prueban todos los modelos sin hacer ninguna modificación adicional

- En esta primera ronda están más detallados los usos de la clase "Regression", que hereda de "Model", para que sirva como ejemplo

- Como era de esperar, los resultados no son demasiado buenos, pero ganan los modelos "de árboles", ya que no se ven afectados por los valores atípicos

- Dado que en la competición se valora la "rsme", esa es la métrica que más se tendrá en cuenta. Puede verse el podio aquí: https://www.kaggle.com/competitions/diamonds-part-datamad0122/leaderboard

In [13]:
# Lo primero es decirle a la clase con qué modelos se va a trabajar a lo largo de todo el proceso
Regression.add_models(['LinearRegression',
                        'Ridge',
                        'DecisionTreeRegressor',
                        'KNeighborsRegressor',
                        'RandomForestRegressor',
                        'SVR',
                        'XGBRegressor'
                        ]
                        )


In [14]:
# Se crea la instancia de la clase "Regression" con la columna "price" como "target"
round_1 = Regression(df_diamonds, 'price')


In [15]:
# Se separa el "dataframe" con los parámetros por defecto. Se guardan las porciones por si acaso
X_train, X_test, y_train, y_test = round_1.split_dataframe()


In [16]:
# Se establece un "random_state" para los modelos que lo requieren, que siempre será el mismo
round_1.prepare_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ]
                        )


'Models prepared. Apply them or use kfold (apply + evaluate)'

In [17]:
# Se solicitan 5 "folds" (número por defecto), del cual se usará el mejor para entrenar los modelos y ver cuál llega más lejos
# Como la "target" es de regresión y esa es la clase que se ha instanciado, se seleccionará automáticamente "KFold" en lugar de "StratifiedKFold"
round_1_dict = round_1.apply_and_evaluate_kfolds()


In [None]:
# La media y la varianza de las métricas, así como las específicas de cada "fold" entrenado y el modelo en sí, pueden extraerse en un diccionario
round_1_dict


{'LinearRegression': {'models': [LinearRegression(),
   LinearRegression(),
   LinearRegression(),
   LinearRegression(),
   LinearRegression()],
  'metrics': {'rmse': 0.18266434126360717,
   'mse': 0.033668866217605245,
   'mae': 0.11847602510487627,
   'r2_score': 0.9671883402028569,
   'mape': 0.015316754357796062},
  'all_metrics': {'rmse': [0.18278436404836218,
    0.16134131610876856,
    0.21213340614001955,
    0.1873559277102721,
    0.16970669231061353],
   'mse': [0.0334101237405642,
    0.026031020283709582,
    0.04500058200056649,
    0.035102243648176705,
    0.02880036141500925],
   'mae': [0.1198527764880509,
    0.12009375582158771,
    0.11814603052977636,
    0.1176840754805362,
    0.1166034872044301],
   'r2_score': [0.9675277226414865,
    0.9749723070956142,
    0.9554574304863221,
    0.9657327540523,
    0.9722514867385628],
   'mape': [0.015509386119075434,
    0.015492198456540348,
    0.015267110508885734,
    0.015208411444627408,
    0.015106665259851387]

In [None]:
# Para una mejor visualización, se ponen esas medias en un "dataframe"
# Las predicciones no son muy buenas, si bien el r2_score es alto para todos los casos
round_1.create_dataframe()


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.182664,0.182826,0.181632,0.189328,0.13048,0.094246,0.090762,XGBRegressor: random_state=43,SVR
mse,0.033669,0.033718,0.033003,0.036192,0.017029,0.008898,0.008247,XGBRegressor: random_state=43,SVR
mae,0.118476,0.118554,0.135741,0.126174,0.090791,0.066789,0.065596,XGBRegressor: random_state=43,KNeighborsRegressor
r2_score,0.967188,0.967141,0.967889,0.964745,0.983436,0.99134,0.991976,XGBRegressor: random_state=43,SVR
mape,0.015317,0.015327,0.017877,0.016342,0.011706,0.008636,0.008468,XGBRegressor: random_state=43,KNeighborsRegressor


## Ronda 2: escalado
- Se repite la ronda 1, pero esta vez se escalan las variables

- Exceptuando "Ridge", el escalado "Standard" mejora más que "MinMax" los resultados de los modelos "no de árboles", que solo empeoran. La regresión lineal no se ve afectada en ningún caso

In [None]:
# Se pone a prueba con el mismo proceso que en la ronda anterior, pero esta vez se aplica un escalado MinMax
# Hay una ligera mejora en los modelos "no de árboles"
df_diamonds_2 = df_diamonds.copy()

round_2 = Regression(df_diamonds_2, 'price')
round_2.split_dataframe(scaler='MinMaxScaler')
round_2.prepare_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ])
round_2.apply_and_evaluate_kfolds()
round_2.create_dataframe()


-- Regression (MinMaxScaler): using mean of 5 folds --
Starting LinearRegression:
- LinearRegression done in 0.16 sec(s). Total time: 0.16
Starting Ridge:
- Ridge done in 0.09 sec(s). Total time: 0.25
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 2.21 sec(s). Total time: 2.46
Starting SVR:
- SVR done in 165.35 sec(s). Total time: 167.81
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 1.64 sec(s). Total time: 169.44
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 141.46 sec(s). Total time: 310.9
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 11.52 sec(s). Total time: 322.42


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.182664,0.181262,0.164514,0.121829,0.13076,0.094312,0.090735,XGBRegressor: random_state=43,LinearRegression
mse,0.033669,0.033098,0.027074,0.01486,0.017102,0.00891,0.008242,XGBRegressor: random_state=43,LinearRegression
mae,0.118476,0.124283,0.116946,0.091139,0.090951,0.06686,0.065592,XGBRegressor: random_state=43,Ridge
r2_score,0.967188,0.967752,0.973661,0.98554,0.983365,0.991329,0.991981,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.015317,0.016057,0.015443,0.012015,0.011723,0.008645,0.008468,XGBRegressor: random_state=43,Ridge


In [None]:
# Comprobamos si la situación mejora con "StandardScaler". En efecto, es así (salvo para Ridge, que empeora)
df_diamonds_2b = df_diamonds.copy()

round_2b = Regression(df_diamonds_2b, 'price')
round_2b.split_dataframe(scaler='StandardScaler')
round_2b.prepare_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ]
                        )
round_2b.apply_and_evaluate_kfolds()
round_2b.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.26 sec(s). Total time: 0.26
Starting Ridge:
- Ridge done in 0.15 sec(s). Total time: 0.41
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 4.61 sec(s). Total time: 5.02
Starting SVR:
- SVR done in 288.83 sec(s). Total time: 293.85
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.65 sec(s). Total time: 296.5
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 221.11 sec(s). Total time: 517.61
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 10.87 sec(s). Total time: 528.48


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.222038,0.221959,0.14493,0.110613,0.131137,0.098193,0.093434,XGBRegressor: random_state=43,LinearRegression
mse,0.049301,0.049266,0.021005,0.012235,0.017197,0.009642,0.00873,XGBRegressor: random_state=43,LinearRegression
mae,0.1229,0.122939,0.105255,0.082899,0.088844,0.067219,0.066317,XGBRegressor: random_state=43,Ridge
r2_score,0.952909,0.952942,0.979937,0.988313,0.983574,0.99079,0.991661,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.015873,0.015878,0.013887,0.010879,0.011435,0.008697,0.008558,XGBRegressor: random_state=43,Ridge


## Ronda 3: borrado (+ escalado)
- Se borran todos los "outliers" extremadamente altos ("depth (percentage)", "table (percentage)", "width (millimeters)", "depth (millimeters)")

- Se borran las filas que tienen el mismo cero en estas tres columnas: "lenght (millimeters)", "width (millimeters)" y "depth (millimeters)"

- Se borran los "outliers" moderadamente altos compartidos de "depth (percentage)" y "table (percentage)"

- El error mejora en todos los modelos en distintas medidas

In [None]:
# Se aplican los borrados, esta vez con una función a medida ya que son cambios exclusivos de este proyecto
df_diamonds_3 = df_diamonds.copy()

df_diamonds_3 = remove_all(df_diamonds_3)

print(f'Deleted rows: {len(df_diamonds) - len(df_diamonds_3)}')


Deleted rows: 20


In [None]:
# Se escala y se prueban los modelos
round_3 = Regression(df_diamonds_3, 'price')
round_3.split_dataframe(scaler='StandardScaler')
round_3.prepare_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ]
                        )
round_3.apply_and_evaluate_kfolds()
round_3.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.25 sec(s). Total time: 0.25
Starting Ridge:
- Ridge done in 0.13 sec(s). Total time: 0.38
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 4.49 sec(s). Total time: 4.87
Starting SVR:
- SVR done in 329.21 sec(s). Total time: 334.08
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.99 sec(s). Total time: 337.07
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 241.02 sec(s). Total time: 578.09
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 11.09 sec(s). Total time: 589.18


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.145141,0.145126,0.139616,0.105012,0.126934,0.092578,0.090632,XGBRegressor: random_state=43,LinearRegression
mse,0.021066,0.021062,0.019493,0.011027,0.016112,0.008571,0.008214,XGBRegressor: random_state=43,LinearRegression
mae,0.112924,0.112917,0.105371,0.08104,0.088498,0.066686,0.065999,XGBRegressor: random_state=43,LinearRegression
r2_score,0.979595,0.979599,0.981119,0.989318,0.984393,0.991698,0.992043,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.014819,0.014817,0.013891,0.010642,0.0114,0.008626,0.008518,XGBRegressor: random_state=43,LinearRegression


## Ronda 4: asignación (+ borrado y escalado)
- Los cambios de esta ronda se aplican a partir de dos hechos probados durante el EDA: 

1) El "lenght" y el "width" son generalmente casi idénticos, ya que los diamantes son semicirculares.

2) El "depth (percentage)" se obtiene (según el autor del "dataset") de dividir "depth (millimeters)" por la media de "lenght" y "width".

- Se asigna el 0 restante de "lenght" al "width" correspondiente ("lenght (millimeters)")

- Se asigna el 0 restante de "depth (millimeters)" a partir de la operación mencionada

- Se asigna el "outlier" restante del "lenght" al "width" correspondiente ("lenght (millimeters)")

- Mejoran todos menos "DecisionTree" en pequeña medida 

In [None]:
# Se aplican el borrado y la asignación
df_diamonds_4 = df_diamonds.copy()

df_diamonds_4 = remove_all(df_diamonds_4)
df_diamonds_4 = assign_values(df_diamonds_4)


In [None]:
# Se escala y se prueban los modelos
round_4 = Regression(df_diamonds_4, 'price')
round_4.split_dataframe(scaler='StandardScaler')
round_4.prepare_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ]
                        )
round_4.apply_and_evaluate_kfolds()
round_4.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
[-0.11653227 -0.11316044 -0.11366237 -0.11369523 -0.11087951 -0.11745744
 -0.11233688 -0.10954778 -0.11380371 -0.11242902]
[LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression(), LinearRegression()]
- LinearRegression done in 0.3 sec(s). Total time: 0.3
Starting Ridge:
[-0.11653583 -0.11315017 -0.11366404 -0.11368479 -0.11088681 -0.11745587
 -0.11232847 -0.10955238 -0.11380434 -0.11241987]
[Ridge(), Ridge(), Ridge(), Ridge(), Ridge(), Ridge(), Ridge(), Ridge(), Ridge(), Ridge()]
- Ridge done in 0.17 sec(s). Total time: 0.47
Starting KNeighborsRegressor:
[-0.10458707 -0.10616856 -0.10357902 -0.10418933 -0.10256565 -0.10497194
 -0.1047289  -0.10327145 -0.10377428 -0.10357519]
[KNeighborsRegressor(), KNeighborsRegressor(), KNeighborsRegressor(), KNeighborsRegressor(), KNeighborsRegr

Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.144383,0.144377,0.139194,0.104925,0.12851,0.092583,0.09033,XGBRegressor: random_state=43,LinearRegression
mse,0.020846,0.020845,0.019375,0.011009,0.016515,0.008572,0.00816,XGBRegressor: random_state=43,LinearRegression
mae,0.112107,0.112106,0.105242,0.081012,0.089531,0.06665,0.065599,XGBRegressor: random_state=43,LinearRegression
r2_score,0.979807,0.979809,0.981233,0.989336,0.984003,0.991697,0.992096,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.014665,0.014665,0.013875,0.010637,0.011524,0.008623,0.00847,XGBRegressor: random_state=43,Ridge


## Ronda 5: logaritmo (+ asignación, borrado y escalado)
- Se aplica el logaritmo a las columnas "weight (carat)", "lenght (millimeters)", "width (millimeters)" y "depth (millimeters)"

- Mejoran "KNeighbors",  "SVR" y "DecisionTree" (este último, muy poco). "XGBRegressor" se queda igual. El resto, empeoran

In [None]:
# Se hacen los retoques
df_diamonds_5 = df_diamonds.copy()

df_diamonds_5 = remove_all(df_diamonds_5)
df_diamonds_5 = assign_values(df_diamonds_5)

df_diamonds_5[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']] = np.log(df_diamonds_5[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']])


In [None]:
# Se escala y se prueban los modelos
round_5 = Regression(df_diamonds_5, 'price')
round_5.split_dataframe(scaler='StandardScaler')
round_5.prepare_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ]
                        )
round_5.apply_and_evaluate_kfolds()
round_5.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.25 sec(s). Total time: 0.25
Starting Ridge:
- Ridge done in 0.14 sec(s). Total time: 0.39
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 4.8 sec(s). Total time: 5.2
Starting SVR:
- SVR done in 282.55 sec(s). Total time: 287.75
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.65 sec(s). Total time: 290.39
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 223.29 sec(s). Total time: 513.69
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 11.53 sec(s). Total time: 525.22


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.144715,0.144684,0.13766,0.103558,0.128335,0.092644,0.09034,XGBRegressor: random_state=43,LinearRegression
mse,0.020942,0.020933,0.01895,0.010724,0.01647,0.008583,0.008161,XGBRegressor: random_state=43,LinearRegression
mae,0.112273,0.112272,0.10387,0.079565,0.08949,0.066688,0.06561,XGBRegressor: random_state=43,LinearRegression
r2_score,0.979715,0.979723,0.981644,0.989612,0.984047,0.991686,0.992095,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.014703,0.014703,0.013577,0.010405,0.011512,0.008628,0.008471,XGBRegressor: random_state=43,Ridge


## Ronda 6: imputaciones "boxplot" y valor más alto (+ logaritmo, asignación, borrado y escalado)
- Se imputan al siguiente valor más alto de "weight", y al máximo y al mínimo "depth (percentage)" y table

- Se prueban por separado

- Todos menos los "de árboles" mejoran con la imputación al "boxplot", pero ninguno con la del valor máximo, con lo que se obviará completamente


In [None]:
# Se hacen los retoques pertinentes
df_diamonds_6a = df_diamonds.copy()

df_diamonds_6a = remove_all(df_diamonds_6a)
df_diamonds_6a = assign_values(df_diamonds_6a)

df_diamonds_6a[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']] = np.log(df_diamonds_6a[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']])

df_diamonds_6b = df_diamonds_6a.copy()

df_diamonds_6a = impute_next_higher(df_diamonds_6a)


In [None]:
# Se lleva a cabo la prueba
round_6a = Regression(df_diamonds_6a, 'price')
round_6a.split_dataframe(scaler='StandardScaler')
round_6a.prepare_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ]
                        )
round_6a.apply_and_evaluate_kfolds()
round_6a.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.26 sec(s). Total time: 0.26
Starting Ridge:
- Ridge done in 0.14 sec(s). Total time: 0.4
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 4.95 sec(s). Total time: 5.35
Starting SVR:
- SVR done in 283.45 sec(s). Total time: 288.8
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.71 sec(s). Total time: 291.5
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 223.51 sec(s). Total time: 515.01
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 11.67 sec(s). Total time: 526.68


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.144734,0.144699,0.137659,0.103561,0.128256,0.092637,0.09034,XGBRegressor: random_state=43,LinearRegression
mse,0.020948,0.020938,0.01895,0.010725,0.01645,0.008582,0.008161,XGBRegressor: random_state=43,LinearRegression
mae,0.112266,0.112263,0.103868,0.079568,0.08947,0.066704,0.06561,XGBRegressor: random_state=43,LinearRegression
r2_score,0.979709,0.979719,0.981644,0.989611,0.984066,0.991687,0.992095,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.014703,0.014702,0.013577,0.010405,0.01151,0.008631,0.008471,XGBRegressor: random_state=43,LinearRegression


In [None]:
df_diamonds_6b = impute_boxplot_min_max(df_diamonds_6b, ['depth (percentage)', 'table (percentage)'])


In [None]:
# Se lleva a cabo la prueba
round_6b = Regression(df_diamonds_6b, 'price')
round_6b.split_dataframe(scaler='StandardScaler')
round_6b.prepare_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ]
                        )
round_6b.apply_and_evaluate_kfolds()
round_6b.create_dataframe()

-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.27 sec(s). Total time: 0.27
Starting Ridge:
- Ridge done in 0.15 sec(s). Total time: 0.42
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 4.42 sec(s). Total time: 4.84
Starting SVR:
- SVR done in 283.65 sec(s). Total time: 288.49
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.66 sec(s). Total time: 291.15
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 217.07 sec(s). Total time: 508.22
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 10.86 sec(s). Total time: 519.08


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.144535,0.144513,0.135645,0.103234,0.127769,0.092711,0.090072,XGBRegressor: random_state=43,LinearRegression
mse,0.02089,0.020884,0.0184,0.010657,0.016325,0.008595,0.008113,XGBRegressor: random_state=43,LinearRegression
mae,0.112202,0.112207,0.102873,0.079384,0.089275,0.066742,0.065363,XGBRegressor: random_state=43,Ridge
r2_score,0.979765,0.979771,0.982178,0.989677,0.984187,0.991674,0.992141,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.014692,0.014693,0.013465,0.010383,0.01148,0.008635,0.008428,XGBRegressor: random_state=43,Ridge


## Ronda 7: imputaciones "ridge" (+ imputaciones "boxplot", logaritmo, asignación, borrado y escalado)
- Se imputan los "outliers" restantes de "depth (millimeters)" aplicando un modelo "Ridge" a "weight (carat)", "lenght (millimeters)" y "width (millimeters)", con las que tiene una altísima correlación

- Solo mejora "DecisionTree"

In [None]:
# Se hacen los cambios
df_diamonds_7 = df_diamonds.copy()

df_diamonds_7 = remove_all(df_diamonds_7)
df_diamonds_7 = assign_values(df_diamonds_7)

df_diamonds_7[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']] = np.log(df_diamonds_7[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']])

df_diamonds_7 = impute_boxplot_min_max(df_diamonds_7, ['depth (percentage)', 'table (percentage)'])

df_diamonds_7 = apply_ridge(df_diamonds_7)


In [None]:
# Se prueban los modelos
round_7 = Regression(df_diamonds_7, 'price')
round_7.split_dataframe(scaler='StandardScaler')
round_7.prepare_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ]
                        )
round_7.apply_and_evaluate_kfolds()
round_7.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.27 sec(s). Total time: 0.27
Starting Ridge:
- Ridge done in 0.14 sec(s). Total time: 0.41
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 4.49 sec(s). Total time: 4.89
Starting SVR:
- SVR done in 278.46 sec(s). Total time: 283.36
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.57 sec(s). Total time: 285.93
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 218.27 sec(s). Total time: 504.2
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 11.51 sec(s). Total time: 515.71


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.144535,0.144514,0.135646,0.103231,0.128829,0.092758,0.090072,XGBRegressor: random_state=43,LinearRegression
mse,0.02089,0.020884,0.0184,0.010657,0.016597,0.008604,0.008113,XGBRegressor: random_state=43,LinearRegression
mae,0.112203,0.112209,0.102874,0.079373,0.089553,0.066782,0.065363,XGBRegressor: random_state=43,Ridge
r2_score,0.979765,0.979771,0.982177,0.989678,0.983924,0.991666,0.992141,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mape,0.014692,0.014693,0.013465,0.010382,0.011522,0.00864,0.008428,XGBRegressor: random_state=43,Ridge


## Ronda 8: sustitución (+ borrado parcial y asignación)
- Se sustituyen los valores de "depth (percentage)" por los resultados reales que da el cálculo a partir de las columnas correspondientes

- Como se ha visto en EDA, el resultado serán muchos nuevos "outliers". Por tanto, se aplicará el cambio en el "dataframe" original sin otras modificaciones para comprobar si supone una mejora

- La única modificación adicional indispensable es el tratamiento de los 0 en esas columnas

- Para comprobar si realmente hay mejora, se aplican los modelos dos veces: una solo con las modificaciones de borrado parcial y asignación, y otra con la sustitución

- Mejoran "SVR" y "RandomForest"

In [None]:
# Se aplican los cambios
df_diamonds_8a = df_diamonds.copy()

df_diamonds_8a = assign_values(df_diamonds_8a)

df_diamonds_8a = remove_all(df_diamonds_8a, zeros_only=True)

df_diamonds_8b = df_diamonds_8a.copy()

df_diamonds_8b['depth (percentage)'] = (df_diamonds_8b['depth (millimeters)'] / ((df_diamonds_8b['lenght (millimeters)']+df_diamonds_8b['width (millimeters)']) / 2)) * 100


In [None]:
# Se ponen a prueba solo el borrado parcial y la asignación
round_8a = Regression(df_diamonds_8a, 'price')
round_8a.split_dataframe()
round_8a.prepare_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ]
                        )
round_8a.apply_and_evaluate_kfolds()
round_8a.create_dataframe()


-- Regression: using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.29 sec(s). Total time: 0.29
Starting Ridge:
- Ridge done in 0.15 sec(s). Total time: 0.44
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 4.77 sec(s). Total time: 5.21
Starting SVR:
- SVR done in 342.08 sec(s). Total time: 347.29
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.85 sec(s). Total time: 350.14
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 242.36 sec(s). Total time: 592.5
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 11.63 sec(s). Total time: 604.13


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.154343,0.154304,0.178104,0.161211,0.12991,0.092733,0.090902,XGBRegressor: random_state=43,KNeighborsRegressor
mse,0.023822,0.02381,0.031721,0.025989,0.016877,0.008599,0.008263,XGBRegressor: random_state=43,KNeighborsRegressor
mae,0.115972,0.115965,0.133578,0.123142,0.089779,0.066104,0.065636,XGBRegressor: random_state=43,KNeighborsRegressor
r2_score,0.977035,0.977047,0.96942,0.974946,0.98373,0.99171,0.992034,XGBRegressor: random_state=43,SVR
mape,0.015027,0.015026,0.017619,0.015996,0.011615,0.00856,0.008468,XGBRegressor: random_state=43,KNeighborsRegressor


In [None]:
# Se pone a prueba la sustitución
round_8b = Regression(df_diamonds_8b, 'price')
round_8b.split_dataframe()
round_8b.apply_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ],
                        kfolds_num=10
                    )
round_8b.evaluate_metrics()
round_8b.create_dataframe()


-- Regression: using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.33 sec(s). Total time: 0.34
Starting Ridge:
- Ridge done in 0.16 sec(s). Total time: 0.5
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 4.39 sec(s). Total time: 4.89
Starting SVR:
- SVR done in 357.54 sec(s). Total time: 362.43
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 3.12 sec(s). Total time: 365.54
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 275.21 sec(s). Total time: 640.75
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 14.15 sec(s). Total time: 654.9


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.155793,0.155988,0.178175,0.160784,0.130315,0.092704,0.091244,XGBRegressor: random_state=43,KNeighborsRegressor
mse,0.024272,0.024332,0.031746,0.025851,0.016982,0.008594,0.008325,XGBRegressor: random_state=43,KNeighborsRegressor
mae,0.119707,0.119841,0.133753,0.123292,0.089499,0.066106,0.06643,RandomForestRegressor: random_state=43,KNeighborsRegressor
r2_score,0.976601,0.976543,0.969395,0.975078,0.983629,0.991715,0.991974,XGBRegressor: random_state=43,SVR
mape,0.015672,0.015677,0.017646,0.016013,0.011591,0.008564,0.008588,RandomForestRegressor: random_state=43,KNeighborsRegressor


## Ronda 9: descarte
- Se quitan directamente las columnas con altísima correlación, cercana a 1 ("weight", "lenght", "width" y "depth (millimeters)")

- Se utiliza el "dataframe" sin cambios ni escalado para el contraste

- Todos empeoran respecto a la ronda 1

- Con el descarte, en su lugar, de las de correlación poco relevante, mejoran "Kneighbors" y "SVR"

In [None]:
# Se hace la prueba
df_diamonds_9a = df_diamonds.copy()

df_diamonds_9a = df_diamonds_9a.drop(columns=['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)'])

round_9a = Regression(df_diamonds_9a, 'price')
round_9a.split_dataframe()
round_9a.prepare_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ]
                        )
round_9a.apply_and_evaluate_kfolds()
round_9a.create_dataframe()


-- Regression: using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.19 sec(s). Total time: 0.19
Starting Ridge:
- Ridge done in 0.11 sec(s). Total time: 0.3
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 1.78 sec(s). Total time: 2.07
Starting SVR:
- SVR done in 589.38 sec(s). Total time: 591.45
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 1.23 sec(s). Total time: 592.68
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 95.31 sec(s). Total time: 687.99
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 8.01 sec(s). Total time: 696.0


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.979332,0.979332,1.027323,0.986959,1.187047,1.027097,0.957208,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mse,0.959092,0.959091,1.055393,0.974089,1.40908,1.054928,0.916246,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
mae,0.814146,0.814146,0.827169,0.803355,0.911736,0.81649,0.787496,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43
r2_score,0.083899,0.083899,-0.008085,0.069574,-0.345919,-0.007642,0.124824,XGBRegressor: random_state=43,RandomForestRegressor: random_state=43
mape,0.106981,0.106982,0.108074,0.10537,0.1189,0.106693,0.103121,XGBRegressor: random_state=43,DecisionTreeRegressor: random_state=43


In [None]:
# Para sacar más partido a esta ronda, se prueba también si se eliminan las columnas con una correlación próxima a 0
df_diamonds_9b = df_diamonds.copy()

df_diamonds_9b = df_diamonds_9b.drop(columns=['cut quality', 'depth (percentage)'])

round_9b = Regression(df_diamonds_9b, 'price')
round_9b.split_dataframe()
round_9b.prepare_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ]
                        )
round_9b.apply_and_evaluate_kfolds()
round_9b.create_dataframe()


-- Regression: using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.26 sec(s). Total time: 0.26
Starting Ridge:
- Ridge done in 0.14 sec(s). Total time: 0.39
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 2.27 sec(s). Total time: 2.66
Starting SVR:
- SVR done in 327.55 sec(s). Total time: 330.22
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.25 sec(s). Total time: 332.47
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 182.52 sec(s). Total time: 514.98
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 9.67 sec(s). Total time: 524.66


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.225121,0.225035,0.144074,0.207135,0.131951,0.099942,0.09462,XGBRegressor: random_state=43,LinearRegression
mse,0.05068,0.050641,0.020757,0.042905,0.017411,0.009988,0.008953,XGBRegressor: random_state=43,LinearRegression
mae,0.131742,0.131789,0.10199,0.119594,0.090397,0.068562,0.06687,XGBRegressor: random_state=43,Ridge
r2_score,0.951592,0.951629,0.980173,0.959018,0.983369,0.990459,0.991448,XGBRegressor: random_state=43,SVR
mape,0.016992,0.016998,0.013469,0.015587,0.011638,0.008862,0.008624,XGBRegressor: random_state=43,Ridge


## Ronda 10: imputaciones "clarity quality"
- Se imputan los valores de 7 "clarity quality" al 6. Esto se hace porque se ha visto en el EDA que las variables relacionadas con el tamaño ('weight (carat)', 'lenght (millimeters)', 'width (millimeters)' y 'depth (millimeters)') dejan de disminuir a partir del 6

- Se utiliza el "dataframe" sin cambios ni escalado para el contraste

- Mejoran "LinearRegression", "Ridge" y "KNeighbors" comparados con la ronda 1

In [None]:
# Se comprueba
df_diamonds_10 = df_diamonds.copy()

df_diamonds_10.loc[df_diamonds_10['clarity quality'] == 7, 'clarity quality'] = 6

round_10 = Regression(df_diamonds_10, 'price')
round_10.split_dataframe()
round_10.prepare_models(params_list=[['DecisionTreeRegressor', 'random_state=43'],
                                    ['RandomForestRegressor', 'random_state=43'],
                                    ['XGBRegressor', 'random_state=43']
                                    ]
                        )
round_10.apply_and_evaluate_kfolds()
round_10.create_dataframe()


-- Regression: using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.31 sec(s). Total time: 0.31
Starting Ridge:
- Ridge done in 0.16 sec(s). Total time: 0.47
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 4.75 sec(s). Total time: 5.22
Starting SVR:
- SVR done in 336.01 sec(s). Total time: 341.23
Starting DecisionTreeRegressor: random_state=43:
- DecisionTreeRegressor: random_state=43 done in 2.72 sec(s). Total time: 343.95
Starting RandomForestRegressor: random_state=43:
- RandomForestRegressor: random_state=43 done in 295.15 sec(s). Total time: 639.1
Starting XGBRegressor: random_state=43:
- XGBRegressor: random_state=43 done in 16.21 sec(s). Total time: 655.31


Unnamed: 0,LinearRegression,Ridge,KNeighborsRegressor,SVR,DecisionTreeRegressor: random_state=43,RandomForestRegressor: random_state=43,XGBRegressor: random_state=43,BEST,WORST
rmse,0.220889,0.220774,0.182943,0.209357,0.132265,0.099294,0.094548,XGBRegressor: random_state=43,LinearRegression
mse,0.048792,0.048741,0.033468,0.04383,0.017494,0.009859,0.008939,XGBRegressor: random_state=43,LinearRegression
mae,0.12188,0.121935,0.134568,0.127694,0.089982,0.068342,0.067013,XGBRegressor: random_state=43,KNeighborsRegressor
r2_score,0.953395,0.953444,0.968032,0.958134,0.98329,0.990583,0.991461,XGBRegressor: random_state=43,SVR
mape,0.015769,0.015776,0.017731,0.016556,0.011597,0.008856,0.008664,XGBRegressor: random_state=43,KNeighborsRegressor


In [None]:
stop

# Modelaje: selección de modelos

- Cada uno de los modelos mejora con los siguientes cambios:

---------- LinearRegression (mejor: 0.144352, ronda 4) ----------

 · Borrado

 · Asignación

 · Imputaciones "boxplot"
 
 · Imputaciones "clarity quality"

---------- Ridge (mejor: 0.144347, ronda 4) ----------

· Escalado "MinMax"

· Borrado

· Asignación

· Imputaciones "boxplot"

· Imputaciones "clarity quality"

---------- KNeighborsRegressor (mejor: 0.136072, ronda 6) ----------

· Escalado "Standard"

· Borrado

· Asignación

· Logaritmo

· Imputaciones "boxplot"

· Descarte correlación ínfima

· Imputaciones "clarity quality"

---------- SVR (mejor: 0.103558, ronda 5) ----------

· Escalado "Standard"

· Borrado

· Asignación

· Logaritmo

· Sustitución

· Descarte correlación ínfima

---------- DecisionTree (mejor: 0.126511, ronda 7) ----------

· Borrado

· Logaritmo

· Imputaciones "ridge"

---------- RandomForest (mejor: 0.092534, ronda 4) ----------

· Borrado

· Asignación

· Sustitución

---------- XGBRegressor (mejor: 0.090042, ronda 6) ----------

· Esclado "Standard"

· Borrado

· Asignación

· Imputaciones "boxplot"


## "Ridge" y "LinearRegression"

- Cambios: escalado "MinMax", borrado, asignación, imputaciones "boxplot" e imputaciones "clarity quality"

- Como mejoran con los mismos cambios y "LinearRegression" no se ve afectada por los escalados, se prueban juntos

- Con cambios e hiperparámetros, "LinearRegression" ha mejorado su "rmse" de 0.222038 a 0.143639 (-34.82%)

- Con cambios e hiperparámetros, "Ridge" ha mejorado su "rmse" de 0.221923 a 0.144060 (-35.08%)


In [None]:
# Se crea el "scorer"
rmse = make_scorer(calculate_rmse, greater_is_better=False)


In [None]:
# Se aplican todos los cambios del listado en común
df_ridge_linear = df_diamonds.copy()

df_ridge_linear = remove_all(df_ridge_linear)

df_ridge_linear = assign_values(df_ridge_linear)

df_ridge_linear = impute_boxplot_min_max(df_ridge_linear, ['depth (percentage)', 'table (percentage)'])

df_ridge_linear.loc[df_ridge_linear['clarity quality'] == 7, 'clarity quality'] = 6


In [None]:
# Se prueban en un "dataframe" conjunto
ridge_linear = Regression(df_ridge_linear, 'price')
X_train, X_test, y_train, y_test = ridge_linear.split_dataframe(scaler='MinMaxScaler')
ridge_linear.apply_models(selected_list=['LinearRegression', 'Ridge'],
                        kfolds_num=10
                    )
ridge_linear.evaluate_metrics()
ridge_linear.create_dataframe()


-- Regression (MinMaxScaler): using best of 10 folds --
Starting LinearRegression:
- LinearRegression done in 0.35 sec(s). Total time: 0.35
Starting Ridge:
- Ridge done in 0.18 sec(s). Total time: 0.53


Unnamed: 0,LinearRegression,Ridge,BEST,WORST
rmse,0.143669,0.144156,LinearRegression,Ridge
mse,0.020641,0.020781,LinearRegression,Ridge
mae,0.11168,0.11224,LinearRegression,Ridge
r2_score,0.980007,0.979871,LinearRegression,Ridge
mape,0.014614,0.014686,LinearRegression,Ridge


In [None]:
# Se busca la mejor media de 5 "folds" para la regresión lineal modificando los hiperparámetros
# Se confirma que el mejor modelo para regresión lineal es el que tiene los valores por defecto
model = LinearRegression()

params = {'fit_intercept': [True, False],
            'positive': [True, False],
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END fit_intercept=True, positive=True;, score=-0.184 total time=   0.0s
[CV 2/5] END fit_intercept=True, positive=True;, score=-0.187 total time=   0.0s
[CV 3/5] END fit_intercept=True, positive=True;, score=-0.177 total time=   0.0s
[CV 4/5] END fit_intercept=True, positive=True;, score=-0.184 total time=   0.0s
[CV 5/5] END fit_intercept=True, positive=True;, score=-0.174 total time=   0.0s
[CV 1/5] END fit_intercept=True, positive=False;, score=-0.149 total time=   0.0s
[CV 2/5] END fit_intercept=True, positive=False;, score=-0.149 total time=   0.0s
[CV 3/5] END fit_intercept=True, positive=False;, score=-0.143 total time=   0.0s
[CV 4/5] END fit_intercept=True, positive=False;, score=-0.148 total time=   0.0s
[CV 5/5] END fit_intercept=True, positive=False;, score=-0.144 total time=   0.0s
[CV 1/5] END fit_intercept=False, positive=True;, score=-0.361 total time=   0.0s
[CV 2/5] END fit_intercept=False, positive=

In [None]:
# Mejor puntuación para regresión lineal
mean_squared_error(y_test, y_pred, squared=False)


0.14363969914101388

In [None]:
# Se intenta mejorar el "ridge" modificando los hiperparámetros
# Se obtiene el mejor modelo para "ridge"
model = Ridge()

params = {'fit_intercept': [True, False],
            'alpha': np.linspace(1, 100, num=100),
            'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'],
            'positive': [True, False]

            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


Fitting 5 folds for each of 2800 candidates, totalling 14000 fits
[CV 1/5] END alpha=1.0, fit_intercept=True, positive=True, solver=svd;, score=nan total time=   0.0s
[CV 2/5] END alpha=1.0, fit_intercept=True, positive=True, solver=svd;, score=nan total time=   0.0s
[CV 3/5] END alpha=1.0, fit_intercept=True, positive=True, solver=svd;, score=nan total time=   0.0s
[CV 4/5] END alpha=1.0, fit_intercept=True, positive=True, solver=svd;, score=nan total time=   0.0s
[CV 5/5] END alpha=1.0, fit_intercept=True, positive=True, solver=svd;, score=nan total time=   0.0s
[CV 1/5] END alpha=1.0, fit_intercept=True, positive=True, solver=cholesky;, score=nan total time=   0.0s
[CV 2/5] END alpha=1.0, fit_intercept=True, positive=True, solver=cholesky;, score=nan total time=   0.0s
[CV 3/5] END alpha=1.0, fit_intercept=True, positive=True, solver=cholesky;, score=nan total time=   0.0s
[CV 4/5] END alpha=1.0, fit_intercept=True, positive=True, solver=cholesky;, score=nan total time=   0.0s
[CV 5

7000 fits failed out of a total of 14000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1000 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Roger\AppData\Local\miniconda3\envs\data_analytics\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Roger\AppData\Local\miniconda3\envs\data_analytics\lib\site-packages\sklearn\linear_model\_ridge.py", line 1134, in fit
    return super().fit(X, y, sample_weight=sample_weight)
  File "c:\Users\Roger\AppData\Local\miniconda3\envs\data_analytics\lib\site-packages\sklearn\linear_model\_ridge.py", line 832, in fit
    raise ValueError(
ValueError: solver='s

In [None]:
# Mejor puntuación para "ridge"
mean_squared_error(y_test, y_pred, squared=False)

0.1440602916434586

## PENDIENTE "KNeighborsRegressor"

- Cambios: escalado "Standard", borrado, asignación, logaritmo, imputaciones "boxplot", descarte correlación ínfima e imputaciones "clarity quality"

- Como no se puede aplicar el logaritmo si hay ceros, se hace una asignación parcial que no incluye los "outliers"

- Con cambios e hiperparámetros, k vecinos ha mejorado su "rmse" de 0.183865 a 0.120311 (-34.56%)


In [None]:
df_neighbors = df_diamonds.copy()

df_neighbors = remove_all(df_neighbors)

df_neighbors = assign_values(df_neighbors)

df_neighbors[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']] = np.log(df_neighbors[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']])

df_neighbors = impute_boxplot_min_max(df_neighbors, ['depth (percentage)', 'table (percentage)'])

df_neighbors = df_neighbors.drop(columns=['cut quality', 'depth (percentage)'])

df_neighbors.loc[df_neighbors['clarity quality'] == 7, 'clarity quality'] = 6


In [None]:
neighbors = Regression(df_neighbors, 'price')
X_train, X_test, y_train, y_test = neighbors.split_dataframe(scaler='StandardScaler')
neighbors.apply_models(selected_list=['KNeighborsRegressor'],
                        kfolds_num=10
                    )
neighbors.evaluate_metrics()
neighbors.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting KNeighborsRegressor:
- KNeighborsRegressor done in 2.65 sec(s). Total time: 2.65


Unnamed: 0,KNeighborsRegressor
mae,0.093622
mape,0.012387
mse,0.015169
r2_score,0.985306
rmse,0.123164


In [None]:
# Se usa el "grid"
model = KNeighborsRegressor()

params = {'n_neighbors': range(5, 21),
            'algorithm': ['ball_tree', 'kd_tree', 'brute'],
            'leaf_size': range(20, 41),
            'metric': ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan'],
            'n_jobs': [-1],
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


Fitting 5 folds for each of 5040 candidates, totalling 25200 fits
[CV 1/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=5;, score=-0.124 total time=   5.4s
[CV 2/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=5;, score=-0.122 total time=   0.4s
[CV 3/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=5;, score=-0.124 total time=   0.3s
[CV 4/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=5;, score=-0.122 total time=   0.5s
[CV 5/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=5;, score=-0.123 total time=   0.3s
[CV 1/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=6;, score=-0.123 total time=   0.4s
[CV 2/5] END algorithm=ball_tree, leaf_size=20, metric=cityblock, n_jobs=-1, n_neighbors=6;, score=-0.121 total time=   0.5s
[CV 3/5] END algorithm=ball_tree, leaf_size=20, metric=city

In [None]:
# Mejor puntuación para k vecinos
mean_squared_error(y_test, y_pred, squared=False)


0.12031150183849708

## "SVR"

- Cambios: escalado "Standard", borrado, asignación, logaritmo, imputaciones "boxplot", sustitución y descarte correlación ínfima

- Con cambios e hiperparámetros, "SVR" ha mejorado su "rmse" de 0.209064 a


In [None]:
df_svr = df_diamonds.copy()

df_svr = remove_all(df_svr)

df_svr = assign_values(df_svr)

df_svr[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']] = np.log(df_svr[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']])

df_svr = impute_boxplot_min_max(df_svr, ['depth (percentage)', 'table (percentage)'])

df_svr['depth (percentage)'] = (df_svr['depth (millimeters)'] / ((df_svr['lenght (millimeters)']+df_svr['width (millimeters)']) / 2)) * 100

df_svr = df_svr.drop(columns=['cut quality', 'depth (percentage)'])


In [None]:
svr = Regression(df_svr, 'price')
X_train, X_test, y_train, y_test = svr.split_dataframe(scaler='StandardScaler')
svr.apply_models(selected_list=['SVR'],
                        kfolds_num=10
                    )
svr.evaluate_metrics()
svr.create_dataframe()


-- Regression (StandardScaler): using best of 10 folds --
Starting SVR:
- SVR done in 252.86 sec(s). Total time: 252.86


Unnamed: 0,SVR
mae,0.079636
mape,0.010407
mse,0.010841
r2_score,0.989499
rmse,0.104119


In [None]:
# Como es lento, se empieza por probar solo los diferentes "kernels"
model = SVR()

params = {'kernel': ['linear', 'poly', 'rbf']
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4,
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ....................kernel=linear;, score=-0.151 total time= 1.3min
[CV 2/5] END ....................kernel=linear;, score=-0.152 total time= 1.2min
[CV 3/5] END ....................kernel=linear;, score=-0.148 total time= 1.3min
[CV 4/5] END ....................kernel=linear;, score=-0.149 total time= 1.3min
[CV 5/5] END ....................kernel=linear;, score=-0.145 total time= 1.3min
[CV 1/5] END ......................kernel=poly;, score=-0.506 total time= 1.2min
[CV 2/5] END ......................kernel=poly;, score=-0.507 total time= 1.3min
[CV 3/5] END ......................kernel=poly;, score=-0.468 total time= 1.2min
[CV 4/5] END ......................kernel=poly;, score=-0.492 total time= 1.2min
[CV 5/5] END ......................kernel=poly;, score=-0.446 total time= 1.2min
[CV 1/5] END .......................kernel=rbf;, score=-0.106 total time=  20.8s
[CV 2/5] END .......................kernel=rbf;, 

In [None]:
# "rbf" es el mejor y es bastante rápido. "poly" es tan lento que es inviable iterar con él
# Se prueba con varios "gamma" superiores al estándar
model = SVR()

params = {'kernel': ['rbf'],
            'gamma': range(10, 41),
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4,
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


In [None]:
# Se prueba, también, con varios gamma inferiores al estándar. Este es el mejor "SVR" obtenido
model = SVR()

params = {'kernel': ['rbf'],
            'gamma': [0.0001, 0.0003, 0.0005, 0.0007, 0.001, 0.003, 0.005, 0.007, 0.01, 0.03, 0.05, 0.07, 0.1, 0.3, 0.5, 0.7, 1, 3 , 5, 7],
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4,
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END .........gamma=0.0001, kernel=rbf;, score=-0.986 total time=  53.3s
[CV 2/5] END .........gamma=0.0001, kernel=rbf;, score=-0.988 total time=  53.7s
[CV 3/5] END .........gamma=0.0001, kernel=rbf;, score=-0.959 total time=  56.0s
[CV 4/5] END .........gamma=0.0001, kernel=rbf;, score=-0.968 total time=  56.0s
[CV 5/5] END .........gamma=0.0001, kernel=rbf;, score=-0.969 total time=  55.1s
[CV 1/5] END .........gamma=0.0003, kernel=rbf;, score=-0.990 total time=  54.6s
[CV 2/5] END .........gamma=0.0003, kernel=rbf;, score=-0.992 total time=  54.6s
[CV 3/5] END .........gamma=0.0003, kernel=rbf;, score=-0.962 total time=  54.2s
[CV 4/5] END .........gamma=0.0003, kernel=rbf;, score=-0.971 total time=  54.7s
[CV 5/5] END .........gamma=0.0003, kernel=rbf;, score=-0.972 total time=  54.5s
[CV 1/5] END .........gamma=0.0005, kernel=rbf;, score=-0.991 total time=  54.3s
[CV 2/5] END .........gamma=0.0005, kernel=rbf;

In [None]:
# Mejor puntuación para "SVR"
mean_squared_error(y_test, y_pred, squared=False)


## "DecisionTree"

- Cambios: borrado, logaritmo, imputaciones "ridge" y descarte correlación ínfima

- Como hay que aplicar el logaritmo y la asignación no mejora los resultados, se usa una asignación parcial que imputa los valores 0 pero no afecta a los "outliers"

- Con cambios e hiperparámetros, "DecisionTree" ha mejorado su "rmse" de 0.130142 a


In [None]:
df_tree = df_diamonds.copy()

df_tree = remove_all(df_tree)

df_tree = assign_values(df_tree, outlier=False)

df_tree[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']] = np.log(df_tree[['weight (carat)', 'lenght (millimeters)', 'width (millimeters)', 'depth (millimeters)']])

df_tree = apply_ridge(df_tree)

df_tree = df_tree.drop(columns=['cut quality', 'depth (percentage)'])


In [None]:
tree = Regression(df_tree, 'price')
X_train, X_test, y_train, y_test = tree.split_dataframe()
tree.apply_models(selected_list=['DecisionTreeRegressor'],
                    params_list=[['DecisionTreeRegressor', 'random_state=43']],
                        kfolds_num=10
                    )
tree.evaluate_metrics()
tree.create_dataframe()


In [None]:
# Se usa el "grid"
# En una segunda vuelta se miran los hiperparámentros "max_depth" y "min_samples_split", que están relacionados
model = DecisionTreeRegressor()

params = {'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            'splitter': ['best', 'random'],
            'max_features': ['auto', 'sqrt', 'log2'],
            'min_samples_leaf': range(1, 11)
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


In [None]:
# Mejor puntuación para "DecisionTree"
mean_squared_error(y_test, y_pred, squared=False)

## "RandomForest"

- Cambios: borrado, asignación y sustitución

- Con cambios e hiperparámetros, "RandomForest" ha mejorado su "rmse" de 0.098101 a


In [None]:
df_forest = df_diamonds.copy()

df_forest = remove_all(df_forest)

df_forest = assign_values(df_forest)

df_forest['depth (percentage)'] = (df_forest['depth (millimeters)'] / ((df_forest['lenght (millimeters)']+df_forest['width (millimeters)']) / 2)) * 100


In [None]:
forest = Regression(df_forest, 'price')
X_train, X_test, y_train, y_test = forest.split_dataframe()
forest.apply_models(selected_list=['RandomForestRegressor'],
                    params_list=[['RandomForestRegressor', 'random_state=43']],
                        kfolds_num=10
                    )
forest.evaluate_metrics()
forest.create_dataframe()


In [None]:
# En una segunda vuelta se miran los hiperparámentros "max_depth" y "min_samples_split", que están relacionados
# En una tercera vuelta se prueba con "boolstrap=True"
model = RandomForestRegressor()

params = {'criterion': ['gini', 'entropy', 'log_loss'],
            'max_features': ['sqrt', 'log2', None],
            'min_samples_leaf': range(1, 11),
            'oob_score': [True, False]
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


In [None]:
# Mejor puntuación para "RandomForest"
mean_squared_error(y_test, y_pred, squared=False)

## "XGBRegressor"

- Cambios: escalado "standard", borrado, asignación e imputaciones "boxplot"

- Con cambios e hiperparámetros, "XGBRegressor" ha mejorado su "rmse" de 0.094787 a


In [None]:
df_xgb = df_diamonds.copy()

df_xgb = remove_all(df_xgb)

df_xgb = assign_values(df_xgb)

df_xgb = impute_boxplot_min_max(df_xgb, ['depth (percentage)', 'table (percentage)'])


In [None]:
xgb = Regression(df_xgb, 'price')
X_train, X_test, y_train, y_test = xgb.split_dataframe()
xgb.apply_models(selected_list=['XGBRegressor'],
                    params_list=[['XGBRegressor', 'random_state=43']],
                        kfolds_num=10
                    )
xgb.evaluate_metrics()
xgb.create_dataframe()


In [None]:
model = XGBRegressor()

params = {
            }

grid = GridSearchCV(estimator = model,
                    param_grid = params,
                    scoring=rmse,
                    verbose=4
                    )

grid.fit(X_train, y_train)

y_pred = grid.predict(X_test)

print(f'Best params: {grid.best_params_}')


In [None]:
# Mejor puntuación para "XGBRegressor"
mean_squared_error(y_test, y_pred, squared=False)
