In [1]:
import pandas as pd
import pickle

import sys
sys.path.append("../")
import src.soporte as sp

In [2]:
# Load the training data and preview its first two rows.
df = pd.read_csv("../data/train.csv")
df.head(n=2)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Premium,D,SI2,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,Ideal,E,VVS2,62.7,56.0,6.42,6.46,4.04,9.183


In [3]:
# Drop the identifier column, which is not used as a feature.
# Note: this mutates `df` in place, so re-running the cell without reloading
# the data raises KeyError because "id" is already gone.
df.drop(columns=["id"], inplace=True)

In [4]:
# Drop the variables considered of little relevance. Although "carat" already
# summarises "x", "y" and "z", this experiment keeps the three dimensions and
# removes "carat" instead.
# Note: in-place mutation; re-running without reloading `df` raises KeyError.
df.drop(columns=["depth", "table", "carat"], inplace=True)

In [5]:
# Detect the outliers with a helper function so they can be treated afterwards;
# the result is stored in `out` and handed to sp.tratar_outliers in the next cell.
# NOTE(review): nothing is removed at this step. Presumably "price" is the response
# column left out of the scan -- confirm against src/soporte.
out = sp.detectar_outliers(df, "price")

100%|██████████| 3/3 [00:00<00:00, 374.97it/s]


In [6]:
# Replace the detected outliers with nulls (mode "null") so they can be imputed
# in the following step.
# NOTE(review): whether `df` itself is modified or a copy is returned depends on
# sp.tratar_outliers -- confirm against src/soporte.
df_sin_out = sp.tratar_outliers(df, out, "null")

100%|██████████| 3/3 [00:00<00:00, 747.34it/s]


In [7]:
# Impute the nulls using the "iterative" method (IterativeImputer, per the
# original note), with "price" passed as the response variable.
df2 = sp.tratamiento_nulos_num(df_sin_out, metodo = "iterative", respuesta = "price")

In [8]:
# Label-encode the three categorical variables to see how this approach performs.
# (`df_enconded` keeps its original spelling because later cells reference it.)
encoding = dict.fromkeys(["cut", "color", "clarity"], "label")
df_enconded = sp.encoder(df2, encoding, modelo=2)

100%|██████████| 3/3 [00:00<00:00, 206.68it/s]


In [9]:
# Search for candidate baseline hyperparameters for our tree models,
# using "price" as the target column.
parametros = sp.mejores_parametros_num(df_enconded, "price")

In [10]:
# Display the baseline hyperparameters returned by sp.mejores_parametros_num.
parametros

{'max_depth': 20,
 'max_features': 3,
 'min_samples_leaf': 25,
 'min_samples_split': 25}

In [11]:
# Hyperparameter grid built around the baseline values in `parametros`; it is
# passed as `parametros_tree` to sp.modelos_num in the training cells.
# max_depth combines the shallow depths 2-6 with coarser steps of 4 up to the
# baseline depth. The stepped range starts at 10 rather than 6 so that depth 6
# is not listed twice (a duplicated candidate would be evaluated twice by any
# grid search without changing the outcome).
new_parametros = {
    "max_depth": [2, 3, 4, 5, 6] + list(range(10, parametros["max_depth"] + 1, 4)),
    "max_features": [1, 2, 3, 4],
    "min_samples_leaf": [20, 40, 60, 80],
    "min_samples_split": [20, 40, 60, 80],
}

In [12]:
# Train four different models so that their metrics can be compared.
metricas = sp.modelos_num(
    df_enconded,
    "price",
    lista=["tree", "forest", "knn", "gradient"],
    parametros_tree=new_parametros,
    scoring="neg_mean_squared_error",
    modelo=2,
)

100%|██████████| 4/4 [41:12<00:00, 618.21s/it]


In [13]:
# Load the previously saved metrics instead of regenerating the models, which
# takes a long time. This overwrites any `metricas` produced by the training cell.
# Security note: pickle.load can execute arbitrary code contained in the file,
# so only load pickles generated by this project.
with open("../data/metricas_2.pkl", mode="rb") as pkl_file:
    metricas = pickle.load(pkl_file)

# Compare the metrics; in this case the models look interesting.
metricas

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.093254,0.017723,0.133128,0.982646,test,Decision_Tree 2
1,0.085066,0.014826,0.121763,0.985703,train,Decision_Tree 2
0,0.081903,0.013743,0.117231,0.986543,test,Random_Forest 2
1,0.077499,0.012675,0.112582,0.987778,train,Random_Forest 2
0,0.079777,0.01474,0.121409,0.985567,test,KNN 2
1,0.05454,0.006765,0.082251,0.993476,train,KNN 2
0,0.06467,0.00847,0.092035,0.991706,test,Gradient_Booster 2
1,0.051235,0.005151,0.071768,0.995033,train,Gradient_Booster 2


In [14]:
# Train the selected models (KNN and gradient boosting) again, this time on all
# the data (per the original note; presumably what comparativa=False selects).
metricas = sp.modelos_num(
    df_enconded,
    "price",
    lista=["knn", "gradient"],
    parametros_tree=new_parametros,
    scoring="neg_mean_squared_error",
    comparativa=False,
    modelo=2,
)

100%|██████████| 2/2 [40:20<00:00, 1210.11s/it]
