In [10]:
import pandas as pd
import pickle

import sys
sys.path.append("../")
import src.soporte as sp

In [11]:
# Load the training set from the project's data folder and preview the first two rows.
train_csv = "../data/train.csv"
df = pd.read_csv(train_csv)
df.head(n=2)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Premium,D,SI2,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,Ideal,E,VVS2,62.7,56.0,6.42,6.46,4.04,9.183


In [12]:
# Drop the identifier column, which is not used as a feature.
# Rebinding instead of inplace=True avoids mutating the loaded frame in place, and
# errors="ignore" keeps the cell re-runnable: a second run used to raise KeyError
# because "id" had already been removed.
df = df.drop(columns=["id"], errors="ignore")

In [13]:
# Drop the variables considered less relevant. The author's working assumption for
# this experiment is that carat already captures the information in x, y and z.
# Rebinding instead of inplace=True avoids mutating the frame in place, and
# errors="ignore" keeps the cell re-runnable: a second run used to raise KeyError
# because the columns had already been removed.
df = df.drop(columns=["depth", "table", "x", "y", "z"], errors="ignore")

In [14]:
# Detect outliers with the project helper, passing per-column custom limits.
# Each entry maps a column name to {"top": limit}; presumably "top" is an upper bound
# understood by sp.detectar_outliers — TODO confirm against src/soporte.
# NOTE(review): x, y and z are dropped from df in an earlier cell, so these custom
# limits likely have no effect here — confirm whether they should be removed or
# whether outlier detection was meant to run before the drop.
especial = {
    column: {"top": limit}
    for column, limit in (("x", 15), ("y", 20), ("z", 10))
}
# "price" is passed as the second argument; presumably the response column — verify.
out = sp.detectar_outliers(df, "price", especial)

100%|██████████| 1/1 [00:00<00:00, 500.33it/s]


In [15]:
# Replace the detected outliers with nulls so they can be imputed in a later step.
# The "null" argument selects that strategy in sp.tratar_outliers (per the original
# author's note; the helper's implementation is not visible here).
df_sin_out = sp.tratar_outliers(df, out, "null")

100%|██████████| 1/1 [00:00<00:00, 22.46it/s]


In [16]:
# Impute the nulls using the "iterative" method of the project helper; according to
# the original author's note this corresponds to IterativeImputer.
# "price" is passed as `respuesta`; presumably the response column — verify in src/soporte.
df2 = sp.tratamiento_nulos_num(df_sin_out, metodo = "iterative", respuesta = "price")

In [17]:
# Encode the three categorical columns, requesting "label" encoding for each one
# as an experiment to see how the models behave with it.
encoding = dict.fromkeys(("cut", "color", "clarity"), "label")
# modelo=3 is forwarded to the helper; presumably an experiment/model identifier — verify.
df_enconded = sp.encoder(df2, encoding, modelo=3)

100%|██████████| 3/3 [00:00<00:00, 193.31it/s]


In [18]:
# Search for baseline hyperparameters for the tree-based models, using "price" as
# the second argument (presumably the target column — verify in src/soporte).
parametros = sp.mejores_parametros_num(df_enconded, "price")

In [19]:
# Display the baseline hyperparameters returned by sp.mejores_parametros_num.
parametros

{'max_depth': 20,
 'max_features': 3,
 'min_samples_leaf': 25,
 'min_samples_split': 25}

In [20]:
# Build the hyperparameter grid for the tree-based models.
# max_depth tries 2 and 3, then every multiple of 4 that does not exceed the
# baseline max_depth stored in `parametros`.
new_parametros = dict(
    max_depth=[2, 3, *range(4, parametros["max_depth"] + 1, 4)],
    max_features=[1, 2, 3, 4],
    min_samples_leaf=[20, 40, 60, 80],
    min_samples_split=[20, 40, 60, 80],
)

In [21]:
# Train the two models listed below ("knn" and "gradient") so their metrics can be
# compared. This is slow: the recorded run of this cell took about 17 minutes.
metricas = sp.modelos_num(
    df_enconded,
    "price",
    lista=["knn", "gradient"],
    parametros_tree=new_parametros,
    scoring="neg_mean_squared_error",
    modelo=3,
)

100%|██████████| 2/2 [17:45<00:00, 532.99s/it]


In [22]:
# Load the previously saved metrics instead of regenerating the models, which takes
# a long time. This rebinds `metricas`, replacing whatever an earlier cell assigned.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles produced by this project.
metricas_pkl = '../data/metricas_3.pkl'
with open(metricas_pkl, mode='rb') as pkl_file:
    metricas = pickle.load(pkl_file)
# Compare the metrics; in the author's view the models look interesting in this case.
metricas

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.104896,0.034868,0.18673,0.965857,test,KNN 3
1,0.075773,0.013968,0.118186,0.986531,train,KNN 3
0,0.076399,0.010217,0.101077,0.989996,test,Gradient_Booster 3
1,0.073483,0.009381,0.096856,0.990954,train,Gradient_Booster 3


In [23]:
# Train the same two models again, this time on all the data (per the original
# author's note). comparativa=False is what differs from the earlier call;
# presumably it disables the train/test comparison — verify in src/soporte.
metricas = sp.modelos_num(
    df_enconded,
    "price",
    lista=["knn", "gradient"],
    parametros_tree=new_parametros,
    scoring="neg_mean_squared_error",
    comparativa=False,
    modelo=3,
)

100%|██████████| 2/2 [20:14<00:00, 607.08s/it]
