In [1]:
import pandas as pd
import pickle

import sys
sys.path.append("../")
import src.soporte as sp

In [2]:
# Load the training data and preview its first two rows
df = pd.read_csv(filepath_or_buffer="../data/train.csv")
df.head(n=2)

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.3,Premium,D,SI2,62.4,58.0,4.31,4.28,2.68,6.353
1,1,1.01,Ideal,E,VVS2,62.7,56.0,6.42,6.46,4.04,9.183


In [3]:
# Drop the column that will not be used.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution is a no-op instead of a KeyError.
df = df.drop(columns=["id"], errors="ignore")

In [4]:
# Drop the variables considered less relevant. "carat" is removed while
# "x", "y" and "z" are kept: even though carat encompasses those three, the
# author wants to try this approach.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps
# this cell safe to re-run: a second execution is a no-op instead of a KeyError.
df = df.drop(columns=["depth", "table", "carat"], errors="ignore")

In [5]:
# Detect outliers through a helper, with custom per-column "top" thresholds
# (presumably upper bounds — confirm against sp.detectar_outliers).
limites_superiores = (("x", 15), ("y", 20), ("z", 10))
especial = {columna: {"top": tope} for columna, tope in limites_superiores}
out = sp.detectar_outliers(df, "price", especial)

100%|██████████| 3/3 [00:00<00:00, 599.93it/s]


In [6]:
# Replace the detected outliers with nulls so they can be processed afterwards
df_sin_out = sp.tratar_outliers(
    df,
    out,
    "null",
)

100%|██████████| 3/3 [00:00<00:00, 2996.64it/s]


In [7]:
# Handle the nulls with the IterativeImputer method
df2 = sp.tratamiento_nulos_num(
    df_sin_out,
    metodo="iterative",
    respuesta="price",
)

In [8]:
# Label-encode every categorical variable to see how that performs
encoding = dict.fromkeys(("cut", "color", "clarity"), "label")
df_enconded = sp.encoder(df2, encoding, modelo=1)

100%|██████████| 3/3 [00:00<00:00, 199.94it/s]


In [9]:
# Look for candidate baseline parameters for our trees
parametros = sp.mejores_parametros_num(
    df_enconded,
    "price",
)

In [10]:
# Display the baseline parameters returned by sp.mejores_parametros_num
parametros

{'max_depth': 14,
 'max_features': 3,
 'min_samples_leaf': 25,
 'min_samples_split': 25}

In [11]:
# Build the hyperparameter grid around the baseline parameters.
# max_depth candidates: 2, 3, then every 4th depth from 4 up to the baseline
# max_depth (inclusive bound).
# NOTE(review): with a baseline max_depth of 14 the range yields 4, 8, 12, so
# 14 itself is not part of the grid — confirm this is intended.
profundidades = [2, 3] + list(range(4, parametros["max_depth"] + 1, 4))
new_parametros = {
    "max_depth": profundidades,
    "max_features": [1, 2, 3, 4],
    "min_samples_leaf": [25, 50, 75],
    "min_samples_split": [25, 50, 75],
}

In [12]:
# Train four different models so their metrics can be compared
metricas = sp.modelos_num(
    df_enconded,
    "price",
    lista=["tree", "forest", "knn", "gradient"],
    parametros_tree=new_parametros,
    scoring="neg_mean_squared_error",
    modelo=1,
)

100%|██████████| 4/4 [11:14<00:00, 168.57s/it]


In [13]:
# Load the saved metrics here to avoid regenerating the models (very slow)
with open('../data/metricas_1.pkl', 'rb') as archivo_metricas:
    metricas = pickle.load(archivo_metricas)
# Compare the metrics; in this case the models look interesting.
metricas

Unnamed: 0,MAE,MSE,RMSE,R2,set,modelo
0,0.098205,0.019,0.137841,0.981395,test,Decision_Tree 1
1,0.092533,0.016829,0.129728,0.983772,train,Decision_Tree 1
0,0.087359,0.014986,0.122416,0.985326,test,Random_Forest 1
1,0.083844,0.013866,0.117754,0.986629,train,Random_Forest 1
0,0.079325,0.013015,0.114085,0.987256,test,KNN 1
1,0.059965,0.007533,0.086794,0.992736,train,KNN 1
0,0.064328,0.008165,0.09036,0.992005,test,Gradient_Booster 1
1,0.053506,0.005679,0.075362,0.994523,train,Gradient_Booster 1


In [14]:
# Train those models again, this time with all of the data
# (comparativa=False presumably disables the train/test comparison — confirm
# against sp.modelos_num).
metricas = sp.modelos_num(
    df_enconded,
    "price",
    lista=["knn", "gradient"],
    parametros_tree=new_parametros,
    scoring="neg_mean_squared_error",
    comparativa=False,
    modelo=1,
)

100%|██████████| 2/2 [08:43<00:00, 261.57s/it]
