# Decision Tree Algorithm

Tras confirmar que los modelos lineales no pueden resolver este dataset, voy a entrenar un modelo de árbol de decisión (`DecisionTreeRegressor`).

In [28]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

## Importación de datos

In [29]:
# Directory that holds the processed train/test Excel files
BASE_PATH = "../data/processed"

# Training feature files (raw / norm / scal, going by the file names)
TRAIN_PATHS = [
    "X_train_raw.xlsx",
    "X_train_norm.xlsx",
    "X_train_scal.xlsx",
]
# One DataFrame per training file, in the same order as TRAIN_PATHS
TRAIN_DATASETS = [pd.read_excel(f"{BASE_PATH}/{path}") for path in TRAIN_PATHS]

# Test feature files, listed in the same order as TRAIN_PATHS so that
# TRAIN_DATASETS[i] and TEST_DATASETS[i] can be paired by position
TEST_PATHS = [
    "X_test_raw.xlsx",
    "X_test_norm.xlsx",
    "X_test_scal.xlsx",
]
# One DataFrame per test file, in the same order as TEST_PATHS
TEST_DATASETS = [pd.read_excel(f"{BASE_PATH}/{path}") for path in TEST_PATHS]

# Target values for each split
y_train = pd.read_excel(f"{BASE_PATH}/y_train.xlsx")
y_test = pd.read_excel(f"{BASE_PATH}/y_test.xlsx")

In [30]:
# Baseline: fit a decision tree with default hyperparameters on every entry of
# TRAIN_DATASETS and score it on the matching entry of TEST_DATASETS.
results = []
for X_train_variant, X_test_variant in zip(TRAIN_DATASETS, TEST_DATASETS):
    model = DecisionTreeRegressor(random_state=42)
    model.fit(X_train_variant, y_train)

    y_pred_train = model.predict(X_train_variant)
    y_pred_test = model.predict(X_test_variant)

    # One record per dataset; rows keep the order of TRAIN_DATASETS
    scores = {
        "train_r2": r2_score(y_train, y_pred_train),
        "train_mse": mean_squared_error(y_train, y_pred_train),
        "test_r2": r2_score(y_test, y_pred_test),
        "test_mse": mean_squared_error(y_test, y_pred_test),
    }
    results.append(scores)

# Derive RMSE from MSE, then keep only the columns shown in the summary table
summary_columns = ["train_rmse", "test_rmse", "train_r2", "test_r2"]
df_results = pd.DataFrame(results).assign(
    train_rmse=lambda frame: np.sqrt(frame["train_mse"]),
    test_rmse=lambda frame: np.sqrt(frame["test_mse"]),
)[summary_columns]

# Show the table
print(df_results)

   train_rmse  test_rmse  train_r2   test_r2
0         0.0   4.171055       1.0 -1.495454
1         0.0   4.209249       1.0 -1.541364
2         0.0   4.218529       1.0 -1.552582


In [31]:
# Hyperparameter search space; every combination of the listed values is a
# candidate (4 * 3 * 2 * 2 = 48 in total).
param_grid = dict(
    max_depth=[3, 5, 8, None],
    min_samples_split=[2, 5, 10],
    max_features=["sqrt", 0.8],
    criterion=["squared_error", "absolute_error"],
)

In [33]:
# Position in TRAIN_DATASETS / TEST_DATASETS of the feature set to tune on.
# Index 0 is the first file listed in TRAIN_PATHS.
best_dataset = 0

model_optimized = DecisionTreeRegressor(random_state=42)

# Exhaustive search over param_grid with 5-fold cross-validation, ranked by R².
# n_jobs=-1 uses all available cores instead of a machine-specific worker count;
# it only affects parallelism, not the selected model.
grid_optimized = GridSearchCV(
    estimator=model_optimized,
    param_grid=param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,
)
grid_optimized.fit(TRAIN_DATASETS[best_dataset], y_train)

# With GridSearchCV's default refit=True, best_estimator_ is the winning
# configuration retrained on the whole training set.
best_model = grid_optimized.best_estimator_

y_pred_train = best_model.predict(TRAIN_DATASETS[best_dataset])
y_pred_test = best_model.predict(TEST_DATASETS[best_dataset])

# Same metric names as the baseline records so both tables can be compared
results_optimized = [
    {
        "train_r2": r2_score(y_train, y_pred_train),
        "train_mse": mean_squared_error(y_train, y_pred_train),
        "test_r2": r2_score(y_test, y_pred_test),
        "test_mse": mean_squared_error(y_test, y_pred_test),
    }
]

df_optimized = pd.DataFrame(results_optimized)

print(df_optimized)
print('-----------------------')
print(f"Error cuadrático medio: {mean_squared_error(y_test, y_pred_test)}")
print('-----------------------')
print("Mejores hiperparámetros:", grid_optimized.best_params_)

   train_r2  train_mse   test_r2  test_mse
0  0.106078    6.54798 -0.112507  7.756126
-----------------------
Error cuadrático medio: 7.756126297783932
-----------------------
Mejores hiperparámetros: {'criterion': 'squared_error', 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_split': 2}


Dado que este modelo también me ha dado valores de R² muy bajos, entiendo que no hay suficiente correlación entre los datos y la variable target.

Esto puede suceder porque los datos son sintéticos o bien porque faltan variables descriptivas con las cuales poder obtener los resultados.

Para hacer una última comprobación habría que utilizar un modelo de redes neuronales para intentar capturar relaciones complejas entre los datos si es que existen.