# Random Forest Classifier

In [9]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Directory that holds the processed train/test spreadsheets.
BASE_PATH = "../data/processed"


def _read_processed(file_name):
    """Read one Excel file located under BASE_PATH into a DataFrame."""
    return pd.read_excel(f"{BASE_PATH}/{file_name}")


# Training feature files; judging by the file names, one variant keeps the
# outliers ("con") and the other has them removed ("sin").
TRAIN_PATHS = [
    "X_train_con_outliers_raw.xlsx",
    "X_train_sin_outliers_raw.xlsx",
]

# Test feature files, listed in the same order as TRAIN_PATHS so that
# position i in one list corresponds to position i in the other.
TEST_PATHS = [
    "X_test_con_outliers_raw.xlsx",
    "X_test_sin_outliers_raw.xlsx",
]

# One DataFrame per file, in the order the files are listed above.
TRAIN_DATASETS = [_read_processed(file_name) for file_name in TRAIN_PATHS]
TEST_DATASETS = [_read_processed(file_name) for file_name in TEST_PATHS]

# Targets; a single y_train / y_test pair is used with every feature variant.
y_train = _read_processed("y_train.xlsx")
y_test = _read_processed("y_test.xlsx")

In [11]:
# Baseline: fit an untuned random forest on every training-set variant and
# record accuracy and MSE on the training data and on the matching test data.

# read_excel returns the targets as 2-D DataFrames, but scikit-learn expects a
# 1-D target; passing the DataFrame directly makes fit() emit a
# DataConversionWarning on every call. Flatten once and reuse.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

results = []

for index, dataset in enumerate(TRAIN_DATASETS):
    # Fixed seed so the baseline numbers are reproducible across runs.
    model_Random = RandomForestClassifier(random_state=42)
    model_Random.fit(dataset, y_train_flat)

    y_pred_train = model_Random.predict(dataset)
    # TEST_DATASETS is index-aligned with TRAIN_DATASETS.
    y_pred_test = model_Random.predict(TEST_DATASETS[index])

    results.append({
        "train_accuracy": accuracy_score(y_train_flat, y_pred_train),
        "test_accuracy": accuracy_score(y_test_flat, y_pred_test),
        "train_MSE": mean_squared_error(y_train_flat, y_pred_train),
        "test_MSE": mean_squared_error(y_test_flat, y_pred_test),
    })

# One row per dataset variant, in TRAIN_DATASETS order.
df_results = pd.DataFrame(results)
print(df_results)

   train_accuracy  test_accuracy  train_MSE  test_MSE
0             1.0       0.876623        0.0  0.123377
1             1.0       0.883117        0.0  0.116883


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


In [18]:
# Hyper-parameter grid explored by the random-forest grid search.
# Each key is a RandomForestClassifier argument; each list holds the candidate
# values to try. Single-element lists pin that argument to one value.
random_param_grid = {
    "bootstrap": [True],                # pinned: always bootstrap the samples
    "class_weight": [None],             # pinned: None applies no class re-weighting
    "criterion": ["entropy"],           # pinned: split quality measured by entropy
    "max_depth": list(range(2, 8)),     # shallow trees, depths 2 through 7
    "max_features": [0.65, 0.70, 0.75], # fraction of features considered per split
    "min_samples_leaf": [1, 2],         # minimum samples allowed in a leaf
    "min_samples_split": [4, 5],        # minimum samples needed to split a node
    "n_estimators": [100, 125, 150],    # number of trees in the forest
}

In [19]:
# Tune the random forest with an exhaustive, 5-fold cross-validated grid
# search over random_param_grid, then score the best estimator on the
# training data and on the matching test data.

# Position in TRAIN_DATASETS / TEST_DATASETS of the dataset variant to tune on.
best_index_random = 1

X_train_best = TRAIN_DATASETS[best_index_random]
X_test_best = TEST_DATASETS[best_index_random]

# The targets are loaded as 2-D DataFrames; scikit-learn expects 1-D arrays.
y_train_flat = y_train.values.ravel()
y_test_flat = y_test.values.ravel()

model_forest_optimized = RandomForestClassifier(random_state=42)

forest_grid_optimized = GridSearchCV(
    estimator=model_forest_optimized,
    param_grid=random_param_grid,
    scoring="accuracy",
    cv=5,
    # -1 uses every available core instead of a count tied to one machine.
    n_jobs=-1,
)
forest_grid_optimized.fit(X_train_best, y_train_flat)

# Estimator refit on the full training data with the best parameter combination.
r_forest_model = forest_grid_optimized.best_estimator_

y_pred_random_train = r_forest_model.predict(X_train_best)
y_pred_random_test = r_forest_model.predict(X_test_best)

# Computed once; used both in the results table and in the printed summary.
test_mse = mean_squared_error(y_test_flat, y_pred_random_test)

results_random_optimized = []

results_random_optimized.append({
    "train_accuracy": accuracy_score(y_train_flat, y_pred_random_train),
    "test_accuracy": accuracy_score(y_test_flat, y_pred_random_test),
    "train_f1": f1_score(y_train_flat, y_pred_random_train),
    "test_f1": f1_score(y_test_flat, y_pred_random_test),
    "train_MSE": mean_squared_error(y_train_flat, y_pred_random_train),
    "test_MSE": test_mse,
})

df_random_optimized = pd.DataFrame(results_random_optimized)

print(df_random_optimized)
print('-----------------------')
print(f"Error cuadrático medio: {test_mse}")
print('-----------------------')
print("Mejores hiperparámetros:", forest_grid_optimized.best_params_)

# Results recorded from earlier runs of this cell, kept for comparison.
# NOTE(review): the labels presumably name the class_weight setting used in
# each run ("None" matches the current grid and output) — confirm.

# Original
#   train_accuracy  test_accuracy  train_f1  test_f1  train_MSE  test_MSE
#0        0.912052       0.883117  0.869565     0.82   0.087948  0.116883
#-----------------------
#Error cuadrático medio: 0.11688311688311688

# None
#   train_accuracy  test_accuracy  train_f1   test_f1  train_MSE  test_MSE
#0        0.973941       0.896104  0.961722  0.849057   0.026059  0.103896
#-----------------------
#Error cuadrático medio: 0.1038961038961039

# Balanced
#   train_accuracy  test_accuracy  train_f1   test_f1  train_MSE  test_MSE
#0        0.991857        0.88961  0.988235  0.841121   0.008143   0.11039
#-----------------------
#Error cuadrático medio: 0.11038961038961038

   train_accuracy  test_accuracy  train_f1   test_f1  train_MSE  test_MSE
0        0.973941       0.896104  0.961722  0.849057   0.026059  0.103896
-----------------------
Error cuadrático medio: 0.1038961038961039
-----------------------
Mejores hiperparámetros: {'bootstrap': True, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 6, 'max_features': 0.65, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


Aunque los resultados son buenos, observo algo de sobreajuste (overfitting) en el modelo: la exactitud en entrenamiento (0.974) es superior a la de prueba (0.896). Esto posiblemente se deba a un desbalanceo de clases.