# Machine Learning

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet, Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV

## Importo todos mis datasets

In [2]:
# Directory holding the processed train/test spreadsheets
BASE_PATH = "../data/processed"

# Training feature files (the names suggest raw / normalized / scaled variants)
TRAIN_PATHS = [
    "X_train_raw.xlsx",
    "X_train_norm.xlsx",
    "X_train_scal.xlsx",
]

# Test feature files, listed in the same order as TRAIN_PATHS so that
# TRAIN_DATASETS[i] and TEST_DATASETS[i] belong to the same variant
TEST_PATHS = [
    "X_test_raw.xlsx",
    "X_test_norm.xlsx",
    "X_test_scal.xlsx",
]

# One DataFrame per file, kept in list order
TRAIN_DATASETS = [pd.read_excel(f"{BASE_PATH}/{path}") for path in TRAIN_PATHS]
TEST_DATASETS = [pd.read_excel(f"{BASE_PATH}/{path}") for path in TEST_PATHS]

# Targets are shared by every feature variant
y_train = pd.read_excel(f"{BASE_PATH}/y_train.xlsx")
y_test = pd.read_excel(f"{BASE_PATH}/y_test.xlsx")

### Entreno un modelo sin optimizaciones para encontrar el mejor dataset

In [3]:
# Baseline: fit an untuned LinearRegression on each training variant and
# score it on the training data and on the test variant at the same position.
results = []
for X_train_variant, X_test_variant in zip(TRAIN_DATASETS, TEST_DATASETS):
    model = LinearRegression(n_jobs=-1).fit(X_train_variant, y_train)
    y_pred_train = model.predict(X_train_variant)
    y_pred_test = model.predict(X_test_variant)

    results.append({
        "train_r2": r2_score(y_train, y_pred_train),
        "train_mse": mean_squared_error(y_train, y_pred_train),
        "test_r2": r2_score(y_test, y_pred_test),
        "test_mse": mean_squared_error(y_test, y_pred_test),
    })

# Derive RMSE from MSE, then keep only the columns shown in the table
df_results = (
    pd.DataFrame(results)
    .assign(
        train_rmse=lambda scores: np.sqrt(scores["train_mse"]),
        test_rmse=lambda scores: np.sqrt(scores["test_mse"]),
    )
    .loc[:, ["train_rmse", "test_rmse", "train_r2", "test_r2"]]
)
print(df_results)

   train_rmse  test_rmse  train_r2   test_r2
0    2.648457   2.665894  0.042413 -0.019397
1    2.648457   2.665894  0.042413 -0.019397
2    2.648457   2.665894  0.042413 -0.019397


Dado que estos resultados no son buenos, voy a probar con regresión lineal regularizada para intentar obtener un mejor resultado. Si esto no funciona, intentaré un modelo de Tree Selection.

In [4]:
# Regularisation strengths to search: 30 values, log-spaced from 1e-4 to 1e3.
# A linear spacing over the same range goes straight from 1e-4 to ~34.5, so
# the whole small-alpha region (several orders of magnitude) was never tried.
ALPHA_GRID = np.logspace(-4, 3, 30)

# `max_iter` and `tol` only control when the solver stops iterating; they are
# not model hyperparameters. Searching over them multiplies the number of fits
# without exploring different models, so they are pinned to the strictest
# values of the original grid (largest iteration budget, smallest tolerance).
# The keys are kept so existing users of these grids keep working.
param_grid = {
    "alpha": ALPHA_GRID,
    "max_iter": [12_000],
    "tol": [1e-4],
}

# Same idea for ElasticNet, with the L1/L2 mixing ratio as a second real
# hyperparameter.
param_grid_elastic = {
    "alpha": ALPHA_GRID,
    "l1_ratio": [0.1, 0.5, 0.9],
    "max_iter": [12_000],
    "tol": [1e-4],
}

## Ridge

In [5]:
# Untuned Ridge on every dataset variant: train on TRAIN_DATASETS[i],
# evaluate on both that data and TEST_DATASETS[i].
results = []
for X_train_variant, X_test_variant in zip(TRAIN_DATASETS, TEST_DATASETS):
    model = Ridge(random_state=42).fit(X_train_variant, y_train)
    y_pred_train = model.predict(X_train_variant)
    y_pred_test = model.predict(X_test_variant)

    results.append({
        "train_r2": r2_score(y_train, y_pred_train),
        "train_mse": mean_squared_error(y_train, y_pred_train),
        "test_r2": r2_score(y_test, y_pred_test),
        "test_mse": mean_squared_error(y_test, y_pred_test),
    })

# Add RMSE columns and restrict the table to RMSE and R² for display
df_results = (
    pd.DataFrame(results)
    .assign(
        train_rmse=lambda scores: np.sqrt(scores["train_mse"]),
        test_rmse=lambda scores: np.sqrt(scores["test_mse"]),
    )
    .loc[:, ["train_rmse", "test_rmse", "train_r2", "test_r2"]]
)
print(df_results)

   train_rmse  test_rmse  train_r2   test_r2
0    2.648473   2.664934  0.042401 -0.018663
1    2.648458   2.665728  0.042413 -0.019270
2    2.648500   2.664658  0.042382 -0.018452


In [6]:
# Position (in TRAIN_DATASETS / TEST_DATASETS) of the dataset variant used
# for tuning; 2 is the third file of each list.
# NOTE(review): this index looks like it was picked from the test-set scores
# printed by the previous cell. Selecting on test data biases the test metrics
# reported below -- consider choosing the variant by cross-validation instead.
best_dataset = 2

ridge_model = Ridge(random_state=42)

# Only `alpha` is searched here. The feature matrices are dense DataFrames,
# and for dense input Ridge's default solver is a direct (closed-form) one on
# which `max_iter` and `tol` have no effect (see the scikit-learn Ridge docs).
# Searching them only repeated each alpha 240 times with identical scores.
grid_ridge = GridSearchCV(
    estimator=ridge_model,
    param_grid={"alpha": param_grid["alpha"]},
    scoring="r2",
    cv=5,
    n_jobs=-1,  # all available cores rather than a machine-specific count
)

grid_ridge.fit(TRAIN_DATASETS[best_dataset], y_train)
best_model_ridge = grid_ridge.best_estimator_

# Score the refitted best estimator on the data it was tuned on and on the
# matching test variant
y_pred_train = best_model_ridge.predict(TRAIN_DATASETS[best_dataset])
y_pred_test = best_model_ridge.predict(TEST_DATASETS[best_dataset])

results_ridge = [{
    "train_r2": r2_score(y_train, y_pred_train),
    "train_mse": mean_squared_error(y_train, y_pred_train),
    "test_r2": r2_score(y_test, y_pred_test),
    "test_mse": mean_squared_error(y_test, y_pred_test),
}]

df_ridge = pd.DataFrame(results_ridge)

print(df_ridge)

   train_r2  train_mse   test_r2  test_mse
0  0.003806   7.297124 -0.020374  7.113799


Mis resultados siguen siendo muy bajos

## Lasso

In [7]:
# Untuned Lasso on every dataset variant: train on TRAIN_DATASETS[i],
# evaluate on both that data and TEST_DATASETS[i].
results = []
for X_train_variant, X_test_variant in zip(TRAIN_DATASETS, TEST_DATASETS):
    model = Lasso(random_state=42).fit(X_train_variant, y_train)
    y_pred_train = model.predict(X_train_variant)
    y_pred_test = model.predict(X_test_variant)

    results.append({
        "train_r2": r2_score(y_train, y_pred_train),
        "train_mse": mean_squared_error(y_train, y_pred_train),
        "test_r2": r2_score(y_test, y_pred_test),
        "test_mse": mean_squared_error(y_test, y_pred_test),
    })

# Add RMSE columns and restrict the table to RMSE and R² for display
df_results = (
    pd.DataFrame(results)
    .assign(
        train_rmse=lambda scores: np.sqrt(scores["train_mse"]),
        test_rmse=lambda scores: np.sqrt(scores["test_mse"]),
    )
    .loc[:, ["train_rmse", "test_rmse", "train_r2", "test_r2"]]
)
print(df_results)

   train_rmse  test_rmse  train_r2   test_r2
0    2.686269   2.657174  0.014875 -0.012739
1    2.706474   2.669451  0.000000 -0.022120
2    2.706474   2.669451  0.000000 -0.022120


In [8]:
# Position (in TRAIN_DATASETS / TEST_DATASETS) of the dataset variant used
# for tuning; 0 is the first file of each list (X_train_raw / X_test_raw).
# NOTE(review): this index looks like it was picked from the test-set scores
# printed by the previous cell. Selecting on test data biases the test metrics
# reported below -- consider choosing the variant by cross-validation instead.
best_dataset = 0

lasso_model = Lasso(random_state=42)

# Exhaustive 5-fold search over `param_grid`, ranked by R²
grid_lasso = GridSearchCV(
    estimator=lasso_model,
    param_grid=param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,  # all available cores rather than a machine-specific count
)

grid_lasso.fit(TRAIN_DATASETS[best_dataset], y_train)
best_model_lasso = grid_lasso.best_estimator_

# Score the refitted best estimator on the data it was tuned on and on the
# matching test variant
y_pred_train = best_model_lasso.predict(TRAIN_DATASETS[best_dataset])
y_pred_test = best_model_lasso.predict(TEST_DATASETS[best_dataset])

results_lasso = [{
    "train_r2": r2_score(y_train, y_pred_train),
    "train_mse": mean_squared_error(y_train, y_pred_train),
    "test_r2": r2_score(y_test, y_pred_test),
    "test_mse": mean_squared_error(y_test, y_pred_test),
}]

df_lasso = pd.DataFrame(results_lasso)

print(df_lasso)

   train_r2  train_mse  test_r2  test_mse
0       0.0   7.325001 -0.02212  7.125971


## ElasticNet

In [9]:
# Untuned ElasticNet on every dataset variant: train on TRAIN_DATASETS[i],
# evaluate on both that data and TEST_DATASETS[i].
results = []
for X_train_variant, X_test_variant in zip(TRAIN_DATASETS, TEST_DATASETS):
    model = ElasticNet(random_state=42).fit(X_train_variant, y_train)
    y_pred_train = model.predict(X_train_variant)
    y_pred_test = model.predict(X_test_variant)

    results.append({
        "train_r2": r2_score(y_train, y_pred_train),
        "train_mse": mean_squared_error(y_train, y_pred_train),
        "test_r2": r2_score(y_test, y_pred_test),
        "test_mse": mean_squared_error(y_test, y_pred_test),
    })

# Add RMSE columns and restrict the table to RMSE and R² for display
df_results = (
    pd.DataFrame(results)
    .assign(
        train_rmse=lambda scores: np.sqrt(scores["train_mse"]),
        test_rmse=lambda scores: np.sqrt(scores["test_mse"]),
    )
    .loc[:, ["train_rmse", "test_rmse", "train_r2", "test_r2"]]
)
print(df_results)

   train_rmse  test_rmse  train_r2   test_r2
0    2.683208   2.660270  0.017119 -0.015101
1    2.706474   2.669451  0.000000 -0.022120
2    2.706474   2.669451  0.000000 -0.022120


In [10]:
# Position (in TRAIN_DATASETS / TEST_DATASETS) of the dataset variant used
# for tuning; 0 is the first file of each list (X_train_raw / X_test_raw).
# NOTE(review): this index looks like it was picked from the test-set scores
# printed by the previous cell. Selecting on test data biases the test metrics
# reported below -- consider choosing the variant by cross-validation instead.
best_dataset = 0

elastic_net_model = ElasticNet(random_state=42)

# Exhaustive 5-fold search over `param_grid_elastic`, ranked by R²
grid_elastic = GridSearchCV(
    estimator=elastic_net_model,
    param_grid=param_grid_elastic,
    scoring="r2",
    cv=5,
    n_jobs=-1,  # all available cores rather than a machine-specific count
)

grid_elastic.fit(TRAIN_DATASETS[best_dataset], y_train)
best_model_elastic = grid_elastic.best_estimator_

# Score the refitted best estimator on the data it was tuned on and on the
# matching test variant
y_pred_train = best_model_elastic.predict(TRAIN_DATASETS[best_dataset])
y_pred_test = best_model_elastic.predict(TEST_DATASETS[best_dataset])

results_elastic = [{
    "train_r2": r2_score(y_train, y_pred_train),
    "train_mse": mean_squared_error(y_train, y_pred_train),
    "test_r2": r2_score(y_test, y_pred_test),
    "test_mse": mean_squared_error(y_test, y_pred_test),
}]

df_elastic = pd.DataFrame(results_elastic)

print(df_elastic)

   train_r2  train_mse  test_r2  test_mse
0       0.0   7.325001 -0.02212  7.125971


Tras estas pruebas, puedo confirmar que mis datos no son lineales (lo que había observado en mi EDA), por lo tanto 