In [None]:
# Importar librerías
import json
import time
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the processed dataset and the list of selected feature names.
processed_data_path = '../../data/processed/final_processed_data.parquet'
features_path = '../../data/processed/final_features.json'

df = pd.read_parquet(processed_data_path)
# Pass the encoding explicitly: without it open() uses the platform's locale
# encoding, so non-ASCII feature names could be decoded differently per machine.
with open(features_path, 'r', encoding='utf-8') as f:
    final_features = json.load(f)

print("Datos y características cargados.")

In [None]:
# Linear regression with a full pipeline: pick the feature matrix and the target.
target_column = 'precio_mxn'
X = df[final_features]
y = df[target_column]

print(f"Forma del dataset: X {X.shape}, y {y.shape}")

# A row is kept only when every feature and the target are present; the filter
# (and its report) runs only if at least one row fails that check.
mask = X.notna().all(axis=1) & y.notna()
if not mask.all():
    X = X[mask]
    y = y[mask]
    print(f"Después de eliminar NaN: X {X.shape}, y {y.shape}")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the pipeline: standardize the features, then fit a linear regression.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Train the model and measure how long the fit takes.
print("\nEntrenando el modelo...")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()
pipeline.fit(X_train, y_train)
training_time = time.perf_counter() - start_time
print(f"Entrenamiento completado en {training_time:.2f} segundos.")

# Predict on both splits so train and test metrics can be compared.
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)

# Evaluar el modelo
def evaluate_model(y_true, y_pred, dataset_name=""):
    """Compute regression metrics for one dataset split and print its R².

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, passed together with ``y_true`` to
            the scikit-learn metric functions.
        dataset_name: Label inserted in the printed header.

    Returns:
        dict: Keys ``'mse'``, ``'rmse'``, ``'mae'`` and ``'r2'``, each holding
        a built-in ``float``.
    """
    # Cast every metric to a built-in float. The metric functions may return
    # NumPy scalars, and NumPy scalar types that do not subclass float
    # (e.g. float32) cannot be serialized by the json module.
    mse = float(mean_squared_error(y_true, y_pred))
    rmse = float(np.sqrt(mse))
    mae = float(mean_absolute_error(y_true, y_pred))
    r2 = float(r2_score(y_true, y_pred))
    print(f"\nResultados {dataset_name}:")
    print(f"R²: {r2:.4f}")
    return {'mse': mse, 'rmse': rmse, 'mae': mae, 'r2': r2}

train_metrics = evaluate_model(y_train, y_pred_train, "Entrenamiento")
test_metrics = evaluate_model(y_test, y_pred_test, "Prueba")

# Persist the fitted pipeline and its metrics.
model_path = '../../models/linear_regression_pipeline.joblib'
metrics_path = '../../models/linear_regression_metrics.json'

# Create the output directories up front: neither joblib.dump nor open() creates
# missing parent directories, and failing here would discard the finished training run.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
Path(metrics_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(pipeline, model_path)
results = {
    'train_metrics': train_metrics,
    'test_metrics': test_metrics,
    'training_time': training_time
}
with open(metrics_path, 'w') as f:
    # default=float is only consulted for values json cannot serialize natively,
    # e.g. NumPy scalars that do not subclass the built-in float.
    json.dump(results, f, default=float)

print(f"\nModelo guardado en: {model_path}")
print(f"Métricas guardadas en: {metrics_path}")