In [None]:
# ===========================================
# 02_Modeling_Experiments.ipynb  |  Pa-Cr-ProyectoFinal
# ===========================================

# --- 1. Librerías
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# --- 2. Load and prepare data
df = pd.read_csv("../data/raw/IBM_Stock_1980_2025.csv", parse_dates=["Date"])

# The rolling windows below and the later shuffle=False split both depend on
# rows being in chronological order, so enforce it rather than trusting the
# file. A stable sort keeps rows with equal dates in their original order.
df = df.sort_values("Date", kind="stable").reset_index(drop=True)

# Handles Volume values formatted with comma separators: the commas are
# removed literally (regex=False) before converting to float.
df["Volume"] = (
    df["Volume"].astype(str).str.replace(",", "", regex=False).astype(float)
)

# Calendar features derived from the date column.
df["year"] = df["Date"].dt.year
df["month"] = df["Date"].dt.month
df["dayofweek"] = df["Date"].dt.dayofweek

# Trailing means over *previous* rows only. "Close" is the prediction target,
# so the series is shifted by one row first; otherwise each row's own Close
# would be part of its features (target leakage).
prev_close = df["Close"].shift(1)
df["Close_7d_mean"] = prev_close.rolling(window=7, min_periods=1).mean()
df["Close_30d_mean"] = prev_close.rolling(window=30, min_periods=1).mean()

# Drop rows with any missing value. This includes the first row, which has no
# previous close and therefore no lagged mean.
df = df.dropna().reset_index(drop=True)

# --- 3. Define features and target
target = "Close"
drop_cols = ["Date"]

# The target is taken as its own Series; the features are every remaining
# column except the target and the columns listed in drop_cols.
y = df[target]
X = df.drop(columns=[target, *drop_cols])

# shuffle=False preserves row order: the leading 80% of rows become the
# training set and the trailing 20% the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

# --- 4. ColumnTransformer + Pipeline
# Select every numeric dtype, not just int64/float64. On pandas >= 2.0 the
# .dt.year/.dt.month/.dt.dayofweek accessors return int32, and any column not
# listed here is silently dropped by the ColumnTransformer (its default is
# remainder="drop"), which would discard the calendar features.
num_features = X.select_dtypes(include="number").columns
cat_features = X.select_dtypes(include=["object", "category"]).columns

# Numeric columns: fill gaps with the median, then standardize.
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time instead
# of raising.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_features),
    ("cat", categorical_pipeline, cat_features),
])

# --- 5. Models
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
}

# One (name, mse, r2) tuple per model, in training order.
results = []

# Create the output directory up front so a missing folder cannot make
# joblib.dump fail after a model has already been trained.
models_dir = Path("../models")
models_dir.mkdir(parents=True, exist_ok=True)

# --- 6. Training and evaluation
for name, model in models.items():
    # Pipeline does not copy its steps, so each pipeline gets its own unfitted
    # clone of the preprocessor instead of sharing (and refitting) one object.
    pipe = Pipeline([
        ("preprocessor", clone(preprocessor)),
        ("model", model),
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)

    # Score on the held-out test split.
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    results.append((name, mse, r2))
    print(f"{name}: MSE={mse:.3f} | R²={r2:.3f}")

    # Persist the whole fitted pipeline (preprocessing + model) for reuse.
    joblib.dump(pipe, models_dir / f"{name}.joblib")

# --- 7. Visual comparison
results_df = pd.DataFrame(results, columns=["Modelo", "MSE", "R2"]).set_index("Modelo")

# MSE is in squared target units while R² is at most 1, so drawing both on a
# single y-axis makes the R² bars unreadable. Draw one subplot per metric,
# each with its own y-scale. rot=0 keeps the model names horizontal on every
# subplot (plt.xticks would only affect the current axes).
results_df.plot(
    kind="bar",
    subplots=True,
    layout=(1, 2),
    sharey=False,
    rot=0,
    figsize=(8, 4),
    title="Comparación de modelos (MSE y R²)",
)
plt.tight_layout()
plt.show()

# --- 8. Save the processed dataset
# Make sure the destination folder exists; DataFrame.to_csv does not create
# missing parent directories and would raise otherwise.
processed_path = Path("../data/processed/df_model_ready.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)

# --- 9. Conclusions
# The triple-quoted block below is a bare string expression holding
# Markdown-formatted notes (in Spanish); it is neither assigned nor printed.
# NOTE(review): the text is a static literal, not derived from `results` —
# confirm its claims against the metrics printed by the training loop.
"""
### Conclusiones Modelado

- **Regresión Lineal**: modelo base, explica tendencia general, pero sensible a no linealidades.
- **Random Forest**: mayor capacidad para capturar relaciones no lineales y reduce error MSE.
- Se recomienda optimizar RF con GridSearchCV y agregar más features temporales.
"""
