In [9]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the CSV at `path`, parsing the 'Date' column as datetimes.
path = '../data/IBM_Stock_1980_2025.csv'
df = pd.read_csv(path, parse_dates=['Date'])

# Enforce chronological row order: the rolling means and the unshuffled
# train/test split computed in later cells both depend on it. The sort is
# stable, so rows sharing a date keep their file order and an already-sorted
# file is left unchanged.
df = df.sort_values('Date', kind='mergesort').reset_index(drop=True)


In [10]:
# Basic cleaning of the Volume column
if 'Volume' in df.columns:
    # Only text needs scrubbing. An already-numeric column is left alone:
    # round-tripping floats through str and the regex below would strip the
    # 'e', '+' and '-' of any value rendered in scientific notation and
    # silently change it. The guard also makes this cell safe to re-run.
    if not pd.api.types.is_numeric_dtype(df['Volume']):
        # Remove commas, spaces and any other character that is not a digit or a dot
        df['Volume'] = df['Volume'].astype(str).str.replace(r'[^0-9.]', '', regex=True)
    # errors='coerce' turns anything unparseable (including the empty strings
    # the scrub can leave behind) into NaN
    df['Volume'] = pd.to_numeric(df['Volume'], errors='coerce')
    # Fill the remaining gaps with the column median
    df['Volume'] = df['Volume'].fillna(df['Volume'].median())

# Calendar features derived from Date
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['dayofweek'] = df['Date'].dt.dayofweek

# Moving averages of Close over the last 7 and 30 rows (min_periods=1, so the
# first rows average over whatever history is available).
# NOTE(review): each window ends on the current row, so these means include
# the same-row Close value that a later cell uses as the prediction target —
# confirm this is intended (otherwise shift Close by one row first).
df['Close_7d_mean'] = df['Close'].rolling(7, min_periods=1).mean()
df['Close_30d_mean'] = df['Close'].rolling(30, min_periods=1).mean()

# Drop every row that still has a missing value in any column. With
# min_periods=1 the rolling means need no warm-up period, so this removes rows
# with gaps in the other columns rather than rolling warm-up rows.
df = df.dropna().reset_index(drop=True)
df.shape


(11488, 15)

In [11]:
# NOTE(review): this cell repeats the preparation steps of the cell before it;
# consider keeping only one of the two.

# Basic cleaning of the Volume column
if 'Volume' in df.columns:
    # Scrub text only. When Volume is already numeric (as it is after the
    # previous cell has run), converting it to str and applying the regex
    # would strip the 'e', '+' and '-' of any value rendered in scientific
    # notation and silently change it, so the scrub is skipped.
    if not pd.api.types.is_numeric_dtype(df['Volume']):
        # Remove commas, spaces and any other character that is not a digit or a dot
        df['Volume'] = df['Volume'].astype(str).str.replace(r'[^0-9.]', '', regex=True)
    # errors='coerce' turns anything unparseable (including the empty strings
    # the scrub can leave behind) into NaN
    df['Volume'] = pd.to_numeric(df['Volume'], errors='coerce')
    # Fill the remaining gaps with the column median
    df['Volume'] = df['Volume'].fillna(df['Volume'].median())

# Calendar features derived from Date
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['dayofweek'] = df['Date'].dt.dayofweek

# Moving averages of Close over the last 7 and 30 rows (min_periods=1, so the
# first rows average over whatever history is available).
# NOTE(review): each window ends on the current row, so these means include
# the same-row Close value that a later cell uses as the prediction target —
# confirm this is intended (otherwise shift Close by one row first).
df['Close_7d_mean'] = df['Close'].rolling(7, min_periods=1).mean()
df['Close_30d_mean'] = df['Close'].rolling(30, min_periods=1).mean()

# Drop every row that still has a missing value in any column. With
# min_periods=1 the rolling means need no warm-up period, so this removes rows
# with gaps in the other columns rather than rolling warm-up rows.
df = df.dropna().reset_index(drop=True)
df.shape


(11488, 15)

In [12]:
# Column to predict; every other column except the raw date is a feature
target = 'Close'
y = df[target]
X = df.drop(columns=[target, 'Date'])

# Chronological hold-out: with shuffle=False the row order is preserved, so
# the final 20% of the rows become the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
    test_size=0.2,
)


In [13]:
# Partition the feature columns by dtype so each group gets its own preprocessing
cat_features = list(X.select_dtypes(include=['object','category']).columns)
num_features = list(X.select_dtypes(include=[np.number]).columns)

# Numeric columns: fill gaps with the median, then standardise
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at transform time, and
# `sparse_output=False` (the parameter name used by recent scikit-learn
# releases) requests a dense array.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])


In [14]:
# Candidate regressors, keyed by the name used for reporting and for the saved file
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1)
}

# Create the output directory before any training starts, so a fresh checkout
# does not fail at joblib.dump after the models have already been fitted.
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Test-set metrics per model: {model name: {'mse': ..., 'r2': ...}}
results = {}
for name, model in models.items():
    # Preprocessing lives inside the pipeline, so it is fitted on the training rows only.
    # NOTE(review): both pipelines wrap the same `preprocessor` object, which
    # each fit re-fits in place; each pipeline is saved right after its own fit.
    pipe = Pipeline([('preprocessor', preprocessor), ('model', model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    results[name] = {'mse': mse, 'r2': r2}
    print(f"{name} -> MSE: {mse:.4f}  R2: {r2:.4f}")
    # Persist the whole fitted pipeline (preprocessing + model)
    joblib.dump(pipe, models_dir / f"{name}.joblib")


LinearRegression -> MSE: 0.6678  R2: 0.9995
RandomForest -> MSE: 194.2745  R2: 0.8517
RandomForest -> MSE: 194.2745  R2: 0.8517


In [15]:
# Comparison table: one row per model, one column per metric
pd.DataFrame(results).T

Unnamed: 0,mse,r2
LinearRegression,0.667825,0.99949
RandomForest,194.274516,0.851695
