In [0]:
#pip install yfinance

Collecting yfinance
  Downloading yfinance-0.2.66-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting multitasking>=0.0.7 (from yfinance)
  Downloading multitasking-0.0.12.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting frozendict>=2.3.4 (from yfinance)
  Downloading frozendict-2.4.6-py312-none-any.whl.metadata (23 kB)
Collecting peewee>=3.16.2 (from yfinance)
  Downloading peewee-3.18.3.tar.gz (3.0 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing me

### Importar librerias

In [0]:
import yfinance as yf 
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from statsmodels.tsa.arima.model import ARIMA
from scipy.stats import randint
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

### Cargar base de datos

In [0]:
#Definicion base de datos
def df_yf_tidy( 

    tickers, 

    start=None, 

    end=None, 

    interval="1d", 

    auto_adjust=False,  # Se mantiene False para conservar "Adj Close" y así mantener las 6 columnas 

): 

    """ 

    Devuelve un DataFrame 'largo' con múltiples tickers     

    Índice por fecha y columna 'Ticker' para identificar. 

    """ 

    cols = ["Open", "High", "Low", "Close", "Adj Close", "Volume"] 

    frames = [] 

 

    for t in tickers: 

        h = yf.Ticker(t).history( 

            start=start, end=end, interval=interval, auto_adjust=auto_adjust 

        ) 

        if h.empty: 

            continue 

        for c in cols: 

            if c not in h.columns: 

                h[c] = pd.NA 

        out = h[cols].copy() 

        out["Ticker"] = t 

        frames.append(out.reset_index())  # 'Date' pasa a columna 

 

    if not frames: 

        return pd.DataFrame(columns=["Date"] + cols + ["Ticker"]) 

 

    df = pd.concat(frames, ignore_index=True) 

    # Orden de columnas 

    df = df[["Date"] + cols + ["Ticker"]] 

    return df 

 

#Uso para crear base de datos. 

tickers = ["AAPL", "MSFT", "AMZN", "GOOGL", "META"] 

df = df_yf_tidy(tickers, start="2000-01-01", end="2025-10-25", interval="1d") 

 


# Variación relativa 

df["variacion_relativa"] = df["Adj Close"] / df["Adj Close"].shift(1) 

# Variación porcentual (en %) 

df["variacion_porcentual"] = (df["Adj Close"].pct_change()) * 100 

# Variación logarítmica (en valores decimales) 

df["variacion_logaritmica"] = np.log(df["Adj Close"]).diff() 

df.tail() 

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Ticker,variacion_relativa,variacion_porcentual,variacion_logaritmica
28184,2025-10-20 00:00:00-04:00,721.190002,733.77002,720.179993,732.169983,732.169983,8900200,META,1.021272,2.127155,0.021048
28185,2025-10-21 00:00:00-04:00,736.02002,738.5,728.75,733.27002,733.27002,7647300,META,1.001502,0.150243,0.001501
28186,2025-10-22 00:00:00-04:00,733.830017,740.599976,724.030029,733.409973,733.409973,8734500,META,1.000191,0.019086,0.000191
28187,2025-10-23 00:00:00-04:00,734.700012,742.409973,733.099976,734.0,734.0,9856000,META,1.000804,0.08045,0.000804
28188,2025-10-24 00:00:00-04:00,736.789978,741.210022,731.150024,738.359985,738.359985,9151300,META,1.00594,0.594003,0.005922


In [0]:
#Separacion de variables independientes y dependiente
df.dropna(inplace=True)
X = df[['Volume', 'variacion_relativa', 'variacion_porcentual', 'variacion_logaritmica']]
y = df['Adj Close']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


### Modelo Ramdom Forest

In [0]:
# Definir el experimento
experiment_name = "/Users/js.ramirezg123@uniandes.edu.co/sklearn-diab"
mlflow.set_experiment(experiment_name)

rf = RandomForestRegressor(random_state=0)

#Parametros
param_distributions = {
    'n_estimators': [250],
    'max_depth': [8],
    'min_samples_split': [2],
}

#Randomized Search
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=1,  
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=0,
    verbose=2
)

# Entrenamiento en MLflow

with mlflow.start_run(run_name="RandomForest_Experiment_v2"):
    random_search.fit(X_train, y_train)
    y_pred = random_search.predict(X_test)

    # Métricas
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mse ** 0.5
    r2 = r2_score(y_test, y_pred)

    print("MSE:", mse)
    print("MAE:", mae)
    print("RMSE:", rmse)
    print(f"R2: {r2:.4f}")

    #Parámetros y métricas en MLflow
    mlflow.log_params(random_search.best_params_)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Guardar el modelo
    mlflow.sklearn.log_model(random_search.best_estimator_, "model_rf")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END .max_depth=8, min_samples_split=2, n_estimators=250; total time=  12.6s
[CV] END .max_depth=8, min_samples_split=2, n_estimators=250; total time=  12.5s
[CV] END .max_depth=8, min_samples_split=2, n_estimators=250; total time=  12.5s
[CV] END .max_depth=8, min_samples_split=2, n_estimators=250; total time=  12.6s
[CV] END .max_depth=8, min_samples_split=2, n_estimators=250; total time=  12.5s
MSE: 6277.929446086223
MAE: 52.89989409479736
RMSE: 79.23338593097118
R2: 0.4915




### Modelo ARIMA

In [0]:
#Entrenar el modelo ARIMA(A, RI, MA) hiperparámetros que definen el orden del modelo
p, d, q = 2, 1, 2

with mlflow.start_run(run_name="ARIMA_Experiment"):

    #Entrenar modelo ARIMA
    model = ARIMA(y_train, order=(p, d, q))
    model_fit = model.fit()

    #Predicciones sobre el conjunto de test
    y_pred = model_fit.forecast(steps=len(y_test))

    #Calcular métricas
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R2: {r2:.4f}")

    #Registrar parámetros y métricas
    mlflow.log_param("p", p)
    mlflow.log_param("d", d)
    mlflow.log_param("q", q)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    #Registrar el modelo en MLflow
    signature = infer_signature(y_train, y_pred)
    input_example = pd.DataFrame({"y_train_example": y_train[:5].values})

    mlflow.statsmodels.log_model(
        model_fit,
        artifact_path="model",
        signature=signature,
        input_example=input_example
    )


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


MSE: 12347.8154
MAE: 79.7108
RMSE: 111.1207
R2: -0.0001


MlflowException('prediction dataframes for a TimeSeriesModel must have exactly one row and include columns called start and end')Traceback (most recent call last):
  File "/databricks/python/lib/python3.12/site-packages/mlflow/utils/_capture_modules.py", line 166, in load_model_and_predict
    model.predict(input_example, params=params)
  File "/databricks/python/lib/python3.12/site-packages/mlflow/statsmodels/__init__.py", line 362, in predict
    raise MlflowException(
mlflow.exceptions.MlflowException: prediction dataframes for a TimeSeriesModel must have exactly one row and include columns called start and end
  "dataframe_split": {
    "columns": [
      "y_train_example"
    ],
    "data": [
      [
        17.443784713745117
      ],
      [
        36.22602844238281
      ],
      [
        27.050926208496094
      ],
      [
        1.5403321981430054
      ],
      [
        251.081787109375
      ]
    ]
  }
}. Alternatively, you can avoid passing input example and pass mode

### Modelo Regresion lineal

In [0]:
#Definir y entrenar el modelo de regresión lineal
lr = LinearRegression(positive=True)

with mlflow.start_run(run_name="LinearRegression_Experiment"):
    #Entrenar modelo
    lr.fit(X_train, y_train)

    #Predicciones
    y_pred = lr.predict(X_test)

    #Calcular métricas
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R²: {r2:.4f}")

    # ----------------------------------------------------------------
    # 3️⃣ Registrar parámetros, métricas y modelo en MLflow
    # ----------------------------------------------------------------
    mlflow.log_param("fit_intercept", lr.fit_intercept)
    mlflow.log_param("normalize", False)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)

    # Inferir firma del modelo y ejemplo de entrada
    signature = infer_signature(X_train, lr.predict(X_train))
    input_example = X_train.iloc[:5]

    # Registrar modelo
    mlflow.sklearn.log_model(
        lr,
        artifact_path="model",
        signature=signature,
        input_example=input_example
    )


MSE: 12345.6650
MAE: 79.7032
RMSE: 111.1110
R²: 0.0001


