In [1]:
import dagshub
import mlflow
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
import pathlib
from sklearn.metrics import  root_mean_squared_error
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from sklearn.model_selection import train_test_split

In [1]:
# Connect this session to the DagsHub repository with MLflow integration on.
dagshub.init(
    url="https://dagshub.com/PacoTinoco/Proyecto_Final_CDD",
    mlflow=True,
)

# Keep the tracking URI MLflow reports at this point; it is reused further
# down when the registry client is built. Echo it for the reader.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()
print(MLFLOW_TRACKING_URI)

# Re-apply that same URI, then select the experiment all runs are filed under.
# set_experiment stays last so its return value is shown as the cell output.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="google_stock")

2024/09/27 23:34:54 INFO mlflow.tracking.fluent: Experiment with name 'google_stock' does not exist. Creating a new experiment.


https://dagshub.com/PacoTinoco/Proyecto_Final_CDD.mlflow


<Experiment: artifact_location='mlflow-artifacts:/6ec4ca913e1442d7a8287af46c3fa3b1', creation_time=1727501692460, experiment_id='0', last_update_time=1727501692460, lifecycle_stage='active', name='google_stock', tags={}>

In [2]:
# Pull GOOGL price history from Yahoo Finance for 2015-01-01 through 2024-01-01.
google_stock = yf.download(
    "GOOGL",
    start="2015-01-01",
    end="2024-01-01",
)

[*********************100%***********************]  1 of 1 completed


In [4]:
# Target is the "Close" column; every other column is used as a feature.
# NOTE(review): if the remaining columns include an adjusted close, it may be
# (nearly) identical to the target and leak it - verify the feature set.
y = google_stock["Close"]
X = google_stock.drop(columns=["Close"])

# Hold out 20% of the rows for validation (80% for training), fixed seed.
# NOTE(review): train_test_split shuffles rows by default, so validation rows
# are interleaved in time with training rows - TODO confirm this is intended
# for time-ordered price data.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the resulting shapes of both partitions.
print("Tamaño del conjunto de entrenamiento:", X_train.shape, y_train.shape)
print("Tamaño del conjunto de prueba:", X_val.shape, y_val.shape)


Tamaño del conjunto de entrenamiento: (1811, 5) (1811,)
Tamaño del conjunto de prueba: (453, 5) (453,)


In [14]:
# Wrap both partitions in XGBoost DMatrix objects, attaching targets as labels.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [15]:
def objective(params):
    """Train one XGBoost candidate and report its validation RMSE to hyperopt.

    Each call opens a nested MLflow run, tags it, logs the sampled
    hyper-parameters, trains a booster on the notebook-level ``train``
    DMatrix with early stopping monitored on ``valid``, logs the fitted model
    under the ``model`` artifact path and logs the resulting RMSE.

    Args:
        params: Dict of XGBoost training parameters (one sample from the
            hyperopt search space).

    Returns:
        dict: ``{'loss': <validation RMSE>, 'status': STATUS_OK}``, the shape
        hyperopt's ``fmin`` expects from an objective function.
    """
    with mlflow.start_run(nested=True):

        # Tag model
        mlflow.set_tag("model_family", "xgboost")

        # Log parameters
        mlflow.log_params(params)

        # Train model; stop early once the validation score has not improved
        # for 10 consecutive boosting rounds.
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, 'validation')],
            early_stopping_rounds=10
        )

        # Log xgboost model with artifact_path
        mlflow.xgboost.log_model(booster, artifact_path="model")

        # Predict in the val dataset
        y_pred = booster.predict(valid)

        # Calculate metric. Use the RMSE helper this notebook actually imports;
        # `mean_squared_error` is never imported and raised NameError here.
        rmse = root_mean_squared_error(y_val, y_pred)

        # Log performance metric
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Enable MLflow's automatic logging for XGBoost training calls.
mlflow.xgboost.autolog()

with mlflow.start_run(run_name="Xgboost Hyper-parameter Optimization", nested=True):
    # Search space explored by hyperopt. The loguniform bounds are given in
    # log space; 'objective' and 'seed' are fixed values, not sampled.
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
        'learning_rate': hp.loguniform('learning_rate', -3, 0),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        'objective': 'reg:squarederror',
        'seed': 42
    }

    # Run 10 TPE-guided evaluations of objective(); each one logs its own
    # nested MLflow run.
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    # The result only carries the sampled hp.* labels, so cast max_depth back
    # to int and restore the fixed entries before retraining.
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"

    mlflow.log_params(best_params)

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "Google Stock validation",
            "optimizer_engine": "hyper-opt",
            "model_family": "xgboost",
            "feature_set_version": 1,
        }
    )

    # Log a fit model instance: retrain with the best parameters found.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, 'validation')],
        early_stopping_rounds=10
    )

    y_pred = booster.predict(valid)

    # Use the RMSE helper this notebook actually imports; `mean_squared_error`
    # is never imported and raised NameError here.
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    pathlib.Path("models").mkdir(exist_ok=True)

    # Nothing in this notebook writes models/preprocessor.b, so uploading it
    # unconditionally fails the run on a missing file. Upload it only when it
    # is actually present.
    preprocessor_file = "models/preprocessor.b"
    if pathlib.Path(preprocessor_file).is_file():
        mlflow.log_artifact(preprocessor_file, artifact_path="preprocessor")

In [24]:
# Display the final hyper-parameter dict produced by the search cell.
best_params

{'learning_rate': np.float64(0.11597112725137511),
 'max_depth': 54,
 'min_child_weight': np.float64(1.9078428525972895),
 'reg_alpha': np.float64(0.29322059294206937),
 'reg_lambda': np.float64(0.016297130661752667),
 'seed': 42,
 'objective': 'reg:squarederror'}

In [25]:
import mlflow

In [26]:
# Ask for the MLflow run whose logged "model" artifact should be registered.
# Pasted ids often carry stray spaces or newlines, which would end up inside
# the runs:/ URI, so trim them and reject an empty answer up front.
run_id = input("Ingrese el run_id").strip()
if not run_id:
    raise ValueError("run_id must not be empty")
run_uri = f"runs:/{run_id}/model"

# Register that run's model under the shared registry name.
result = mlflow.register_model(
    model_uri=run_uri,
    name="google-stock-model"
)

Ingrese el run_id 5fc0756e7f73427cb4b6a8bed889a20b


Successfully registered model 'google-stock-model'.
2024/09/28 13:10:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: google-stock-model, version 1
Created version '1' of model 'google-stock-model'.


In [27]:
from datetime import datetime
from mlflow import MlflowClient

# Registry client bound to the tracking URI captured earlier in the notebook.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# Give the registered model itself a human-readable description.
client.update_registered_model(
    name="google-stock-model", description="Model registry for the google stock prediction"
)

# Alias to assign, timestamp recorded in the description, and target version.
new_alias = "champion"
date = datetime.today()
model_version = "1"

# Point the "champion" alias at version 1 of "google-stock-model".
client.set_registered_model_alias(
    name="google-stock-model", alias=new_alias, version=model_version
)

# Note on the version when it received the alias. This call stays last so the
# updated ModelVersion is shown as the cell output.
client.update_model_version(
    name="google-stock-model",
    version=model_version,
    description=(
        f"The model version {model_version} was transitioned to {new_alias} on {date}"
    ),
)

<ModelVersion: aliases=['champion'], creation_timestamp=1727550617515, current_stage='None', description='The model version 1 was transitioned to champion on 2024-09-28 13:12:02.669914', last_updated_timestamp=1727550720469, name='google-stock-model', run_id='5fc0756e7f73427cb4b6a8bed889a20b', run_link='', source='mlflow-artifacts:/6ec4ca913e1442d7a8287af46c3fa3b1/5fc0756e7f73427cb4b6a8bed889a20b/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>

In [28]:
import mlflow.pyfunc

# Address the registered model through its alias instead of a version number.
model_name = "google-stock-model"
alias = "champion"
model_uri = f"models:/{model_name}@{alias}"

# Load the aliased version through the generic pyfunc interface.
champion_version = mlflow.pyfunc.load_model(model_uri=model_uri)

# Score the validation features; the prediction array is the cell's output.
champion_version.predict(X_val)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

array([143.453   ,  72.09996 ,  36.609688,  39.58499 ,  28.274122,
       101.74806 ,  55.14543 , 135.74756 ,  27.341633, 100.93057 ,
        98.4985  ,  33.11652 ,  41.786972,  53.575924, 104.2153  ,
       144.40746 ,  78.695564, 109.47353 ,  37.52547 , 109.6177  ,
       114.617744,  74.52254 , 137.93652 ,  88.22144 ,  55.855217,
       141.85316 ,  59.42868 ,  35.214108,  60.18714 ,  66.35824 ,
        27.66563 ,  99.641205, 112.27483 ,  39.037693, 120.42924 ,
       135.73648 ,  59.399837,  68.086845,  42.42141 ,  52.24469 ,
       129.81413 , 140.84976 ,  39.980915,  54.611214, 118.23367 ,
        75.56147 , 147.15639 ,  46.274845, 102.66855 ,  41.178814,
        27.851946, 145.38954 , 133.83778 ,  40.492138, 145.56761 ,
        90.6261  ,  59.570694,  48.011665,  73.25103 , 127.962166,
        52.441124,  30.575087,  90.829254,  71.52325 , 115.75765 ,
        38.918068, 104.460266,  36.359734,  52.231182, 129.63306 ,
       130.11194 ,  56.01812 , 127.7084  ,  52.599   ,  35.273