In [5]:
# Point MLflow at the DagsHub repository and select the experiment
import dagshub
import mlflow

# With mlflow=True, dagshub.init configures MLflow for the repo; the URI
# printed below is the value MLflow reports right after this call.
dagshub.init(mlflow=True, url="https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car")

MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()
print(MLFLOW_TRACKING_URI)

# Re-apply the URI explicitly, then make "Rent a Car" the active experiment.
# set_experiment stays the last expression so the cell displays the Experiment.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="Rent a Car")

https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow


2024/09/26 00:12:57 INFO mlflow.tracking.fluent: Experiment with name 'Rent a Car' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/54cbc10138a74c07a6f0eed4054b5221', creation_time=1727331177951, experiment_id='0', last_update_time=1727331177951, lifecycle_stage='active', name='Rent a Car', tags={}>

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the processed dataset through a repo-relative path instead of an
# absolute home-directory path, so the notebook runs on any checkout.
# NOTE(review): assumes the notebook is executed one level below the repo
# root, the same assumption the later cells make when writing to '../data'.
df = pd.read_csv('../data/processed.csv')

# Target is the daily rate; every other column is used as a feature.
y = df['rate.daily']
X = df.drop('rate.daily', axis=1)

# Train/test split: hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train/validation split: 20% of the remaining rows become the validation set
# (overall roughly 64% train / 16% validation / 20% test).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [8]:
import os
data_dir = '../data'

# Write the train/validation features and targets as <name>.csv under
# data_dir, without the pandas index column.
for split_name, split_frame in (
    ('X_train', X_train),
    ('X_val', X_val),
    ('y_train', y_train),
    ('y_val', y_val),
):
    split_frame.to_csv(os.path.join(data_dir, f'{split_name}.csv'), index=False)

In [10]:
# Convert the pandas train/validation splits to NumPy arrays
X_train_np = X_train.to_numpy()
X_val_np = X_val.to_numpy()
y_train_np = y_train.to_numpy()
y_val_np = y_val.to_numpy()

# Wrap the arrays as MLflow datasets (features plus targets).
training_dataset = mlflow.data.from_numpy(X_train_np, targets=y_train_np, name="Rent-a-Car-Train")
# This dataset is built from the validation split (X_val / y_val), so it is
# named accordingly; the held-out test split is a different set of rows.
validation_dataset = mlflow.data.from_numpy(X_val_np, targets=y_val_np, name="Rent-a-Car-Validation")

In [12]:
from dagshub import get_repo_bucket_client

# Get a boto3.client object for the repository bucket
s3 = get_repo_bucket_client("Pepe-Chuy/PCD_Rent_a_Car")

# (local file, remote key) pairs, uploaded in this order.
# Note the validation files are stored remotely under "*_eval_data.csv" keys.
uploads = [
    ("../data/X_train.csv", "X_train_data.csv"),
    ("../data/y_train.csv", "y_train_data.csv"),
    ("../data/X_val.csv", "X_eval_data.csv"),
    ("../data/y_val.csv", "y_eval_data.csv"),
]

for local_path, remote_key in uploads:
    s3.upload_file(
        Bucket="PCD_Rent_a_Car",  # name of the repo
        Filename=local_path,  # local path of file to upload
        Key=remote_key,  # remote path where to upload the file
    )

In [13]:
from sklearn.ensemble import RandomForestRegressor as rfr, GradientBoostingRegressor as gbr
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [14]:
# Random Forest
def objective(params):
    """Hyperopt objective: fit a Random Forest and score it on the validation split.

    Each call opens a nested MLflow run tagged ``model_family=RandomForest``,
    logs the hyperparameters and the validation RMSE, and returns that RMSE
    as the loss to minimise.

    Args:
        params: Mapping containing 'n_estimators', 'max_depth',
            'min_samples_split' and 'min_samples_leaf'. The values may be
            floats (e.g. 18.0) and are cast to int before use.

    Returns:
        dict with 'loss' (RMSE on X_val / y_val) and 'status' (STATUS_OK).
    """
    # Integer-valued hyperparameters exactly as they are handed to the estimator.
    rf_params = {
        key: int(params[key])
        for key in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
    }

    with mlflow.start_run(nested=True):
        # Model family tag, so these runs can be filtered later.
        mlflow.set_tag("model_family", "RandomForest")

        # Log the values the model is actually trained with (18, not 18.0).
        # Any extra keys in `params` are still logged unchanged.
        mlflow.log_params({**params, **rf_params})

        # Train on the training split.
        rf_model = rfr(random_state=309, **rf_params)
        rf_model.fit(X_train, y_train)

        # Score on the validation split.
        y_pred = rf_model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [15]:
# Enable MLflow's scikit-learn autologging
mlflow.sklearn.autolog()

# Random Forest search space: each hyperparameter is drawn with
# hp.quniform(label, low, high, q=1), using the key itself as the label.
search_space = {
    label: hp.quniform(label, low, high, 1)
    for label, (low, high) in {
        'n_estimators': (10, 20),
        'max_depth': (5, 10),
        'min_samples_split': (2, 10),
        'min_samples_leaf': (1, 10),
    }.items()
}


In [16]:
# Hyperparameter optimization: every trial is logged by `objective` as a
# nested child run of this parent run.
trials = Trials()  # kept in a variable so the trial history can be inspected afterwards

with mlflow.start_run(run_name="Father Random Forest Regressor", nested=True):
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=trials,
        # Seed the search so a re-run proposes the same trials.
        # NOTE(review): hyperopt >= 0.2.7 expects a numpy Generator here;
        # older releases expect np.random.RandomState — confirm the installed version.
        rstate=np.random.default_rng(42),
    )

    # Log the best hyperparameters (not the model) on the parent run.
    # fmin returns floats such as 8.0; whole-valued ones are logged as ints so
    # they match the values the estimator is trained with. `best_params`
    # itself is left untouched.
    mlflow.log_params({
        name: int(value) if float(value).is_integer() else value
        for name, value in best_params.items()
    })

    # Record the loss of the best trial next to its parameters.
    mlflow.log_metric("best_rmse", trials.best_trial["result"]["loss"])

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]





2024/09/26 00:15:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run wise-deer-737 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/831205fcc04d4cc39f895b07ae5c575f.

2024/09/26 00:15:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 10%|█         | 1/10 [00:08<01:16,  8.50s/trial, best loss: 73.30808879706555]





2024/09/26 00:15:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run crawling-flea-133 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/598885e6d941494f89766d24e1259e0f.

2024/09/26 00:15:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 20%|██        | 2/10 [00:16<01:03,  7.95s/trial, best loss: 73.30808879706555]





2024/09/26 00:15:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run rare-zebra-349 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/36a7852388da4cd2bd2d05b5488b6e60.

2024/09/26 00:15:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 30%|███       | 3/10 [00:24<00:55,  7.96s/trial, best loss: 73.30808879706555]





2024/09/26 00:15:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run nimble-hog-623 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/4d61a4d83690485e8b7616ab38ad4212.

2024/09/26 00:15:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 40%|████      | 4/10 [00:31<00:46,  7.71s/trial, best loss: 73.30808879706555]





2024/09/26 00:15:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run thundering-shrew-543 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/c7c783dc901b4f25a059f61426a08795.

2024/09/26 00:15:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 50%|█████     | 5/10 [00:38<00:37,  7.54s/trial, best loss: 67.35198564384822]





2024/09/26 00:16:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run illustrious-slug-624 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/016b29de47674fffbef2694a5a223f95.

2024/09/26 00:16:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 60%|██████    | 6/10 [00:45<00:29,  7.48s/trial, best loss: 67.35198564384822]





2024/09/26 00:16:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run likeable-jay-924 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/7421ecbdedf744d7b044abbcf7d934fa.

2024/09/26 00:16:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 70%|███████   | 7/10 [00:53<00:22,  7.52s/trial, best loss: 67.35198564384822]





2024/09/26 00:16:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run silent-wasp-702 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/4b17930c99954e8fa90a50d206073bda.

2024/09/26 00:16:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 80%|████████  | 8/10 [01:00<00:14,  7.34s/trial, best loss: 67.35198564384822]





2024/09/26 00:16:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run mercurial-shrimp-745 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/af886e706df74888bb5ce194cd744a08.

2024/09/26 00:16:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 90%|█████████ | 9/10 [01:07<00:07,  7.37s/trial, best loss: 67.35198564384822]





2024/09/26 00:16:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run awesome-wolf-568 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/bb19e6cb6c1c4535b4afec4a1455237b.

2024/09/26 00:16:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



100%|██████████| 10/10 [01:15<00:00,  7.53s/trial, best loss: 67.35198564384822]


2024/09/26 00:16:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run Father Random Forest Regressor at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/d02a54171c6246ef9b819e4b62903ad0.
2024/09/26 00:16:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.


In [17]:
# Best hyperparameters returned by fmin. The values come back as floats
# (e.g. 8.0), so cast them to int before passing them to the estimator.
best_params

{'max_depth': 8.0,
 'min_samples_leaf': 1.0,
 'min_samples_split': 3.0,
 'n_estimators': 18.0}

In [18]:
# Look up the best Random Forest trial (lowest logged "rmse") instead of
# pasting a run id from a previous execution, so that Restart & Run All
# registers the model produced by the current search.
best_runs = mlflow.search_runs(
    experiment_names=["Rent a Car"],
    filter_string="tags.model_family = 'RandomForest'",
    order_by=["metrics.rmse ASC"],
    max_results=1,
    output_format="list",
)
if not best_runs:
    raise RuntimeError(
        "No RandomForest runs found in experiment 'Rent a Car'; "
        "run the hyperparameter search before registering a model."
    )

run_id = best_runs[0].info.run_id
# The model is read from the "model" artifact path of that run.
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="test-drive-dagshub-model"
)

Successfully registered model 'test-drive-dagshub-model'.
2024/09/26 00:18:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: test-drive-dagshub-model, version 1
Created version '1' of model 'test-drive-dagshub-model'.
