In [1]:
# Set repo
# Initialise the DagsHub integration for this repository with MLflow enabled.
import dagshub
import mlflow
dagshub.init(url="https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car", mlflow=True)

# Read back the tracking URI that is active after dagshub.init.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)

# Re-apply that same URI explicitly, then select the experiment to log into.
# set_experiment is the last expression, so its return value is what the cell displays.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="Rent a Car")

https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow


<Experiment: artifact_location='mlflow-artifacts:/54cbc10138a74c07a6f0eed4054b5221', creation_time=1727331177951, experiment_id='0', last_update_time=1727331177951, lifecycle_stage='active', name='Rent a Car', tags={}>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the processed dataset through a repo-relative path (the same '../data'
# directory the later cells write to and upload from) instead of an absolute
# path inside one user's home directory, so the notebook runs on any checkout.
df = pd.read_csv('../data/processed.csv')

# Target is the daily rate; every other column is used as a feature.
y = df['rate.daily']
X = df.drop('rate.daily', axis=1)

# Train-test split: hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train-val split: carve 20% of the remaining training rows off as validation
# (overall 64% train / 16% validation / 20% test).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [3]:
import os

data_dir = '../data'

# Write the training and validation splits to data_dir, one CSV per split,
# without the index column.
split_files = [
    ('X_train.csv', X_train),
    ('X_val.csv', X_val),
    ('y_train.csv', y_train),
    ('y_val.csv', y_val),
]
for csv_name, split in split_files:
    split.to_csv(os.path.join(data_dir, csv_name), index=False)

In [4]:
# NumPy views of the training and validation splits (features, then targets).
X_train_np, y_train_np = X_train.to_numpy(), y_train.to_numpy()
X_val_np, y_val_np = X_val.to_numpy(), y_val.to_numpy()

# Wrap each split as an MLflow dataset built from the NumPy arrays.
training_dataset = mlflow.data.from_numpy(
    X_train_np,
    targets=y_train_np,
    name="Rent-a-Car-Train",
)
# NOTE(review): this wraps the validation split but is named "Rent-a-Car-Test" — confirm the name is intended.
validation_dataset = mlflow.data.from_numpy(
    X_val_np,
    targets=y_val_np,
    name="Rent-a-Car-Test",
)

In [5]:
from dagshub import get_repo_bucket_client

# boto3 client bound to the repository's storage bucket
s3 = get_repo_bucket_client("Pepe-Chuy/PCD_Rent_a_Car")

# (local file, remote key) pairs, uploaded in this order to the repo bucket
uploads = [
    ("../data/X_train.csv", "X_train_data.csv"),
    ("../data/y_train.csv", "y_train_data.csv"),
    ("../data/X_val.csv", "X_eval_data.csv"),
    ("../data/y_val.csv", "y_eval_data.csv"),
]

for local_path, remote_key in uploads:
    s3.upload_file(
        Bucket="PCD_Rent_a_Car",  # name of the repo
        Filename=local_path,
        Key=remote_key,
    )

In [6]:
from sklearn.ensemble import RandomForestRegressor as rfr, GradientBoostingRegressor as gbr
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [7]:
# Random Forest
def objective(params):
    """Hyperopt objective for the random forest regressor.

    Trains one model inside its own nested MLflow run, logs the sampled
    hyperparameters and the resulting metrics, and reports the
    cross-validated RMSE as the loss to minimise.

    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``
    splits.

    Args:
        params: dict with the keys ``n_estimators``, ``max_depth``,
            ``min_samples_split`` and ``min_samples_leaf``. The values are
            cast to ``int`` before being handed to the estimator; the raw
            values are what gets logged to MLflow.

    Returns:
        dict with ``loss`` (square root of the mean 5-fold MSE on the
        training split) and ``status`` (``STATUS_OK``).
    """
    with mlflow.start_run(nested=True):

        # Model family
        mlflow.set_tag("model_family", "RandomForest")
        # Log parameters exactly as sampled
        mlflow.log_params(params)

        # Train model; the fitted estimator is scored on the validation split below
        rf_model = rfr(
            n_estimators=int(params['n_estimators']),
            max_depth=int(params['max_depth']),
            min_samples_split=int(params['min_samples_split']),
            min_samples_leaf=int(params['min_samples_leaf']),
            random_state=309
        )
        rf_model.fit(X_train, y_train)

        # Cross-validation on the training split
        cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        rmse = np.sqrt(-cv_scores.mean())  # Convert negative MSE to RMSE

        # R-squared on the validation split, straight from the fitted model.
        # (The previous unused `rf_model.predict(X_val)` call was dropped: its
        # result was never read, so it only added a prediction pass per trial.)
        r_squared = rf_model.score(X_val, y_val)

        # Log metrics
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("R_squared", r_squared)

    return {'loss': rmse, 'status': STATUS_OK}



In [8]:
mlflow.sklearn.autolog()

# Random forest search space: (low, high) bounds per hyperparameter,
# each turned into an hp.quniform with q=1.
rf_bounds = {
    'n_estimators': (10, 20),
    'max_depth': (5, 10),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 10),
}
search_space = {
    label: hp.quniform(label, low, high, 1)
    for label, (low, high) in rf_bounds.items()
}


In [9]:
# Hyperparameter search: 10 TPE evaluations of `objective`, grouped under one
# parent MLflow run (each evaluation opens its own nested run).
hyperopt_trials = Trials()

with mlflow.start_run(run_name="Father Random Forest Regressor", nested=True):
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=hyperopt_trials,
    )

    # Record the best hyperparameters found on the parent run
    mlflow.log_params(best_params)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

























2024/09/30 18:19:48 INFO mlflow.tracking._tracking_service.client: 🏃 View run orderly-fawn-325 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/afc0d0f512f1487686bd89aafe8279c1.

2024/09/30 18:19:48 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 10%|█         | 1/10 [00:38<05:44, 38.27s/trial, best loss: 80.22230157461082]

























2024/09/30 18:20:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run learned-ape-864 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/93bdd79f5b2e4910907078837e85cc34.

2024/09/30 18:20:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 20%|██        | 2/10 [01:15<05:01, 37.69s/trial, best loss: 76.26436214296827]

























2024/09/30 18:21:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run dapper-panda-48 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/90b20a039f9d48b88a6c4c3aca18bc71.

2024/09/30 18:21:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 30%|███       | 3/10 [02:11<05:23, 46.14s/trial, best loss: 71.6455228506639] 

























2024/09/30 18:21:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run wistful-eel-257 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/c85c44038d004d01b348e9d60d2b9a6a.

2024/09/30 18:21:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 40%|████      | 4/10 [02:49<04:16, 42.72s/trial, best loss: 71.6455228506639]

























2024/09/30 18:22:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run righteous-grouse-150 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/fa7792b4494c4baab57fe253f8c321ca.

2024/09/30 18:22:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 50%|█████     | 5/10 [03:27<03:25, 41.03s/trial, best loss: 71.6455228506639]

























2024/09/30 18:23:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run loud-penguin-309 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/e7d7a4d12eed4234ae7d178539dd6dc8.

2024/09/30 18:23:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 60%|██████    | 6/10 [04:05<02:40, 40.04s/trial, best loss: 71.6455228506639]

























2024/09/30 18:23:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run awesome-skink-733 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/ceda940ce2ae45208621b4a493c9d9c0.

2024/09/30 18:23:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 70%|███████   | 7/10 [04:41<01:56, 38.86s/trial, best loss: 71.6455228506639]

























2024/09/30 18:24:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run zealous-shrimp-469 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/fe4964426db14560934c1ba030190f93.

2024/09/30 18:24:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 80%|████████  | 8/10 [05:20<01:17, 38.79s/trial, best loss: 71.6455228506639]

























2024/09/30 18:25:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run persistent-finch-296 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/044b2005ef1d4c988beba9f6e2bd3943.

2024/09/30 18:25:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



 90%|█████████ | 9/10 [05:59<00:38, 38.77s/trial, best loss: 71.6455228506639]

























2024/09/30 18:26:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run incongruous-kit-810 at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/c39bce2515dc4eeeaff07df841d50db0.

2024/09/30 18:26:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.



100%|██████████| 10/10 [06:56<00:00, 41.61s/trial, best loss: 71.6455228506639]


2024/09/30 18:26:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run Father Random Forest Regressor at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0/runs/3716820291b14021b0a2270fee19e2f3.
2024/09/30 18:26:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/Pepe-Chuy/PCD_Rent_a_Car.mlflow/#/experiments/0.


In [13]:
# Display the best hyperparameters returned by fmin (the values come back as floats).
best_params

{'max_depth': 9.0,
 'min_samples_leaf': 2.0,
 'min_samples_split': 7.0,
 'n_estimators': 13.0}

In [14]:
# Register the model stored under one specific run in the MLflow model registry.
# NOTE(review): the run id is hard-coded. It matches the run printed as
# "righteous-grouse-150" in the optimization log, and it will not follow a
# re-run of the search — confirm this is the run that should be registered.
run_id = "fa7792b4494c4baab57fe253f8c321ca"
# URI of the form "runs:/<run_id>/model", pointing at that run's "model" path.
run_uri = f"runs:/{run_id}/model"
 
# Registers the model under the given name; the log output below shows a new
# version being created because that name already existed.
result = mlflow.register_model(
    model_uri=run_uri,
    name="test-drive-dagshub-model"
)

Registered model 'test-drive-dagshub-model' already exists. Creating a new version of this model...
2024/09/30 18:28:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: test-drive-dagshub-model, version 2
Created version '2' of model 'test-drive-dagshub-model'.
