# NYC Taxi Fare Prediction with MLflow

Now that we know not using MLflow is a bad idea, let's see how we can use MLflow to track our experiments.

In [None]:
import os
import pickle

# setuptools must be imported before mlflow for the workaround below to apply.
import setuptools  # To fix a temporary bug with MLflow. Reported in github.
import mlflow
from mlflow.data.numpy_dataset import from_numpy
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

import xgboost as xgb
import pandas as pd

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

## MLFlow Tracking

MLflow Tracking is an API and UI for logging parameters, code versions, metrics, and output files when running machine learning experiments. It allows you to:
- Log parameters (key-value pairs) and metrics (key-value pairs of numeric values) to files and to a database backend.
- Log the current software version.
- Save pickled models to files.
- Start and end runs.
- Search runs based on parameters and metrics.
- Visualize runs

MLflow Tracking is language agnostic. You can use it with any machine learning library, such as scikit-learn, Keras, PyTorch, and XGBoost.

You can start the MLflow tracking server with the following command:
```bash
mlflow server --backend-store-uri sqlite:///mlflow.db
```

In [None]:
# Address of the MLflow tracking server that every run in this notebook logs to.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Select the experiment that subsequent runs are recorded under.
mlflow.set_experiment("morethan101_nyc-taxi-experiment")

## Data Preparation
We remove the duplication from the previous notebook and keep the function that reads the data.
Using functions to separate tasks is good practice. It makes the code more readable and easier to maintain.

In [None]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    The pickup and dropoff timestamps are parsed, a ``duration`` column
    (trip length in minutes) is derived from them, trips shorter than
    1 minute or longer than 60 minutes are dropped, and the pickup/dropoff
    location ids are cast to strings.

    Args:
        filename: Location of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        An independent ``pandas.DataFrame`` holding the filtered trips with
        the extra ``duration`` column.
    """
    df = pd.read_parquet(filename)

    df["lpep_dropoff_datetime"] = pd.to_datetime(df["lpep_dropoff_datetime"])
    df["lpep_pickup_datetime"] = pd.to_datetime(df["lpep_pickup_datetime"])

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df["lpep_dropoff_datetime"] - df["lpep_pickup_datetime"]
    df["duration"] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows: assigning columns on a
    # boolean-mask slice triggers pandas' SettingWithCopy warning and may
    # not write through reliably.
    df = df[(df["duration"] >= 1) & (df["duration"] <= 60)].copy()

    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df

In [None]:
# January trips are the training set, February trips the validation set.
df_train, df_val = map(
    read_dataframe,
    (
        "./data/green_tripdata_2021-01.parquet",
        "./data/green_tripdata_2021-02.parquet",
    ),
)

In [None]:
# Row counts of the (training, validation) frames after filtering.
tuple(map(len, (df_train, df_val)))

In [None]:
# Combine pickup and dropoff location ids into one "pickup_dropoff" feature
# on both frames.
for frame in (df_train, df_val):
    frame["PU_DO"] = frame["PULocationID"] + "_" + frame["DOLocationID"]

In [None]:
# Model inputs: the combined pickup/dropoff pair is the only categorical
# feature (the individual location ids are not used), plus the trip distance.
categorical = ["PU_DO"]
numerical = ["trip_distance"]
feature_columns = categorical + numerical

train_dicts = df_train[feature_columns].to_dict(orient="records")
val_dicts = df_val[feature_columns].to_dict(orient="records")

# Fit the vectorizer on the training records only, then apply the same
# fitted mapping to the validation records.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [None]:
# The regression target is the trip duration column built in read_dataframe.
target = "duration"
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

## Experiments
We will use the same kind of model as in the previous notebook, and we will use MLflow to track the experiments.
Here a Ridge model is used, but we may have tried other models too. Now we can always find out which ones, because MLflow records every run!

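`start_run()`: Start a new run and make it the active run. If a run is already active, `start_run()` raises an exception unless you pass `nested=True`, so end the current run first (for example with `mlflow.end_run()`). The returned object can be used as a context manager, so you can use it in a `with` statement to automatically end the run when the code block is exited.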

`set_tag()`: Set a tag on the currently active run. Tags are used to tag runs for later searching. For example, you can tag your runs with the "staging" and "production" tags to later search for all runs that were performed in the staging or production environments. Or you can tag the developer who performed the run with their username.

`log_param()`: Log a parameter under the current run. Parameters are key-value pairs that you can use to record additional information about your run. For example, you can log the learning rate or regularization parameter of your model as a parameter.

`log_metric()`: Log a metric under the current run. Metrics are key-value pairs that you can use to record and compare the performance of your model over time. For example, you can log the accuracy of your model as a metric.

`log_artifact()`: Log a local file or directory as an artifact of the run. Artifacts are files or directories that you want to save along with the run information. For example, you can save a model checkpoint, a summary of the model's performance, or plots of model predictions as artifacts. Artifacts are organized as files and directories in the run's artifact URI.


In [None]:
# Train a single Ridge regression and record everything about it in one
# MLflow run: author tag, input datasets, hyperparameter, metric and model.
with mlflow.start_run():
    mlflow.set_tag("developer", "Nusret")

    # Register the datasets used by this run. The sparse feature matrices are
    # densified with .toarray() before being handed to from_numpy.
    # NOTE(review): densifying a wide one-hot matrix can be memory-hungry.
    train_dataset = from_numpy(
        X_train.toarray(),
        "data/green_tripdata_2021-01.parquet",
        name="green_tripdata_2021-01",
    )
    mlflow.log_input(train_dataset, "train")

    test_dataset = from_numpy(
        X_val.toarray(),
        "data/green_tripdata_2021-02.parquet",
        name="green_tripdata_2021-02",
    )
    mlflow.log_input(test_dataset, "test")

    alpha = 0.006

    model = Ridge(alpha=alpha)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and later removed, so it is not portable.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    # The signature captures the model's input/output schema alongside the model.
    signature = infer_signature(X_val, y_pred)

    mlflow.log_param("alpha", alpha)
    mlflow.log_param("model", model.__class__.__name__)
    mlflow.log_metric("rmse", rmse)
    mlflow.sklearn.log_model(model, "model", signature=signature)

## Hyperparameter Tuning
We will use the hyperopt library to tune the hyperparameters of the model, and MLflow to track every trial.

In [None]:
# Switch the active experiment so the runs that follow are grouped under
# "nyc-taxi-experiment" rather than the experiment selected earlier.
mlflow.set_experiment("nyc-taxi-experiment")

In [None]:
# Wrap the feature matrices and targets in XGBoost's DMatrix container.
train, valid = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [None]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and log it to MLflow.

    Each call opens its own MLflow run, logs the sampled hyperparameters,
    trains on the module-level ``train`` DMatrix with early stopping against
    the module-level ``valid`` DMatrix, and logs the validation RMSE.

    Args:
        params: Dictionary of XGBoost training parameters sampled by hyperopt.

    Returns:
        A dict with the validation RMSE under ``"loss"`` and ``STATUS_OK``
        under ``"status"``, as returned to ``fmin``.
    """
    with mlflow.start_run():
        mlflow.log_param("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=2000,
            evals=[(valid, "validation")],
            early_stopping_rounds=50,
        )
        y_pred = booster.predict(valid)
        # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
        # scikit-learn 1.4 and later removed, so it is not portable.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {"loss": rmse, "status": STATUS_OK}

In [None]:
# First, wide exploratory search. The log-uniform hyperparameters are listed
# as (low, high) bounds so the ranges are easy to compare between searches.
loguniform_bounds = {
    "learning_rate": (-3, 0),
    "reg_alpha": (-5, -1),
    "reg_lambda": (-6, -1),
    "min_child_weight": (-1, 3),
}

search_space = {
    "max_depth": scope.int(hp.quniform("max_depth", 4, 100, 1)),
    **{
        label: hp.loguniform(label, low, high)
        for label, (low, high) in loguniform_bounds.items()
    },
    "objective": "reg:squarederror",
    "seed": 42,
}

# Trial history collected by this search.
trials = Trials()

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
)

In [None]:
# Second search over narrower ranges. The log-uniform hyperparameters are
# listed as (low, high) bounds.
loguniform_bounds = {
    "learning_rate": (-2, -1),
    "reg_alpha": (-6, -3),
    "reg_lambda": (-6, -1),
    "min_child_weight": (-3, 1),
}

search_space = {
    "max_depth": scope.int(hp.quniform("max_depth", 10, 40, 1)),
    **{
        label: hp.loguniform(label, low, high)
        for label, (low, high) in loguniform_bounds.items()
    },
    "objective": "reg:squarederror",
    "seed": 42,
}

# NOTE(review): `trials` is the same object used by the previous search, so
# its earlier evaluations are carried over; presumably they also count
# toward max_evals - confirm that this continuation is intended.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials,
)

This image is what you will see in MLflow UI.

![mlflow-ui](images/mlflow_ui.png)

And this is the result of the hyperparameter tuning for comparing the results. When you look closely, you will see that you can reduce the interval of some hyperparameters.


![mlflow-ui](images/mlflow_ui_2.png)


## Train the best model and save the model this time!

In [None]:
# Client bound to the same tracking server the runs were logged to.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
# Display the experiments known to the tracking server.
client.search_experiments()

In [None]:
# Resolve the experiment by name rather than a hard-coded numeric id: ids are
# assigned by the tracking server and differ between installations.
experiment = client.get_experiment_by_name("nyc-taxi-experiment")
if experiment is None:
    raise RuntimeError(
        "Experiment 'nyc-taxi-experiment' not found on the tracking server; "
        "run the hyperparameter search first."
    )

# Fetch the single best (lowest RMSE) XGBoost run. Restricting the search to
# runs logged with model = 'xgboost' guarantees the parameters retrieved
# below are XGBoost hyperparameters.
runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="metrics.rmse < 7 and params.model = 'xgboost'",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.rmse ASC"],
)
if not runs:
    raise RuntimeError(
        "No active xgboost run with rmse < 7 found in 'nyc-taxi-experiment'."
    )

# Parameters of the best run, as logged (MLflow returns them as strings).
params = runs[0].data.params
params

In [None]:
def _coerce_param(value):
    """Convert a parameter string read back from MLflow to int or float.

    Values that are not numeric (for example the objective name) are
    returned unchanged.
    """
    for cast in (int, float):
        try:
            return cast(value)
        except (TypeError, ValueError):
            continue
    return value


# "model" is a bookkeeping parameter, not an XGBoost hyperparameter; pop()
# tolerates a run that never logged it (a plain `del` would raise KeyError).
params.pop("model", None)

with mlflow.start_run():
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    # MLflow returns every parameter as a string. str.isdigit() is False for
    # decimals such as "0.1", so try a real numeric conversion instead.
    best_params = {name: _coerce_param(value) for name, value in params.items()}
    best_params["seed"] = int(best_params["seed"])
    best_params["max_depth"] = int(best_params["max_depth"])

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, "validation")],
        early_stopping_rounds=50,
    )

    y_pred = booster.predict(valid)
    # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and later removed, so it is not portable.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted DictVectorizer next to the model so the same
    # preprocessing can be reapplied later. Create the target directory
    # first so the cell also works on a fresh checkout.
    os.makedirs("models", exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

## Auto-logging

Auto-logging captures lots of information about model parameters for you! You can see the parameters and metrics in the UI after you run the code below.

In [None]:
# Use a dedicated experiment for the auto-logged scikit-learn runs.
mlflow.set_experiment("mlops_taxi_fare_modeling_sklearn")

In [None]:
# Enable scikit-learn auto-logging, then train several regressors, each in
# its own MLflow run.
mlflow.sklearn.autolog()

models = [Lasso(), Ridge(), LinearRegression(), DecisionTreeRegressor()]

for model in models:
    with mlflow.start_run():
        print(model.__class__.__name__)
        # Record the data files actually used: the notebook reads the
        # parquet files, not CSVs.
        mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.parquet")
        mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.parquet")
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        model.fit(X_train.toarray(), y_train)

        y_pred = model.predict(X_val.toarray())
        # RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
        # scikit-learn 1.4 and later removed, so it is not portable.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)