<a href="https://colab.research.google.com/github/Primary43/TripDuration-Prediction-based-on-Locational-cluster/blob/main/03_Model_MLFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install mlflow --quiet
! databricks configure --host https://community.cloud.databricks.com/

In [8]:
! pip install pyspark --quiet
! pip install shap --quiet

In [12]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [13]:
import pyspark.pandas as ps
import pandas as pd
import shap



In [14]:
# Read the Parquet file into a pandas-on-Spark DataFrame
X_train_df = pd.read_parquet('/content/gdrive/MyDrive/Colab Notebooks/taxi_duration/training_data/X_train_df.parquet')
X_test_df = pd.read_parquet('/content/gdrive/MyDrive/Colab Notebooks/taxi_duration/training_data/X_test_df.parquet')
X_train_df_geo = pd.read_parquet('/content/gdrive/MyDrive/Colab Notebooks/taxi_duration/training_data/X_train_df_geo.parquet')
X_test_df_geo = pd.read_parquet('/content/gdrive/MyDrive/Colab Notebooks/taxi_duration/training_data/X_test_df_geo.parquet')
y_train_log = pd.read_parquet('/content/gdrive/MyDrive/Colab Notebooks/taxi_duration/training_data/y_train_log.parquet')
y_test_log = pd.read_parquet('/content/gdrive/MyDrive/Colab Notebooks/taxi_duration/training_data/y_test_log.parquet')

As the assumptions required for optimal linear regression performance are not fully met, we will condict robust regression methods to improve a resilience for outliers and violations of regression assumptions, including LASSO, Ridge and Elastic Net

In [15]:
import mlflow
mlflow.set_tracking_uri("databricks")

In [20]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
import numpy as np
import xgboost as xgb



model_classes = {
    #"Ridge": Ridge(),
    #"Lasso": Lasso(),
    #"ElasticNet": ElasticNet(),
    "XGBoost": xgb.XGBRegressor(objective='reg:squarederror')
}

# Adjusted R2 Function
def adjusted_r2(r2, n, p):
    return 1 - (1 - r2) * (n - 1) / (n - p - 1)


def log_model_metrics(test_x, test_y, test_predictions):
    metrics = {
        'Mean Absolute Error': mae(test_y, test_predictions),
        'Mean Squared Error': mse(test_y, test_predictions),
        'R2': r2_score(test_y, test_predictions),
        'Adjusted R2': adjusted_r2(r2_score(test_y, test_predictions), test_x.shape[0], test_x.shape[1])
    }
    for key, value in metrics.items():
        mlflow.log_metric(key, value)  # Log metric
        print(f"{key}: {value}")


# Model Training and Evaluation Function for regression
def train_and_evaluate_regression(model, model_name, train_x, train_y, test_x, test_y, alphas, n_splits=5):

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    best_alpha = None
    best_score = float('inf')

    for alpha in alphas:
        model.alpha = alpha
        mae_scores = cross_val_score(model, train_x, train_y, scoring='neg_mean_absolute_error', cv=kf)
        mean_mae_score = -np.mean(mae_scores)

        if mean_mae_score < best_score:
            best_score = mean_mae_score
            best_alpha = alpha

    print(f"Best alpha for {model_name}: {best_alpha}")

    # Train and log the best model
    model.alpha = best_alpha
    model.fit(train_x, train_y)
    mlflow.log_param("alpha", best_alpha) # Log alpha

    # Evaluate the model
    test_predictions = model.predict(test_x)
    metrics = log_model_metrics(test_x, test_y, test_predictions)

    # Log the model
    mlflow.sklearn.log_model(model, f"{model_name}_model") # Log model

    return metrics

# Model Training and Evaluation Function for ElasticNet
def train_and_evaluate_ElasticNet(model, model_name, train_x, train_y, test_x, test_y, elastic_net_params):
    best_mae = float('inf')
    best_params = {}

    for alpha in elastic_net_params['alpha']:
        for l1_ratio in elastic_net_params['l1_ratio']:
            model.set_params(alpha=alpha, l1_ratio=l1_ratio)
            model.fit(train_x, train_y)
            predictions = model.predict(test_x)
            current_mae = mae(test_y, predictions)

            if current_mae < best_mae:
                best_mae = current_mae
                best_params = {'alpha': alpha, 'l1_ratio': l1_ratio}

    # Evaluate and log the best model
    model.set_params(**best_params)
    model.fit(train_x, train_y)

    # Log best parameters for XGBoost
    mlflow.log_param("best_params", best_params)  # Log best parameters

    # Evaluate the model
    test_predictions = model.predict(test_x)
    metrics = log_model_metrics(test_x, test_y, test_predictions)

    # Log the model
    mlflow.sklearn.log_model(model, f"{model_name}_model")  # Log model

    return metrics

# Model Training and Evaluation Function for xgboost
def train_and_evaluate_xgboost(model, model_name, train_x, train_y, test_x, test_y, xgb_params):
    best_mae = float('inf')
    best_params = {}

    for lr in xgb_params['learning_rate']:
        for n_est in xgb_params['n_estimators']:
            for md in xgb_params['max_depth']:
                model.set_params(learning_rate=lr, n_estimators=n_est, max_depth=md)
                model.fit(train_x, train_y)
                predictions = model.predict(test_x)
                current_mae = mae(test_y, predictions)

                if current_mae < best_mae:
                    best_mae = current_mae
                    best_params = {'learning_rate': lr, 'n_estimators': n_est, 'max_depth': md}

    # Evaluate and log the best model
    model.set_params(**best_params)
    model.fit(train_x, train_y)

    # Log best parameters for XGBoost
    mlflow.log_param("best_params", best_params)  # Log best parameters

    # Evaluate the model
    test_predictions = model.predict(test_x)
    metrics = log_model_metrics(test_x, test_y, test_predictions)

    # Log the model
    mlflow.sklearn.log_model(model, f"{model_name}_model")  # Log model

    plot_feature_importance(model, X_train[feature_set])
    plot_residuals(y_test, test_predictions)
    return metrics


def run_experiments(X_train, X_test, y_train, y_test, feature_set_name, feature_set, model_classes, n_splits=5):
    # For Regression
    alphas = [0.005, 0.001, 0.01, 0.5, 0.1, 1.0]

    # For XGBoost
    xgb_params = {
        'learning_rate': [0.01, 0.1, 0.3],
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 6, 9]
    }

    elastic_net_params = {
                    'alpha': [0.001, 0.01, 0.1, 1, 10],
                    'l1_ratio': [0.1, 0.5, 0.9]
                }

    # Select the subset of features for the current feature set
    X_train_subset = X_train[feature_set]
    X_test_subset = X_test[feature_set]
    # Setup experiement
    mlflow.set_experiment(f"/{_________________}}/{feature_set_name}")
    print(f"Running experiment: {feature_set_name}")

    for model_name, model in model_classes.items():
        with mlflow.start_run(run_name=f"{model_name}"):
          # Log the feature set used as a parameter
            mlflow.log_param("features_used", feature_set)
            mlflow.log_param("features_set", feature_set_name)

            # Train the model and log the results
            if model_name == 'XGBoost':
              metrics = train_and_evaluate_xgboost(model, model_name, X_train_subset, y_train, X_test_subset, y_test, xgb_params)

            elif model_name == 'ElasticNet':
              metrics = train_and_evaluate_ElasticNet(model, model_name, X_train_subset, y_train, X_test_subset, y_test, elastic_net_params)

            else:
              metrics = train_and_evaluate_regression(model, model_name, X_train_subset, y_train, X_test_subset, y_test, alphas, n_splits)

In [56]:
AllFeatures = ['passenger_count', 'distance_km', 'week_of_year', 'vendor_id_2',
       'store_and_fwd_flag_Y', 'hour_bin_13.00-17.59', 'hour_bin_18.00-21.59',
       'hour_bin_22.00-23.59', 'hour_bin_5.00-8.59', 'hour_bin_9.00-12.58',
       'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'day_of_week_5', 'day_of_week_6', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6']

feature_sets = {
    'AllFeatures': AllFeatures}

# Set the MLflow experiment name based on the feature set
for feature_set_name, feature_set in feature_sets.items():
    run_experiments(X_train_df, X_test_df, y_train_log, y_test_log, feature_set_name, feature_set, model_classes)

2024/01/22 19:40:16 INFO mlflow.tracking.fluent: Experiment with name '/Users/peppa.mint_glrrr@windowslive.com/AllFeatures' does not exist. Creating a new experiment.


Running experiment: AllFeatures
Best alpha for Ridge: 0.001
Mean Absolute Error: 0.1813695185434205
Mean Squared Error: 0.0538694690658953
R2: 0.6452899890850932




Adjusted R2: 0.6452388107093737


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best alpha for Lasso: 0.001
Mean Absolute Error: 0.18188228569050596
Mean Squared Error: 0.05411005837828389




R2: 0.6437057997640703
Adjusted R2: 0.6436543928178742


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Mean Absolute Error: 0.18140892614967685
Mean Squared Error: 0.05388549319644798




R2: 0.645184476266389
Adjusted R2: 0.6451332826670375


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Mean Absolute Error: 0.1569654309295465
Mean Squared Error: 0.04223048660654432




R2: 0.7219282717112816
Adjusted R2: 0.721888150883142


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [57]:
AllFeatures_Geo = ['passenger_count', 'distance_km', 'week_of_year', 'vendor_id_2',
       'store_and_fwd_flag_Y', 'hour_bin_13.00-17.59', 'hour_bin_18.00-21.59',
       'hour_bin_22.00-23.59', 'hour_bin_5.00-8.59', 'hour_bin_9.00-12.58',
       'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'day_of_week_5', 'day_of_week_6', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'pickup_cluster', 'dropoff_cluster']

feature_sets = {
    'AllFeatures_Geo': AllFeatures_Geo}

for feature_set_name, feature_set in feature_sets.items():
    run_experiments(X_train_df_geo, X_test_df_geo, y_train_log, y_test_log, feature_set_name, feature_set, model_classes)

2024/01/22 19:48:31 INFO mlflow.tracking.fluent: Experiment with name '/Users/peppa.mint_glrrr@windowslive.com/AllFeatures_Geo' does not exist. Creating a new experiment.


Running experiment: AllFeatures_Geo
Best alpha for Ridge: 0.001
Mean Absolute Error: 0.1809986653576032
Mean Squared Error: 0.0536877425628519




R2: 0.646486589144381
Adjusted R2: 0.6464307249608947


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best alpha for Lasso: 0.001
Mean Absolute Error: 0.18156152175107249
Mean Squared Error: 0.05395328920037203




R2: 0.6447380653084032
Adjusted R2: 0.6446819248133164


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Mean Absolute Error: 0.18104152122444667
Mean Squared Error: 0.05370534996597032




R2: 0.6463706510766662
Adjusted R2: 0.6463147685719925


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Mean Absolute Error: 0.13644279650489094
Mean Squared Error: 0.03323852058221538
R2: 0.7811369556271117




Adjusted R2: 0.7811023696541508


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [58]:
SelectedFeatures_Geo = ['passenger_count', 'distance_km', 'week_of_year', 'vendor_id_2',
       'store_and_fwd_flag_Y', 'hour_bin_13.00-17.59', 'hour_bin_18.00-21.59',
       'hour_bin_22.00-23.59', 'hour_bin_5.00-8.59', 'hour_bin_9.00-12.58',
       'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'day_of_week_5', 'day_of_week_6', 'pickup_cluster', 'dropoff_cluster']

feature_sets = {
    'SelectedFeatures_Geo': SelectedFeatures_Geo}

for feature_set_name, feature_set in feature_sets.items():
    run_experiments(X_train_df_geo, X_test_df_geo, y_train_log, y_test_log, feature_set_name, feature_set, model_classes)

2024/01/22 19:57:41 INFO mlflow.tracking.fluent: Experiment with name '/Users/peppa.mint_glrrr@windowslive.com/SelectedFeatures_Geo' does not exist. Creating a new experiment.


Running experiment: SelectedFeatures_Geo
Best alpha for Ridge: 0.001
Mean Absolute Error: 0.18101055777603564
Mean Squared Error: 0.053704625678649684




R2: 0.6463754202338134
Adjusted R2: 0.6463316881918777


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Best alpha for Lasso: 0.001
Mean Absolute Error: 0.18156741185136688
Mean Squared Error: 0.053957886909208407




R2: 0.6447077911404968
Adjusted R2: 0.6446638528662185


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Mean Absolute Error: 0.1810562936368563
Mean Squared Error: 0.053722097781760035




R2: 0.6462603730653069
Adjusted R2: 0.6462166267957188


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Mean Absolute Error: 0.1364328422024714
Mean Squared Error: 0.03323384402921406




R2: 0.7811677489539239
Adjusted R2: 0.7811406864087073


Uploading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [37]:
mlflow.end_run()
