In [2]:
import hopsworks
from hsfs.client.exceptions import RestAPIError

try:
    # Establish connection to Hopsworks
    project = hopsworks.login(
        api_key_value="O4IOxWozstKu0BFQ.07C1tbvgVI5C4XNLbLrGH4PS4t0EqBYN00ex8318TNIkl82WwDi3Vh9MidMrCA83"
    )
    print("Successfully connected to Hopsworks.")
except Exception as e:
    print("Failed to connect to Hopsworks.")
    print(e)
    exit(1)  # Exit if the connection fails

try:
    # Access the Feature Store
    fs = project.get_feature_store()

    # Retrieve the feature group by name and version
    feature_group = fs.get_feature_group("final_merge_df", version=1)

    # Read the feature group as a DataFrame
    df = feature_group.read()

    # Print confirmation and the first few rows of the DataFrame
    print("Downloaded feature group: final_merge_df (version 1)")
    print(df.head())

except RestAPIError as e:
    print("Error downloading feature group: final_merge_df (version 1)")
    print(e)
except Exception as e:
    print("An unexpected error occurred.")
    print(e)


Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1074326
Successfully connected to Hopsworks.

Connected. Call `.close()` to terminate connection gracefully.
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.61s) 
Downloaded feature group: final_merge_df (version 1)
   truck_id    route_id            departure_date         estimated_arrival  \
0  21999423  R-65cfb635 2019-01-01 07:00:00+00:00 2019-01-01 12:00:00+00:00   
1  25951646  R-05f9858a 2019-01-10 07:00:00+00:00 2019-01-11 01:00:00+00:00   
2  31766919  R-08c77b90 2019-01-26 07:00:00+00:00 2019-01-27 08:00:00+00:00   
3  33921946  R-5cc73966 2019-01-16 07:00:00+00:00 2019-01-16 08:00:00+00:00   
4  13792231  R-71af6d34 2019-01-13 07:00:00+00:00 2019-01-13 18:00:00+00:00   

   delay  route_avg_temp  route_avg_wind_speed  route_avg_precip  \
0      0       48.000000              8.000000               0.0   
1     

In [3]:
df.isnull().sum()

truck_id                          0
route_id                          0
departure_date                    0
estimated_arrival                 0
delay                             0
route_avg_temp                    0
route_avg_wind_speed              0
route_avg_precip                  0
route_avg_humidity                0
route_avg_visibility              0
route_avg_pressure                0
route_description                 0
estimated_arrival_nearest_hour    0
departure_date_nearest_hour       0
origin_id                         0
destination_id                    0
distance                          0
average_hours                     0
temp_x                            0
wind_speed_x                      0
description_x                     0
precip_x                          0
humidity_x                        0
visibility_x                      0
pressure_x                        0
temp_y                            0
wind_speed_y                      0
description_y               

Checking the date range

In [4]:
final_merge = df

In [5]:
final_merge['estimated_arrival'].min(), final_merge['estimated_arrival'].max()

(Timestamp('2019-01-01 07:00:00+0000', tz='UTC'),
 Timestamp('2019-02-13 06:00:00+0000', tz='UTC'))

Splitting the data into training, validation, and test sets based on date

In [6]:
final_merge.columns


Index(['truck_id', 'route_id', 'departure_date', 'estimated_arrival', 'delay',
       'route_avg_temp', 'route_avg_wind_speed', 'route_avg_precip',
       'route_avg_humidity', 'route_avg_visibility', 'route_avg_pressure',
       'route_description', 'estimated_arrival_nearest_hour',
       'departure_date_nearest_hour', 'origin_id', 'destination_id',
       'distance', 'average_hours', 'temp_x', 'wind_speed_x', 'description_x',
       'precip_x', 'humidity_x', 'visibility_x', 'pressure_x', 'temp_y',
       'wind_speed_y', 'description_y', 'precip_y', 'humidity_y',
       'visibility_y', 'pressure_y', 'avg_no_of_vehicles', 'accident',
       'truck_age', 'load_capacity_pounds', 'mileage_mpg', 'fuel_type',
       'driver_id', 'name', 'gender', 'age', 'experience', 'driving_style',
       'ratings', 'vehicle_no', 'average_speed_mph', 'is_midnight',
       'unique_id', 'event_time'],
      dtype='object')

In [7]:
# Ensure the suffixes '_x' and '_y' are replaced with '_origin' and '_destination'
final_merge.columns = final_merge.columns.str.replace('_x$', '_origin', regex=True)
final_merge.columns = final_merge.columns.str.replace('_y$', '_destination', regex=True)

# Optional: Verify the changes
print("Updated Column Names:")
print(df.columns)

Updated Column Names:
Index(['truck_id', 'route_id', 'departure_date', 'estimated_arrival', 'delay',
       'route_avg_temp', 'route_avg_wind_speed', 'route_avg_precip',
       'route_avg_humidity', 'route_avg_visibility', 'route_avg_pressure',
       'route_description', 'estimated_arrival_nearest_hour',
       'departure_date_nearest_hour', 'origin_id', 'destination_id',
       'distance', 'average_hours', 'temp_origin', 'wind_speed_origin',
       'description_origin', 'precip_origin', 'humidity_origin',
       'visibility_origin', 'pressure_origin', 'temp_destination',
       'wind_speed_destination', 'description_destination',
       'precip_destination', 'humidity_destination', 'visibility_destination',
       'pressure_destination', 'avg_no_of_vehicles', 'accident', 'truck_age',
       'load_capacity_pounds', 'mileage_mpg', 'fuel_type', 'driver_id', 'name',
       'gender', 'age', 'experience', 'driving_style', 'ratings', 'vehicle_no',
       'average_speed_mph', 'is_midnight', 

Updated Code to Rename Columns

In [8]:
# Dictionary mapping actual column names to expected column names
rename_mapping = {
    'origin_temp': 'temp_origin',
    'origin_wind_speed': 'wind_speed_origin',
    'origin_precip': 'precip_origin',
    'origin_humidity': 'humidity_origin',
    'origin_visibility': 'visibility_origin',
    'origin_pressure': 'pressure_origin',
    'destination_temp': 'temp_destination',
    'destination_wind_speed': 'wind_speed_destination',
    'destination_precip': 'precip_destination',
    'destination_humidity': 'humidity_destination',
    'destination_visibility': 'visibility_destination',
    'destination_pressure': 'pressure_destination',
    'description_origin': 'origin_description',
    'description_destination': 'destination_description'
}

# Apply the renaming to the DataFrame
final_merge = final_merge.rename(columns=rename_mapping)

# Optional: Verify the changes
print("Updated Column Names:")
print(final_merge.columns)


Updated Column Names:
Index(['truck_id', 'route_id', 'departure_date', 'estimated_arrival', 'delay',
       'route_avg_temp', 'route_avg_wind_speed', 'route_avg_precip',
       'route_avg_humidity', 'route_avg_visibility', 'route_avg_pressure',
       'route_description', 'estimated_arrival_nearest_hour',
       'departure_date_nearest_hour', 'origin_id', 'destination_id',
       'distance', 'average_hours', 'temp_origin', 'wind_speed_origin',
       'origin_description', 'precip_origin', 'humidity_origin',
       'visibility_origin', 'pressure_origin', 'temp_destination',
       'wind_speed_destination', 'destination_description',
       'precip_destination', 'humidity_destination', 'visibility_destination',
       'pressure_destination', 'avg_no_of_vehicles', 'accident', 'truck_age',
       'load_capacity_pounds', 'mileage_mpg', 'fuel_type', 'driver_id', 'name',
       'gender', 'age', 'experience', 'driving_style', 'ratings', 'vehicle_no',
       'average_speed_mph', 'is_midnight', 

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [10]:
cts_cols=['route_avg_temp', 'route_avg_wind_speed',
              'route_avg_precip', 'route_avg_humidity', 'route_avg_visibility',
              'route_avg_pressure', 'distance', 'average_hours',
              'origin_temp', 'origin_wind_speed', 'origin_precip', 'origin_humidity',
              'origin_visibility', 'origin_pressure',
              'destination_temp','destination_wind_speed','destination_precip',
              'destination_humidity', 'destination_visibility','destination_pressure',
               'avg_no_of_vehicles', 'truck_age','load_capacity_pounds', 'mileage_mpg',
               'age', 'experience','average_speed_mph']

cat_cols=['route_description',
              'origin_description', 'destination_description',
               'accident', 'fuel_type',
              'gender', 'driving_style', 'ratings','is_midnight']

target=['delay']

In [11]:
#Checking the date range
final_merge['estimated_arrival'].min(), final_merge['estimated_arrival'].max()

(Timestamp('2019-01-01 07:00:00+0000', tz='UTC'),
 Timestamp('2019-02-13 06:00:00+0000', tz='UTC'))

Splitting the data into training, validation, and test sets based on date

In [12]:
# Step 1: Ensure 'estimated_arrival' is timezone-aware (UTC)
final_merge['estimated_arrival'] = final_merge['estimated_arrival'].dt.tz_convert('UTC')

In [13]:
# Create a UTC-aware comparison timestamp
comparison_date = pd.to_datetime('2019-01-30', utc=True)

# Filter the DataFrame based on the comparison
train_df = final_merge[final_merge['estimated_arrival'] <= comparison_date]

# Optional: Display the result to verify
print(train_df.head())


   truck_id    route_id            departure_date         estimated_arrival  \
0  21999423  R-65cfb635 2019-01-01 07:00:00+00:00 2019-01-01 12:00:00+00:00   
1  25951646  R-05f9858a 2019-01-10 07:00:00+00:00 2019-01-11 01:00:00+00:00   
2  31766919  R-08c77b90 2019-01-26 07:00:00+00:00 2019-01-27 08:00:00+00:00   
3  33921946  R-5cc73966 2019-01-16 07:00:00+00:00 2019-01-16 08:00:00+00:00   
4  13792231  R-71af6d34 2019-01-13 07:00:00+00:00 2019-01-13 18:00:00+00:00   

   delay  route_avg_temp  route_avg_wind_speed  route_avg_precip  \
0      0       48.000000              8.000000               0.0   
1      0       48.400000              6.200000               0.0   
2      0       36.833333             15.166667               0.0   
3      0       58.000000              7.000000               0.0   
4      0       40.000000              6.000000               0.0   

   route_avg_humidity  route_avg_visibility  ...  gender   age experience  \
0           58.500000                  

In [14]:
# Create UTC-aware comparison timestamps
start_date = pd.to_datetime('2019-01-30', utc=True)
end_date = pd.to_datetime('2019-02-07', utc=True)

# Filter the DataFrame based on the date range
validation_df = final_merge[
    (final_merge['estimated_arrival'] > start_date) & 
    (final_merge['estimated_arrival'] <= end_date)
]

In [15]:
# Create a UTC-aware timestamp for comparison
comparison_date_1 = pd.to_datetime('2019-02-07', utc=True)

# Filter the DataFrame to get the test set
test_df = final_merge[final_merge['estimated_arrival'] > comparison_date_1]


In [16]:
# Ensure that only available columns are used to avoid KeyError
available_cts_cols = [col for col in cts_cols if col in train_df.columns]
available_cat_cols = [col for col in cat_cols if col in train_df.columns]

# Combine continuous and categorical columns that are present
selected_columns = available_cts_cols + available_cat_cols

# Ensure there are columns to use
if not selected_columns:
    print("No valid columns found for training.")
else:
    # Select the available columns from the DataFrame
    X_train = train_df[selected_columns]
    y_train = train_df['delay']  # Assuming 'delay' is the target variable

    # Optional: Print the shape to confirm selection
    print(f"X_train shape: {X_train.shape}")


X_train shape: (7296, 24)


In [17]:
y_train=train_df['delay']

In [18]:
# Ensure only the columns that are present in validation_df are selected
available_cts_cols = [col for col in cts_cols if col in validation_df.columns]
available_cat_cols = [col for col in cat_cols if col in validation_df.columns]

# Combine available continuous and categorical columns
selected_columns = available_cts_cols + available_cat_cols

# Check if any valid columns are available
if not selected_columns:
    raise ValueError("No valid columns found in validation_df for feature selection.")

# Select the columns from validation_df
X_valid = validation_df[selected_columns]

# Select the target variable (assuming 'delay' is the target)
y_valid = validation_df['delay']

# Print the shape to confirm
print(f"X_valid shape: {X_valid.shape}")
print(f"y_valid shape: {y_valid.shape}")

X_valid shape: (1949, 24)
y_valid shape: (1949,)


In [19]:
y_valid = validation_df['delay']

In [20]:
# Ensure only available columns are selected from test_df
available_cts_cols = [col for col in cts_cols if col in test_df.columns]
available_cat_cols = [col for col in cat_cols if col in test_df.columns]

# Combine available continuous and categorical columns
selected_columns = available_cts_cols + available_cat_cols

# Check if any valid columns are available
if not selected_columns:
    raise ValueError("No valid columns found in test_df for feature selection.")

# Select the available columns for X_test
X_test = test_df[selected_columns]

# Select the target variable (assuming 'delay' is the target)
y_test = test_df['delay']

# Optional: Print the shape to confirm
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")


X_test shape: (1227, 24)
y_test shape: (1227,)


In [21]:
y_test=test_df['delay']

**Encoding

In [22]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [23]:
# Step 1: Define the columns to be one-hot encoded
encode_columns = ['route_description', 'origin_description', 'destination_description', 'fuel_type', 'gender', 'driving_style']


In [24]:
# Step 2: Initialize the OneHotEncoder with updated parameter
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [25]:
# Step 3: Fit the encoder on the training data
encoder.fit(X_train[encode_columns])

Generating names for the new one-hot encoded features

In [26]:
encoded_features = list(encoder.get_feature_names_out(encode_columns))

- Transforming the training, validation, and test sets

In [27]:
X_train[encoded_features] = encoder.transform(X_train[encode_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

In [28]:
X_valid[encoded_features] = encoder.transform(X_valid[encode_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

In [29]:
X_test[encoded_features] = encoder.transform(X_test[encode_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

- Dropping the original categorical features

In [30]:
X_train = X_train.drop(encode_columns, axis=1)

In [31]:
X_valid = X_valid.drop(encode_columns, axis=1)

In [32]:
X_test = X_test.drop(encode_columns, axis=1)

Scaling Numerical Features

In [33]:
from sklearn.preprocessing import StandardScaler

# Step 1: Initialize the StandardScaler
scaler = StandardScaler()

# Step 2: Identify numerical columns to scale
# Ensure we only scale the encoded categorical columns and other numerical columns if needed.
columns_to_scale = X_train.columns  # Adjust if you only want specific columns

# Step 3: Fit the scaler on X_train and transform X_train, X_valid, and X_test
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train[columns_to_scale]), 
                              columns=columns_to_scale, index=X_train.index)

X_valid_scaled = pd.DataFrame(scaler.transform(X_valid[columns_to_scale]), 
                              columns=columns_to_scale, index=X_valid.index)

X_test_scaled = pd.DataFrame(scaler.transform(X_test[columns_to_scale]), 
                             columns=columns_to_scale, index=X_test.index)

# Optional: Verify the shapes
print(f"X_train shape after scaling: {X_train_scaled.shape}")
print(f"X_valid shape after scaling: {X_valid_scaled.shape}")
print(f"X_test shape after scaling: {X_test_scaled.shape}")


X_train shape after scaling: (7296, 132)
X_valid shape after scaling: (1949, 132)
X_test shape after scaling: (1227, 132)


In [34]:
pip install mlflow scikit-learn xgboost

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [45]:
X_train_scaled.columns


Index(['route_avg_temp', 'route_avg_wind_speed', 'route_avg_precip',
       'route_avg_humidity', 'route_avg_visibility', 'route_avg_pressure',
       'distance', 'average_hours', 'avg_no_of_vehicles', 'truck_age',
       ...
       'destination_description_Patchy snow possible',
       'destination_description_Sunny',
       'destination_description_Thundery outbreaks possible',
       'destination_description_Torrential rain shower', 'fuel_type_diesel',
       'fuel_type_gas', 'gender_female', 'gender_male',
       'driving_style_conservative', 'driving_style_proactive'],
      dtype='object', length=132)

In [35]:
#Validation Dataset
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Define X_train, X_valid, y_train, y_valid (use your existing datasets)
X = X_train_scaled  # Assuming scaled features
y = y_train

# Split the data further for GridSearch if needed
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize MLflow
mlflow.set_experiment("ML Models with Hyperparameter Tuning")

# Define a function to train and log models with MLflow
def train_and_evaluate_model(model, param_grid, model_name):
    with mlflow.start_run(run_name=model_name):
        # GridSearchCV for hyperparameter tuning
        grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=2)
        grid_search.fit(X_train_split, y_train_split)

        # Get the best model from GridSearch
        best_model = grid_search.best_estimator_

        # Predict on validation data
        y_pred = best_model.predict(X_val_split)

        # Evaluate performance
        acc = accuracy_score(y_val_split, y_pred)
        f1 = f1_score(y_val_split, y_pred, average='weighted')

        # Log parameters, metrics, and model
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metrics({"accuracy": acc, "f1_score": f1})
        mlflow.sklearn.log_model(best_model, model_name)

        # Print the results
        print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
        print(f"Accuracy: {acc}, F1 Score: {f1}")
        print(classification_report(y_val_split, y_pred))

# Hyperparameter grids for each model
logreg_params = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear'],
    'max_iter': [100, 200]
}

rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

xgb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10]
}

# Train and evaluate models
train_and_evaluate_model(LogisticRegression(), logreg_params, "Logistic Regression")
train_and_evaluate_model(RandomForestClassifier(), rf_params, "Random Forest")
train_and_evaluate_model(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_params, "XGBoost")


Fitting 3 folds for each of 12 candidates, totalling 36 fits





Best Parameters for Logistic Regression: {'C': 0.1, 'max_iter': 100, 'solver': 'lbfgs'}
Accuracy: 0.7493150684931507, F1 Score: 0.7228845969625737
              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1020
           1       0.66      0.35      0.46       440

    accuracy                           0.75      1460
   macro avg       0.71      0.64      0.65      1460
weighted avg       0.73      0.75      0.72      1460

Fitting 3 folds for each of 36 candidates, totalling 108 fits





Best Parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.7808219178082192, F1 Score: 0.7618291754685148
              precision    recall  f1-score   support

           0       0.79      0.93      0.86      1020
           1       0.73      0.43      0.54       440

    accuracy                           0.78      1460
   macro avg       0.76      0.68      0.70      1460
weighted avg       0.77      0.78      0.76      1460

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Parameters: { "use_label_encoder" } are not used.







Best Parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 50}
Accuracy: 0.7849315068493151, F1 Score: 0.765700426854712
              precision    recall  f1-score   support

           0       0.79      0.94      0.86      1020
           1       0.74      0.44      0.55       440

    accuracy                           0.78      1460
   macro avg       0.77      0.69      0.70      1460
weighted avg       0.78      0.78      0.77      1460



In [36]:
# #Accuracy for the validation dataset
# from sklearn.metrics import classification_report, accuracy_score, f1_score

# # Function to train and evaluate the model, including validation accuracy
# def train_and_evaluate_model(model, param_grid, model_name):
#     with mlflow.start_run(run_name=model_name):
#         # GridSearchCV for hyperparameter tuning
#         grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=2)
#         grid_search.fit(X_train_split, y_train_split)

#         # Get the best model from GridSearch
#         best_model = grid_search.best_estimator_

#         # Predict on validation data
#         y_val_pred = best_model.predict(X_val_split)

#         # Calculate validation accuracy and F1 score
#         val_accuracy = accuracy_score(y_val_split, y_val_pred)
#         val_f1 = f1_score(y_val_split, y_val_pred, average='weighted')

#         # Log parameters, metrics, and model with MLflow
#         mlflow.log_params(grid_search.best_params_)
#         mlflow.log_metrics({"val_accuracy": val_accuracy, "val_f1_score": val_f1})
#         mlflow.sklearn.log_model(best_model, model_name)

#         # Print the evaluation results
#         print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
#         print(f"Validation Accuracy: {val_accuracy:.4f}, F1 Score: {val_f1:.4f}")
#         print(classification_report(y_val_split, y_val_pred))

# # Example hyperparameter grids
# logreg_params = {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'liblinear'], 'max_iter': [100, 200]}
# rf_params = {'n_estimators': [50, 100], 'max_depth': [None, 10], 'min_samples_split': [2, 5]}
# xgb_params = {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 6]}

# # Train models with the new function
# train_and_evaluate_model(LogisticRegression(), logreg_params, "Logistic Regression")
# train_and_evaluate_model(RandomForestClassifier(), rf_params, "Random Forest")
# train_and_evaluate_model(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_params, "XGBoost")


In [37]:
# #Training dataset
# import mlflow
# import mlflow.sklearn
# from sklearn.model_selection import GridSearchCV, train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

# # Define X_train, y_train (use your existing datasets)
# X_train_split = X_train_scaled  # Assuming scaled features
# y_train_split = y_train  # Assuming labels

# # Initialize MLflow
# mlflow.set_experiment("ML Models with Hyperparameter Tuning")

# # Define a function to train and log models with MLflow
# def train_and_evaluate_on_train(model, param_grid, model_name):
#     with mlflow.start_run(run_name=model_name):
#         # GridSearchCV for hyperparameter tuning
#         grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=2)
#         grid_search.fit(X_train_split, y_train_split)

#         # Get the best model from GridSearch
#         best_model = grid_search.best_estimator_

#         # Evaluate on the training dataset
#         y_train_pred = best_model.predict(X_train_split)
#         train_acc = accuracy_score(y_train_split, y_train_pred)
#         train_f1 = f1_score(y_train_split, y_train_pred, average='weighted')
#         train_precision = precision_score(y_train_split, y_train_pred, average='weighted')
#         train_recall = recall_score(y_train_split, y_train_pred, average='weighted')

#         # Log parameters, metrics, and model
#         mlflow.log_params(grid_search.best_params_)
#         mlflow.log_metrics({"train_accuracy": train_acc, "train_f1_score": train_f1,
#                             "train_precision": train_precision, "train_recall": train_recall})
#         mlflow.sklearn.log_model(best_model, model_name)

#         # Print the evaluation results for training
#         print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
#         print(f"Training Accuracy: {train_acc:.4f}, F1 Score: {train_f1:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}")
#         print("Training classification report:")
#         print(classification_report(y_train_split, y_train_pred))

# # Hyperparameter grids for each model
# logreg_params = {
#     'C': [0.1, 1, 10],
#     'solver': ['lbfgs', 'liblinear'],
#     'max_iter': [100, 200]
# }

# rf_params = {
#     'n_estimators': [50, 175, 150],
#     'max_depth': [None, 10, 15],
#     'min_samples_split': [2, 3],
#     'min_samples_leaf': [1, 3]
# }

# xgb_params = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 6, 10]
# }

# # Train and evaluate models on training data only
# train_and_evaluate_on_train(LogisticRegression(), logreg_params, "Logistic Regression")
# train_and_evaluate_on_train(RandomForestClassifier(), rf_params, "Random Forest")
# train_and_evaluate_on_train(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_params, "XGBoost")


In [38]:
# #Testing dataset
# import mlflow
# import mlflow.sklearn
# from sklearn.model_selection import GridSearchCV
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

# # Define X_test and y_test (use your existing testing datasets)
# X_test_split = X_test_scaled  # Assuming scaled test features
# y_test_split = y_test  # Assuming test labels

# # Initialize MLflow
# mlflow.set_experiment("ML Models with Hyperparameter Tuning")

# # Define a function to evaluate models on testing data with MLflow
# def evaluate_on_test(model, param_grid, model_name):
#     with mlflow.start_run(run_name=model_name):
#         # GridSearchCV for hyperparameter tuning
#         grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=2)
#         grid_search.fit(X_train_split, y_train_split)  # Train on the training set

#         # Get the best model from GridSearch
#         best_model = grid_search.best_estimator_

#         # Evaluate on the testing dataset
#         y_test_pred = best_model.predict(X_test_split)
#         test_acc = accuracy_score(y_test_split, y_test_pred)
#         test_f1 = f1_score(y_test_split, y_test_pred, average='weighted')
#         test_precision = precision_score(y_test_split, y_test_pred, average='weighted')
#         test_recall = recall_score(y_test_split, y_test_pred, average='weighted')

#         # Log parameters, metrics, and model
#         mlflow.log_params(grid_search.best_params_)
#         mlflow.log_metrics({"test_accuracy": test_acc, "test_f1_score": test_f1,
#                             "test_precision": test_precision, "test_recall": test_recall})
#         mlflow.sklearn.log_model(best_model, model_name)

#         # Print the evaluation results for testing
#         print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
#         print(f"Test Accuracy: {test_acc:.4f}, F1 Score: {test_f1:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}")
#         print("Test classification report:")
#         print(classification_report(y_test_split, y_test_pred))

# # Hyperparameter grids for each model
# logreg_params = {
#     'C': [0.1, 1, 10],
#     'solver': ['lbfgs', 'liblinear'],
#     'max_iter': [100, 200]
# }

# rf_params = {
#     'n_estimators': [50, 150, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5],
#     'min_samples_leaf': [1, 2]
# }

# xgb_params = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 6, 10]
# }

# # Train models on training set and evaluate on testing data
# evaluate_on_test(LogisticRegression(), logreg_params, "Logistic Regression")
# evaluate_on_test(RandomForestClassifier(), rf_params, "Random Forest")
# evaluate_on_test(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_params, "XGBoost")


In [39]:
# #Training data
# import mlflow
# import mlflow.sklearn
# import pickle
# from sklearn.model_selection import GridSearchCV
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

# # Define X_train, y_train (use your existing datasets)
# X_train_split = X_train_scaled  # Assuming scaled features
# y_train_split = y_train  # Assuming labels

# # Initialize MLflow
# mlflow.set_experiment("ML Models with Hyperparameter Tuning")

# # Define a function to train, evaluate, and save models as pickle files
# def train_evaluate_save_as_pickle(model, param_grid, model_name):
#     with mlflow.start_run(run_name=model_name):
#         # GridSearchCV for hyperparameter tuning
#         grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=2)
#         grid_search.fit(X_train_split, y_train_split)

#         # Get the best model from GridSearch
#         best_model = grid_search.best_estimator_

#         # Evaluate on the training dataset
#         y_train_pred = best_model.predict(X_train_split)
#         train_acc = accuracy_score(y_train_split, y_train_pred)
#         train_f1 = f1_score(y_train_split, y_train_pred, average='weighted')
#         train_precision = precision_score(y_train_split, y_train_pred, average='weighted')
#         train_recall = recall_score(y_train_split, y_train_pred, average='weighted')

#         # Log parameters, metrics, and model with MLflow
#         mlflow.log_params(grid_search.best_params_)
#         mlflow.log_metrics({"train_accuracy": train_acc, "train_f1_score": train_f1,
#                             "train_precision": train_precision, "train_recall": train_recall})
#         mlflow.sklearn.log_model(best_model, model_name)

#         # Save model as a pickle file
#         with open(f"./models/{model_name}_model.pkl", 'wb') as f:
#             pickle.dump(best_model, f)
#         print(f"Model saved as {model_name}_model.pkl")

#         # Print the evaluation results for training
#         print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
#         print(f"Training Accuracy: {train_acc:.4f}, F1 Score: {train_f1:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}")
#         print("Training classification report:")
#         print(classification_report(y_train_split, y_train_pred))

# # Hyperparameter grids for each model
# logreg_params = {
#     'C': [0.1, 1, 10],
#     'solver': ['lbfgs', 'liblinear'],
#     'max_iter': [100, 200]
# }

# rf_params = {
#     'n_estimators': [50, 175, 150],
#     'max_depth': [None, 10, 15],
#     'min_samples_split': [2, 3],
#     'min_samples_leaf': [1, 3]
# }

# xgb_params = {
#     'n_estimators': [50, 100, 200],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 6, 10]
# }

# # Train, evaluate, and save models on training data
# train_evaluate_save_as_pickle(LogisticRegression(), logreg_params, "Logistic_Regression")
# train_evaluate_save_as_pickle(RandomForestClassifier(), rf_params, "Random_Forest")
# train_evaluate_save_as_pickle(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_params, "XGBoost")


In [40]:
pip install hopsworks





[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
required_columns = ['accident', 'is_midnight', 'ratings']
missing_columns = [col for col in required_columns if col not in X_train_split.columns]

if missing_columns:
    print(f"Missing columns: {missing_columns}")
else:
    print("All required columns are present.")

All required columns are present.


In [43]:
print(X_train_split.columns)
print(y_train_split.name)

Index(['route_avg_temp', 'route_avg_wind_speed', 'route_avg_precip',
       'route_avg_humidity', 'route_avg_visibility', 'route_avg_pressure',
       'distance', 'average_hours', 'avg_no_of_vehicles', 'truck_age',
       ...
       'destination_description_Patchy snow possible',
       'destination_description_Sunny',
       'destination_description_Thundery outbreaks possible',
       'destination_description_Torrential rain shower', 'fuel_type_diesel',
       'fuel_type_gas', 'gender_female', 'gender_male',
       'driving_style_conservative', 'driving_style_proactive'],
      dtype='object', length=132)
delay


In [42]:
#Training Data
import mlflow
import mlflow.sklearn
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
import hopsworks
import joblib
import os

# Assuming X_train_split and y_train_split are your feature and label datasets
# Define X_train_split, y_train_split (use your existing datasets)
X_train_split = X_train_scaled  # Assuming scaled features
y_train_split = y_train  # Assuming labels

# Initialize MLflow experiment
mlflow.set_experiment("ML Models with Hyperparameter Tuning")

# Hyperparameter grids for each model
logreg_params = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear'],
    'max_iter': [100, 200]
}

rf_params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 15],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 3]
}

xgb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10]
}

# Function to perform model training, evaluation, and saving
def train_and_log_model(model, param_grid, model_name):
    # Start MLflow run
    with mlflow.start_run(run_name=model_name):
        # GridSearchCV for hyperparameter tuning
        grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=2)
        grid_search.fit(X_train_split, y_train_split)
        
        # Get the best model from GridSearchCV
        best_model = grid_search.best_estimator_
        
        # Evaluate model on the training set
        y_train_pred = best_model.predict(X_train_split)
        train_acc = accuracy_score(y_train_split, y_train_pred)
        train_f1 = f1_score(y_train_split, y_train_pred, average='weighted')
        train_precision = precision_score(y_train_split, y_train_pred, average='weighted')
        train_recall = recall_score(y_train_split, y_train_pred, average='weighted')
        
        # Log parameters, metrics, and the model with MLflow
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metrics({"train_accuracy": train_acc, "train_f1_score": train_f1,
                            "train_precision": train_precision, "train_recall": train_recall})
        mlflow.sklearn.log_model(best_model, model_name)
        
        # Save the model as a pickle file
        pickle_file = f"./models/{model_name}_model.pkl"
        with open(pickle_file, 'wb') as f:
            pickle.dump(best_model, f)
        print(f"Model saved as {model_name}_model.pkl")
        
        # Print evaluation results
        print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
        print(f"Training Accuracy: {train_acc:.4f}, F1 Score: {train_f1:.4f}, Precision: {train_precision:.4f}, Recall: {train_recall:.4f}")
        print("Training classification report:")
        print(classification_report(y_train_split, y_train_pred))
        
        return best_model, {"accuracy": train_acc, "f1_score": train_f1, "precision": train_precision, "recall": train_recall}

# Train models and get the best models and their metrics
best_logistic_model, logistic_metrics = train_and_log_model(LogisticRegression(), logreg_params, "Logistic_Regression")
best_rf_model, rf_metrics = train_and_log_model(RandomForestClassifier(), rf_params, "Random_Forest")
best_xgb_model, xgb_metrics = train_and_log_model(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_params, "XGBoost")

# Save XGBoost model to Hopsworks
# Connect to Hopsworks
project = hopsworks.login(api_key_value="O4IOxWozstKu0BFQ.07C1tbvgVI5C4XNLbLrGH4PS4t0EqBYN00ex8318TNIkl82WwDi3Vh9MidMrCA83")

# Get the Model Registry
mr = project.get_model_registry()

# Directory to save the model locally (for later upload to Hopsworks)
model_dir = "xgboost_model_directory"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Save the XGBoost model locally using joblib
joblib.dump(best_xgb_model, f"{model_dir}/xgboost_model.pkl")

# Upload the model to Hopsworks Model Registry
model_meta = mr.python.create_model(
    name="XGBoost_Model",
    description="Best XGBoost model trained with hyperparameter tuning",
    metrics=xgb_metrics,  # Dictionary of evaluation metrics
    input_example=X_train_split[0:1]  # Optional: Example of input data
)

# Upload the model to the registry
model_meta.save(f"{model_dir}/xgboost_model.pkl")

print(f"Model 'XGBoost_Model' successfully saved to Hopsworks Model Registry.")


Fitting 3 folds for each of 12 candidates, totalling 36 fits





FileNotFoundError: [Errno 2] No such file or directory: './models/Logistic_Regression_model.pkl'

In [None]:
#Testing data
import mlflow
import mlflow.sklearn
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
import hopsworks
import joblib
import os

# Assuming X_test_split and y_test_split are your feature and label datasets
# Define X_test_split, y_test_split (use your existing datasets)
X_test_split = X_test_scaled  # Assuming scaled features
y_test_split = y_test  # Assuming labels

# Initialize MLflow experiment
mlflow.set_experiment("ML Models with Hyperparameter Tuning")

# Hyperparameter grids for each model
logreg_params = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear'],
    'max_iter': [100, 200]
}

rf_params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 15],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 3]
}

xgb_params = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10]
}

# Function to perform model testing, evaluation, and logging
def evaluate_and_log_model(model, param_grid, model_name):
    # Start MLflow run
    with mlflow.start_run(run_name=model_name):
        # GridSearchCV for hyperparameter tuning (already done with training data)
        grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=2)
        grid_search.fit(X_train_split, y_train_split)  # This uses the best parameters from training
        
        
        # Get the best model from GridSearchCV
        best_model = grid_search.best_estimator_
        
        # Evaluate on the testing dataset
        y_test_pred = best_model.predict(X_test_split)
        test_acc = accuracy_score(y_test_split, y_test_pred)
        test_f1 = f1_score(y_test_split, y_test_pred, average='weighted')
        test_precision = precision_score(y_test_split, y_test_pred, average='weighted')
        test_recall = recall_score(y_test_split, y_test_pred, average='weighted')
        
        # Log parameters, metrics, and the model with MLflow
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metrics({
            "test_accuracy": test_acc, 
            "test_f1_score": test_f1,
            "test_precision": test_precision, 
            "test_recall": test_recall
        })
        mlflow.sklearn.log_model(best_model, model_name)
        
        # Save the model as a pickle file
        pickle_file = f"./models/{model_name}_model.pkl"
        with open(pickle_file, 'wb') as f:
            pickle.dump(best_model, f)
        print(f"Model saved as {model_name}_model.pkl")
        
        # Print evaluation results
        print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
        print(f"Test Accuracy: {test_acc:.4f}, F1 Score: {test_f1:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}")
        print("Test classification report:")
        print(classification_report(y_test_split, y_test_pred))
        
        return best_model, {"accuracy": test_acc, "f1_score": test_f1, "precision": test_precision, "recall": test_recall}

# Evaluate models and get the best models and their metrics
best_logistic_model, logistic_metrics = evaluate_and_log_model(LogisticRegression(), logreg_params, "Logistic_Regression")
best_rf_model, rf_metrics = evaluate_and_log_model(RandomForestClassifier(), rf_params, "Random_Forest")
best_xgb_model, xgb_metrics = evaluate_and_log_model(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgb_params, "XGBoost")

# Save XGBoost model to Hopsworks
# Connect to Hopsworks
project = hopsworks.login(api_key_value="O4IOxWozstKu0BFQ.07C1tbvgVI5C4XNLbLrGH4PS4t0EqBYN00ex8318TNIkl82WwDi3Vh9MidMrCA83")

# Get the Model Registry
mr = project.get_model_registry()

# Directory to save the model locally (for later upload to Hopsworks)
model_dir = "xgboost_model_directory"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Save the XGBoost model locally using joblib
joblib.dump(best_xgb_model, f"{model_dir}/xgboost_model.pkl")

# Upload the model to Hopsworks Model Registry
model_meta = mr.python.create_model(
    name="XGBoost_Model_Test",
    description="Best XGBoost model evaluated on test data",
    metrics=xgb_metrics,  # Dictionary of evaluation metrics from the test data
    input_example=X_test_split[0:1]  # Optional: Example of input data
)

# Upload the model to the registry
model_meta.save(f"{model_dir}/xgboost_model.pkl")

print(f"Model 'XGBoost_Model_Test' successfully saved to Hopsworks Model Registry.")


NameError: name 'X_test_scaled' is not defined