# Fleet Analytics & Prediction System

Authors: Christopher F. Ogbechie, Lukman Ibrahim, and Mathew Asare
        
Course: ANLT202  

## Project Objectives
This notebook prepares the dataset for:
- Trip delay risk classification
- Maintenance cost regression modeling

## Dataset
Source: Kaggle – Fleet Dataset  
File: fleet_dummy_5000.csv

## Classification Task
Predict whether a trip is operationally risky or problematic.

We define a trip as High Risk (1) if its status is "Delayed" (compared case-insensitively).
Every other status is Low Risk (0). Other problem statuses (e.g., "Cancelled", "Failed")
are not currently included in the High Risk label.

This helps the fleet manager identify trips likely to cause service issues.

## Regression Task
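Predict the maintenance_cost of a vehicle/trip. Candidate predictors are:
- distance_km
- vehicle age / type
- driving behaviour (violations, speeding incidents)
- fuel_cost, toll_cost, load_value
- weather_condition and route information

Note: the models in this notebook use only the columns listed under
"Define Features and Targets"; vehicle age/type and weather_condition are not
part of that feature list.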

## Load Dataset

In [None]:
import pandas as pd

# Location of the fleet CSV, relative to the notebook's working directory.
DATA_PATH = "../data/fleet_dummy_5000.csv"

# Read the whole fleet dataset into a DataFrame.
df = pd.read_csv(DATA_PATH)

# Show the first five rows as a quick sanity check of the load.
df.head(5)

In [None]:
# Tally how many trips are recorded under each distinct status value.
status_counts = df["status"].value_counts()
status_counts

## Define Classification Label

In [None]:
# Lower-case the status text so the comparison below ignores capitalisation.
df["status_lower"] = df["status"].str.lower()

# A trip is high risk (1) only when its status is "delayed"; every other
# status is labelled 0.
is_delayed = df["status_lower"].eq("delayed")
df["high_risk"] = is_delayed.astype(int)

# Inspect the resulting class balance.
df["high_risk"].value_counts()

## Define Features and Targets

In [None]:
# Parse the pickup timestamps so calendar parts can be extracted from them.
pickup_ts = pd.to_datetime(df["pickup_time"])
df["pickup_time"] = pickup_ts

# Derive two time-based features from the parsed timestamp.
df["pickup_hour"] = pickup_ts.dt.hour
df["pickup_dayofweek"] = pickup_ts.dt.dayofweek

# Columns used as model inputs (shared by the classification and regression tasks).
features = [
    # trip size and cost
    "distance_km", "fuel_cost", "driver_pay", "toll_cost", "load_value",
    # driving behaviour
    "violation_count", "speeding_incidents",
    # route endpoints
    "gps_start_lat", "gps_start_lon", "gps_end_lat", "gps_end_lon",
    # derived pickup-time features
    "pickup_hour", "pickup_dayofweek",
]

# Feature matrix restricted to the selected columns.
X = df[features]

# Targets: binary label for classification, maintenance cost for regression.
y_class = df["high_risk"]
y_reg = df["maintenance_cost"]

# Preview the head of the features and of both targets together.
previews = tuple(part.head() for part in (X, y_class, y_reg))
previews

## Check Dataset Shapes and Missing Values


In [None]:
# Report the dimensions of the feature matrix and of each target.
for part_label, part in (("X", X), ("y_class", y_class), ("y_reg", y_reg)):
    print(f"{part_label} shape:", part.shape)

# Count missing values per column across the model inputs and both targets.
checked_columns = features + ["maintenance_cost", "high_risk"]
df[checked_columns].isna().sum()

## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Both tasks hold out 20% of the rows and use the same seed.
split_options = {"test_size": 0.2, "random_state": 42}

# Classification split (high-risk label). Stratifying on y_class keeps the
# class proportions the same in the train and test parts.
classification_split = train_test_split(
    X, y_class, stratify=y_class, **split_options
)
X_train_c, X_test_c, y_train_c, y_test_c = classification_split

# Regression split (maintenance cost); no stratification for a continuous target.
regression_split = train_test_split(X, y_reg, **split_options)
X_train_r, X_test_r, y_train_r, y_test_r = regression_split

# Print the resulting shapes to confirm the splits.
print("Classification Train:", X_train_c.shape, "Test:", X_test_c.shape)
print("Regression Train:", X_train_r.shape, "Test:", X_test_r.shape)

## List Numeric Feature Names

In [None]:
# Collect the column names of X as a plain list.
# NOTE(review): this takes every column of X and assumes all of them are
# numeric — confirm against the dtypes shown by df.info().
numeric_features = list(X.columns)
numeric_features

## Scaling & Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Single-step preprocessing pipeline that standardises the numeric features.
numeric_steps = [("scaler", StandardScaler())]
preprocess_numeric = Pipeline(steps=numeric_steps)

## Data Inspection

In [None]:
# Print the DataFrame summary produced by DataFrame.info().
df.info() # Display basic information about the dataset

In [None]:
# Show the descriptive statistics table returned by DataFrame.describe().
df.describe() # View summary statistics for numeric columns

## Classification Models & Results

## Building the Logistic Regression Classification Model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Baseline classifier: standardise the inputs, then fit a logistic regression
# (solver capped at 1000 iterations).
logreg_steps = [
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000)),
]
clf_logreg = Pipeline(steps=logreg_steps)

# Fit on the classification training split only.
clf_logreg.fit(X_train_c, y_train_c)

# Predict labels for the held-out classification test split.
y_pred_logreg = clf_logreg.predict(X_test_c)

# Report the per-class metrics, then the raw confusion matrix.
logreg_report = classification_report(y_test_c, y_pred_logreg)
logreg_confusion = confusion_matrix(y_test_c, y_pred_logreg)
print("Logistic Regression Results:\n")
print(logreg_report)
print("Confusion Matrix:")
print(logreg_confusion)

## Building the Random Forest Classification Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest baseline with 100 trees and a fixed seed. No scaler step is
# included in this pipeline.
forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
clf_rf = Pipeline(steps=[("model", forest_classifier)])

# Fit on the classification training split.
clf_rf.fit(X_train_c, y_train_c)

# Predict labels for the classification test split.
y_pred_rf = clf_rf.predict(X_test_c)

# Report the per-class metrics, then the raw confusion matrix.
rf_report = classification_report(y_test_c, y_pred_rf)
rf_confusion = confusion_matrix(y_test_c, y_pred_rf)
print("=== Random Forest Results ===\n")
print(rf_report)
print("Confusion Matrix:")
print(rf_confusion)

## Balanced Logistic Regression Model (Handling Class Imbalance)

In [None]:
# Same scaled logistic regression as the baseline, but with
# class_weight="balanced" to counter the class imbalance.
balanced_logreg = LogisticRegression(max_iter=1000, class_weight="balanced")
clf_logreg_bal = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", balanced_logreg),
    ]
)

# Fit on the classification training split.
clf_logreg_bal.fit(X_train_c, y_train_c)

# Predict labels for the classification test split.
y_pred_logreg_bal = clf_logreg_bal.predict(X_test_c)

# Report the per-class metrics, then the confusion matrix.
logreg_bal_report = classification_report(y_test_c, y_pred_logreg_bal)
logreg_bal_confusion = confusion_matrix(y_test_c, y_pred_logreg_bal)
print("=== Balanced Logistic Regression Results ===\n")
print(logreg_bal_report)
print(logreg_bal_confusion)

## Balanced Random Forest Model

In [None]:
# Random Forest with class weights to better handle imbalanced classes
clf_rf_bal = Pipeline(
    steps=[
        ("model", RandomForestClassifier(
            n_estimators=200,
            random_state=42,
            class_weight="balanced"
        ))
    ]
)
# Train the balanced Random Forest model
clf_rf_bal.fit(X_train_c, y_train_c)

# Make predictions on the test set
y_pred_rf_bal = clf_rf_bal.predict(X_test_c)

# Show evaluation results
print("=== Balanced Random Forest Results ===\n")
print(classification_report(y_test_c, y_pred_rf_bal))
print(confusion_matrix(y_test_c, y_pred_rf_bal))

##  Regression Models & Results

## Linear Regression Model (Maintenance Cost Prediction)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Standardise the inputs, then fit an ordinary linear regression on
# maintenance cost.
linear_steps = [
    ("scaler", StandardScaler()),
    ("model", LinearRegression()),
]
reg_lin = Pipeline(linear_steps)

# Fit on the regression training split.
reg_lin.fit(X_train_r, y_train_r)

# Predict maintenance cost for the regression test split.
y_pred_lin = reg_lin.predict(X_test_r)

# Error metrics on the test split; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test_r, y_pred_lin)
mse_lin = mean_squared_error(y_test_r, y_pred_lin)
rmse = np.sqrt(mse_lin)
r2 = r2_score(y_test_r, y_pred_lin)

# Print the three metrics under a header line.
print("=== Linear Regression Results ===")
for metric_label, metric_value in (("MAE:", mae), ("RMSE:", rmse), ("R²:", r2)):
    print(metric_label, metric_value)

## Random Forest Regressor Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest regressor with 200 trees and a fixed seed, wrapped in a
# one-step pipeline.
forest_regressor = RandomForestRegressor(n_estimators=200, random_state=42)
reg_rf = Pipeline([("model", forest_regressor)])

# Fit on the regression training split.
reg_rf.fit(X_train_r, y_train_r)

# Predict maintenance cost for the regression test split.
# NOTE: this rebinds y_pred_rf, which previously held the Random Forest
# classifier's predictions.
y_pred_rf = reg_rf.predict(X_test_r)

# Error metrics on the test split; RMSE is the square root of the MSE.
mae_rf = mean_absolute_error(y_test_r, y_pred_rf)
mse_rf = mean_squared_error(y_test_r, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test_r, y_pred_rf)

# Print the three metrics under a header line.
print("=== Random Forest Regressor Results ===")
for metric_label, metric_value in (("MAE:", mae_rf), ("RMSE:", rmse_rf), ("R²:", r2_rf)):
    print(metric_label, metric_value)

## Feature Selection for Regression

In [None]:
from sklearn.model_selection import train_test_split

# Columns used as inputs for the regression task. This is the same set of
# 13 columns as the `features` list defined earlier in the notebook.
reg_feature_cols = [
    # trip size and cost
    "distance_km", "fuel_cost", "driver_pay", "toll_cost", "load_value",
    # driving behaviour
    "violation_count", "speeding_incidents",
    # route endpoints
    "gps_start_lat", "gps_start_lon", "gps_end_lat", "gps_end_lon",
    # derived pickup-time features
    "pickup_hour", "pickup_dayofweek",
]

# Regression feature matrix restricted to the selected columns.
X_reg = df[reg_feature_cols]

# Redo the 80/20 regression split on X_reg, using the same seed as before.
regression_split = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
X_train_r, X_test_r, y_train_r, y_test_r = regression_split

# Print the shape of the regression feature matrix as a sanity check.
print(X_reg.shape)

## Linear Regression Model (Maintenance Cost Prediction)

In [None]:
# Rebuild the scaled linear regression and fit it on the re-created
# regression split.
linear_steps = [
    ("scaler", StandardScaler()),
    ("model", LinearRegression()),
]
reg_lin = Pipeline(linear_steps)
reg_lin.fit(X_train_r, y_train_r)

# Predict maintenance cost for the regression test split.
y_pred_lin = reg_lin.predict(X_test_r)

# Error metrics on the test split; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test_r, y_pred_lin)
mse_lin = mean_squared_error(y_test_r, y_pred_lin)
rmse = np.sqrt(mse_lin)
r2 = r2_score(y_test_r, y_pred_lin)

# Print the three metrics under a header line.
print("=== Fixed Linear Regression Results ===")
for metric_label, metric_value in (("MAE:", mae), ("RMSE:", rmse), ("R²:", r2)):
    print(metric_label, metric_value)

## Random Forest Regressor Model

In [None]:
# Rebuild the Random Forest regressor (200 trees, fixed seed) and fit it on
# the re-created regression split.
forest_regressor = RandomForestRegressor(n_estimators=200, random_state=42)
reg_rf = Pipeline([("model", forest_regressor)])
reg_rf.fit(X_train_r, y_train_r)

# Predict maintenance cost for the regression test split.
# NOTE: y_pred_rf is reused here for regression predictions.
y_pred_rf = reg_rf.predict(X_test_r)

# Error metrics on the test split; RMSE is the square root of the MSE.
mae_rf = mean_absolute_error(y_test_r, y_pred_rf)
mse_rf = mean_squared_error(y_test_r, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test_r, y_pred_rf)

# Print the three metrics under a header line.
print("=== Fixed Random Forest Results ===")
for metric_label, metric_value in (("MAE:", mae_rf), ("RMSE:", rmse_rf), ("R²:", r2_rf)):
    print(metric_label, metric_value)

## Hyperparameter Tuning for Logistic Regression

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings for the pipeline's "model" step: four regularisation
# strengths (C) crossed with two solvers.
param_grid_logreg = {
    "model__C": [0.01, 0.1, 1, 10],
    "model__solver": ["lbfgs", "liblinear"],
}

# Exhaustive search over the grid on the balanced logistic regression
# pipeline. Candidates are ranked by recall, because catching delayed trips
# is the priority, using 5-fold cross validation on all CPU cores.
grid_logreg = GridSearchCV(
    estimator=clf_logreg_bal,
    param_grid=param_grid_logreg,
    scoring="recall",
    cv=5,
    n_jobs=-1,
)

# Run the search on the classification training split.
grid_logreg.fit(X_train_c, y_train_c)

# Report the winning settings and their cross-validated recall.
print("Best Logistic Regression Parameters:", grid_logreg.best_params_)
print("Best Cross-Validated Recall:", grid_logreg.best_score_)

## Hyperparameter Tuning for Random Forest Regressor

In [None]:
# Settings we want GridSearchCV to test for the Random Forest model
param_grid_rf = {
    "model__n_estimators": [50, 100, 200],  # Number of trees
    "model__max_depth": [None, 10, 20],     # How deep each tree can grow
}
# GridSearchCV tries different parameter combinations to find the best model
grid_rf = GridSearchCV(
    reg_rf,     # Our Random Forest pipeline
    param_grid_rf,  # Parameter grid to search
    scoring="neg_mean_absolute_error",  # We want the lowest MAE
    cv=5,                       # 5-fold cross validation
    n_jobs=-1       # Use all CPU cores
)
# Run the tuning process
grid_rf.fit(X_train_r, y_train_r)

# Print the best parameters and the best MAE score found
print("Best Random Forest Parameters:", grid_rf.best_params_)
print("Best CV MAE:", -grid_rf.best_score_) # Convert negative MAE back to positive

## Data Scaling for Deep Learning

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the inputs for the neural network. The scaler is fitted on the
# training split only, so no test-set statistics leak into training.
scaler_dl = StandardScaler()
scaler_dl.fit(X_train_c)

# Apply the training-set scaling to both splits.
X_train_dl = scaler_dl.transform(X_train_c)
X_test_dl = scaler_dl.transform(X_test_c)

## Building the Artificial Neural Network (ANN)

In [None]:
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Small feed-forward network for the binary high-risk classification.
# The input width is declared with an explicit Input layer instead of the
# deprecated `input_shape=` argument on the first Dense layer.
n_inputs = X_train_dl.shape[1]
model = Sequential(
    [
        Input(shape=(n_inputs,)),
        # First hidden layer with 32 neurons
        Dense(32, activation="relu"),
        # Second hidden layer with 16 neurons
        Dense(16, activation="relu"),
        # Single sigmoid output for the binary label
        Dense(1, activation="sigmoid"),
    ]
)

# Compile with the Adam optimizer and binary cross-entropy loss; track accuracy.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Show the model structure
model.summary()

## Training the ANN Model

In [None]:
# Train the neural network model
history = model.fit(
    # Training data
    X_train_dl, y_train_c,
    # Validation during training
    validation_data=(X_test_dl, y_test_c),
    # Number of passes through the data
    epochs=30,
    # Samples per training batch
    batch_size=32,
    # Show training progress
    verbose=1
)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Get the ANN's predicted probabilities for the scaled test split, then turn
# them into 0/1 labels at a 0.5 threshold, flattened to one dimension.
predicted_probabilities = model.predict(X_test_dl)
y_pred_dl = (predicted_probabilities > 0.5).astype(int).flatten()

# Report the per-class metrics, then the confusion matrix.
dl_report = classification_report(y_test_c, y_pred_dl)
dl_confusion = confusion_matrix(y_test_c, y_pred_dl)
print("=== Deep Learning Classification Results ===")
print(dl_report)
print(dl_confusion)

## Computing Class Weights for the ANN

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Distinct labels in the classification training target
# (0 = on-time, 1 = delayed).
classes = np.unique(y_train_c)

# "balanced" weights, so that both classes are treated fairly during training.
class_weights = compute_class_weight(
    class_weight="balanced", classes=classes, y=y_train_c
)

# Keras expects a {label: weight} dictionary; the labels are cast to plain ints.
class_weight_dict = dict(zip(map(int, classes), class_weights))

# Show the class weights
class_weight_dict

## Training the ANN Model with Class Weights

In [None]:
# Train the ANN using class weights to handle imbalance.
from tensorflow.keras.models import clone_model
from tensorflow.keras.optimizers import Adam

# `model` was already fitted for 30 epochs without class weights earlier in
# the notebook. Calling fit() on it again would continue from those trained
# weights, so the result would not show what class weighting does on its own.
# clone_model() rebuilds the same architecture with newly initialised weights.
# A clone is not compiled, so the original compile settings are applied again.
model = clone_model(model)
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

history = model.fit(
    # Training data
    X_train_dl, y_train_c,
    # Validation set (the test split is used for monitoring)
    validation_data=(X_test_dl, y_test_c),
    # Number of training cycles
    epochs=30,
    # Size of each training batch
    batch_size=32,
    # Apply the balanced class weights to the loss
    class_weight=class_weight_dict,
    # Show training progress
    verbose=1,
)

## Train–Test Split Confirmation (Classification)

In [None]:
# Re-print the shapes of the classification train and test feature matrices
# as a final confirmation of the split sizes.
print("Classification Train:", X_train_c.shape, "Test:", X_test_c.shape)