# Fleet Analytics & Prediction System

Author: Christopher F. Ogbechie  
Course: ANLT202  

## Project Objectives
This notebook prepares the dataset for:
- Trip delay risk classification
- Maintenance cost regression modeling

## Dataset
Source: Kaggle – Fleet Dataset  
File: fleet_dummy_5000.csv

## Classification Task
Predict whether a trip is operationally risky or problematic.

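We define a trip as High Risk (1) if its status is "Delayed" (compared
case-insensitively); every other status is Low Risk (0). Other problem
statuses such as "Cancelled" or "Failed" are not counted as High Risk by the
labelling code in this notebook.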

This helps the fleet manager identify trips likely to cause service issues.

## Regression Task
Predict the maintenance_cost of a vehicle/trip based on:
- distance_km
- vehicle age / type
- driving behaviour (violations, speeding incidents)
- fuel_cost, toll_cost, load_value
- weather_condition and route information

## Load Dataset

In [None]:
import pandas as pd

# Read the fleet CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/fleet_dummy_5000.csv")

# Preview the first five rows.
df.head(n=5)

In [None]:
# Count how many rows carry each distinct value of the `status` column;
# these counts inform how the classification label is defined below.
df["status"].value_counts()

## Define Classification Label

In [None]:
# Keep a lower-cased copy of the status text so the comparison below
# is case-insensitive.
df["status_lower"] = df["status"].str.lower()

# Binary target: 1 when the normalized status is exactly "delayed", else 0.
df["high_risk"] = df["status_lower"].eq("delayed").astype(int)

# Show the class balance of the new label.
df["high_risk"].value_counts()

## Define Features and Targets

In [None]:
# Parse the pickup timestamps so calendar components can be extracted.
df["pickup_time"] = pd.to_datetime(df["pickup_time"])

# Derive hour-of-day and day-of-week predictors from the timestamp
# (`.dt.weekday` is an alias of `.dt.dayofweek`).
df["pickup_hour"] = df["pickup_time"].dt.hour
df["pickup_dayofweek"] = df["pickup_time"].dt.weekday

# Predictor columns shared by the classification and regression tasks.
# Column order is significant: it fixes the column order of X.
features = [
    # trip size and costs
    "distance_km", "fuel_cost", "driver_pay", "toll_cost", "load_value",
    # driver behaviour
    "violation_count", "speeding_incidents",
    # start / end coordinates
    "gps_start_lat", "gps_start_lon", "gps_end_lat", "gps_end_lon",
    # time features derived above
    "pickup_hour", "pickup_dayofweek",
]

# Feature matrix restricted to the predictor columns.
X = df.loc[:, features]

# Targets: binary risk label (classification) and maintenance cost (regression).
y_class, y_reg = df["high_risk"], df["maintenance_cost"]

# Preview features and both targets together.
(X.head(), y_class.head(), y_reg.head())

In [None]:
# Sanity check: the feature matrix and both targets should have the same
# number of rows.
print("X shape:", X.shape)
print("y_class shape:", y_class.shape)
print("y_reg shape:", y_reg.shape)

# Missing-value count per modelling column (predictors plus both targets).
df[[*features, "maintenance_cost", "high_risk"]].isnull().sum()

## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for each task; random_state pins the shuffle.
# The classification split is stratified on the label so that train and
# test keep the label's class proportions.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X,
    y_class,
    stratify=y_class,
    random_state=42,
    test_size=0.2,
)

# The regression split is a plain (unstratified) shuffle split.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X,
    y_reg,
    random_state=42,
    test_size=0.2,
)

print("Classification Train:", X_train_c.shape, "Test:", X_test_c.shape)
print("Regression Train:", X_train_r.shape, "Test:", X_test_r.shape)

In [None]:
# Every column of X is treated as a numeric feature; keep their names as a
# plain Python list and display it.
numeric_features = list(X.columns)
numeric_features

## Scaling & Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# One-step preprocessing pipeline that standardizes the numeric features.
# NOTE(review): the modelling cells visible in this notebook build their own
# StandardScaler steps and do not reference `preprocess_numeric` -- confirm
# whether it is still needed.
preprocess_numeric = Pipeline([("scaler", StandardScaler())])

## Data Inspection

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Display descriptive statistics for the DataFrame's columns.
df.describe()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Baseline classifier: standardize the features, then fit logistic regression.
clf_logreg = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# Pipeline.fit returns the fitted pipeline, so training and test-set
# prediction are chained in one expression.
y_pred_logreg = clf_logreg.fit(X_train_c, y_train_c).predict(X_test_c)

# Per-class precision/recall/F1 followed by the confusion matrix.
print("Logistic Regression Results:\n")
print(classification_report(y_test_c, y_pred_logreg))
print("Confusion Matrix:")
print(confusion_matrix(y_test_c, y_pred_logreg))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest classifier wrapped in a single-step pipeline.
# No scaler step: tree-based models do not strictly need feature scaling.
clf_rf = Pipeline(
    [("model", RandomForestClassifier(n_estimators=100, random_state=42))]
)

# Fit on the classification training split and predict the held-out test split.
y_pred_rf = clf_rf.fit(X_train_c, y_train_c).predict(X_test_c)

# Per-class metrics followed by the confusion matrix.
print("=== Random Forest Results ===\n")
print(classification_report(y_test_c, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test_c, y_pred_rf))

In [None]:
# Same scaler + logistic-regression pipeline as the baseline, but with
# class_weight="balanced" so the classes are weighted inversely to their
# frequency during training.
clf_logreg_bal = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(class_weight="balanced", max_iter=1000)),
    ]
)

# Train, then predict the held-out classification test split.
y_pred_logreg_bal = clf_logreg_bal.fit(X_train_c, y_train_c).predict(X_test_c)

print("=== Balanced Logistic Regression Results ===\n")
print(classification_report(y_test_c, y_pred_logreg_bal))
print(confusion_matrix(y_test_c, y_pred_logreg_bal))

In [None]:
# Class-weighted Random Forest: 200 trees with class_weight="balanced".
clf_rf_bal = Pipeline(
    [
        (
            "model",
            RandomForestClassifier(
                class_weight="balanced",
                n_estimators=200,
                random_state=42,
            ),
        )
    ]
)

# Train, then predict the held-out classification test split.
y_pred_rf_bal = clf_rf_bal.fit(X_train_c, y_train_c).predict(X_test_c)

print("=== Balanced Random Forest Results ===\n")
print(classification_report(y_test_c, y_pred_rf_bal))
print(confusion_matrix(y_test_c, y_pred_rf_bal))

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Baseline regressor for maintenance_cost: standardize, then ordinary
# least-squares linear regression.
reg_lin = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", LinearRegression()),
    ]
)

# Fit on the regression training split and predict the test split.
y_pred_lin = reg_lin.fit(X_train_r, y_train_r).predict(X_test_r)

# Error metrics on the test split; RMSE is the square root of the MSE.
mae, rmse, r2 = (
    mean_absolute_error(y_test_r, y_pred_lin),
    np.sqrt(mean_squared_error(y_test_r, y_pred_lin)),
    r2_score(y_test_r, y_pred_lin),
)

print("=== Linear Regression Results ===")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest regressor for maintenance_cost (200 trees, fixed seed).
reg_rf = Pipeline(
    steps=[("model", RandomForestRegressor(n_estimators=200, random_state=42))]
)

# NOTE: `y_pred_rf` was previously assigned the Random Forest *classifier's*
# test predictions; this assignment rebinds it to the regressor's predictions.
y_pred_rf = reg_rf.fit(X_train_r, y_train_r).predict(X_test_r)

# Error metrics on the regression test split.
mae_rf, rmse_rf, r2_rf = (
    mean_absolute_error(y_test_r, y_pred_rf),
    np.sqrt(mean_squared_error(y_test_r, y_pred_rf)),
    r2_score(y_test_r, y_pred_rf),
)

print("=== Random Forest Regressor Results ===")
print("MAE:", mae_rf)
print("RMSE:", rmse_rf)
print("R²:", r2_rf)

In [None]:
from sklearn.model_selection import train_test_split

# Explicit predictor list for the regression task.
# NOTE(review): this list contains the same columns, in the same order, as
# `features`, so X_reg matches X and (with the same random_state/test_size)
# the split below should reproduce the earlier regression split -- confirm
# whether a different column set was intended.
reg_feature_cols = [
    # trip size and costs
    "distance_km", "fuel_cost", "driver_pay", "toll_cost", "load_value",
    # driver behaviour
    "violation_count", "speeding_incidents",
    # start / end coordinates
    "gps_start_lat", "gps_start_lon", "gps_end_lat", "gps_end_lon",
    # time features
    "pickup_hour", "pickup_dayofweek",
]

# Regression feature matrix built from the explicit column list.
X_reg = df.loc[:, reg_feature_cols]

# Re-create the 80/20 regression split (rebinds the *_r variables).
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reg,
    y_reg,
    random_state=42,
    test_size=0.2,
)

print(X_reg.shape)

In [None]:
# Re-create and re-fit the scaler + linear-regression pipeline on the rebuilt
# regression split (rebinds reg_lin, y_pred_lin and the metric variables).
reg_lin = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", LinearRegression()),
    ]
)

y_pred_lin = reg_lin.fit(X_train_r, y_train_r).predict(X_test_r)

# Error metrics on the test split; RMSE is the square root of the MSE.
mae, rmse, r2 = (
    mean_absolute_error(y_test_r, y_pred_lin),
    np.sqrt(mean_squared_error(y_test_r, y_pred_lin)),
    r2_score(y_test_r, y_pred_lin),
)

print("=== Fixed Linear Regression Results ===")
print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

In [None]:
# Re-create and re-fit the Random Forest regressor on the rebuilt regression
# split (rebinds reg_rf, y_pred_rf and the *_rf metric variables).
reg_rf = Pipeline(
    steps=[("model", RandomForestRegressor(n_estimators=200, random_state=42))]
)

y_pred_rf = reg_rf.fit(X_train_r, y_train_r).predict(X_test_r)

# Error metrics on the regression test split.
mae_rf, rmse_rf, r2_rf = (
    mean_absolute_error(y_test_r, y_pred_rf),
    np.sqrt(mean_squared_error(y_test_r, y_pred_rf)),
    r2_score(y_test_r, y_pred_rf),
)

print("=== Fixed Random Forest Results ===")
print("MAE:", mae_rf)
print("RMSE:", rmse_rf)
print("R²:", r2_rf)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the pipeline's "model" step: regularization
# strength C crossed with two solvers (4 x 2 = 8 candidates).
param_grid_logreg = {
    "model__C": [0.01, 0.1, 1, 10],
    "model__solver": ["lbfgs", "liblinear"],
}

# 5-fold cross-validated search over the balanced logistic-regression
# pipeline, selecting by recall and using all available cores.
grid_logreg = GridSearchCV(
    estimator=clf_logreg_bal,
    param_grid=param_grid_logreg,
    scoring="recall",
    cv=5,
    n_jobs=-1,
)
grid_logreg.fit(X_train_c, y_train_c)

print("Best Logistic Regression Parameters:", grid_logreg.best_params_)
print("Best Cross-Validated Recall:", grid_logreg.best_score_)

In [None]:
# Hyper-parameter grid for the Random Forest *regressor* pipeline:
# forest size crossed with maximum tree depth (3 x 3 = 9 candidates).
param_grid_rf = {
    "model__n_estimators": [50, 100, 200],
    "model__max_depth": [None, 10, 20],
}

# 5-fold cross-validated search scored by negated MAE (scikit-learn scorers
# are maximized, so error metrics are exposed with a flipped sign).
grid_rf = GridSearchCV(
    estimator=reg_rf,
    param_grid=param_grid_rf,
    scoring="neg_mean_absolute_error",
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train_r, y_train_r)

print("Best Random Forest Parameters:", grid_rf.best_params_)
# Flip the sign back so the reported MAE is positive.
print("Best CV MAE:", -grid_rf.best_score_)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize inputs for the neural network: the scaler's statistics are
# learned from the training split only and then applied to both splits.
scaler_dl = StandardScaler().fit(X_train_c)
X_train_dl = scaler_dl.transform(X_train_c)
X_test_dl = scaler_dl.transform(X_test_c)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Small fully connected binary classifier: two ReLU hidden layers (32 and 16
# units) feeding a single sigmoid output unit. The input width is taken from
# the number of columns in the scaled training matrix.
model = Sequential(
    [
        Dense(32, activation="relu", input_shape=(X_train_dl.shape[1],)),
        Dense(16, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)

# Binary cross-entropy loss with Adam; accuracy is tracked during training.
model.compile(
    loss="binary_crossentropy",
    optimizer=Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

model.summary()

In [None]:
# Train for 30 epochs with mini-batches of 32 and keep the per-epoch history.
# NOTE(review): the test split is passed as validation data, so the reported
# val_* metrics are computed on the same rows used for the final evaluation.
history = model.fit(
    x=X_train_dl,
    y=y_train_c,
    batch_size=32,
    epochs=30,
    verbose=1,
    validation_data=(X_test_dl, y_test_c),
)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels, then collapse
# the prediction array to one dimension.
y_pred_dl = (model.predict(X_test_dl) > 0.5).astype(int).ravel()

print("=== Deep Learning Classification Results ===")
print(classification_report(y_test_c, y_pred_dl))
print(confusion_matrix(y_test_c, y_pred_dl))

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Distinct labels present in the training target.
classes = np.unique(y_train_c)

# "balanced" weights: one weight per label, aligned with `classes`.
class_weights = compute_class_weight(
    class_weight="balanced", classes=classes, y=y_train_c
)

# Map each label (cast to a plain int) to its weight, then display the dict.
class_weight_dict = dict(zip(map(int, classes), class_weights))
class_weight_dict

In [None]:
from tensorflow.keras.models import clone_model

# Start the class-weighted experiment from freshly initialised weights.
# Calling fit() again on the already-trained `model` would continue from the
# 30 unweighted epochs above, so the weighted and unweighted runs would not be
# comparable (and re-running this cell would keep accumulating epochs).
# clone_model() rebuilds the same architecture with new layers and new weights;
# a cloned model must be compiled again before training.
model = clone_model(model)
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# Same training schedule as the unweighted run, plus per-class loss weights
# from `class_weight_dict`.
history = model.fit(
    X_train_dl, y_train_c,
    validation_data=(X_test_dl, y_test_c),
    epochs=30,
    batch_size=32,
    class_weight=class_weight_dict,
    verbose=1
)

In [None]:
# Re-print the shapes of the classification train/test feature matrices.
print("Classification Train:", X_train_c.shape, "Test:", X_test_c.shape)