<a href="https://colab.research.google.com/github/VarunSKumar94/Data-science-and-AI-ML/blob/main/randomForestBrakeApplication/Braking_Prediction_Model_(Random_Forest_with_GridSearchCV).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# braking_prediction_model.ipynb

# --- 1. Setup and Imports ---
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from collections import Counter
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    recall_score,
    precision_score,
    f1_score,
    make_scorer
)

# --- 2. Synthetic Data Generation ---
# This simulates vehicle telematics data for demonstration purposes.
# In a real scenario, you would load your actual data here.

print("--- Generating Synthetic Data ---")
np.random.seed(42)

# Number of vehicles and trips
num_vins = 5
num_trips_per_vin = 3
records_per_trip = 500 # Number of data points per trip

all_data = []
for i in range(num_vins):
    vin = f'V{i+1:03d}'
    for j in range(num_trips_per_vin):
        trip_id = j + 1
        # np.random.randint excludes its upper bound, so 24 / 60 are needed
        # to make hour 23 and minute 59 reachable.
        start_time = datetime(2023, 1, 1, int(np.random.randint(0, 24)), int(np.random.randint(0, 60)))

        elapsed_seconds = 0.0  # Time since the start of this trip
        prev_speed = None      # Speed of the previous record within this trip
        for k in range(records_per_trip):
            current_time = start_time + timedelta(seconds=elapsed_seconds)
            # Advance the clock by one jittered sampling interval (0.9-1.1 s).
            # Accumulating the intervals keeps timestamps strictly increasing;
            # scaling the jitter by k would let later records overtake each other.
            elapsed_seconds += np.random.uniform(0.9, 1.1)

            # Simulate vehicle speed (km/h)
            speed = max(0, 60 + 15 * np.sin(k / 50) + np.random.randn() * 10)

            # Simulate brake events (imbalanced).
            # Only 10% of records pass the base gate, and a gated record is
            # flagged with probability 0.7 (sharp deceleration) or 0.05
            # (otherwise), so positives are at most 7% of all records.
            brake_pressed = 0
            if np.random.rand() < 0.1: # Base probability
                # Sharp deceleration: current speed more than 10% below the previous record
                if prev_speed is not None and speed < prev_speed * 0.9:
                    if np.random.rand() < 0.7:
                        brake_pressed = 1
                elif np.random.rand() < 0.05: # Small chance even without sharp deceleration
                    brake_pressed = 1

            all_data.append([vin, trip_id, current_time, speed, brake_pressed])
            prev_speed = speed

df = pd.DataFrame(all_data, columns=['VIN', 'Trips', 'GPS_TimeStamp_Local', 'VhclSpd', 'Brake_Pressed'])
print(f"Synthetic data generated: {df.shape[0]} rows.")
print("Sample of raw data:")
print(df.head())

# --- 3. Feature Engineering ---
# Implements the feature engineering steps with in-memory pandas operations.
# The data is assumed to fit in memory for this script; for large datasets
# the same steps would be adapted to PySpark.

print("\n--- Starting Feature Engineering ---")

# Window sizes for feature generation: how many past (lag) and future (lead)
# samples of speed and of acceleration are attached to every record.
max_spd_lags, max_spd_leads = 12, 5
max_acc_lags, max_acc_leads = 2, 3

def apply_all_feature_engineering(df_group, spd_col, time_col, max_spd_lags, max_acc_lags, max_spd_leads, max_acc_leads):
    """
    Build acceleration and lag/lead features for a single group (e.g. one trip).

    The group is sorted by ``time_col`` on a copy, so the caller's frame is
    left untouched. The following columns are then appended, in this order:

    * ``timediff``: seconds elapsed since the previous row.
    * ``acc_from_spd_g``: speed change (km/h converted to m/s) divided by
      ``timediff`` and by 9.81, i.e. acceleration expressed in g.
    * ``{spd_col}_lag_{i}`` for i in 1..max_spd_lags, then
      ``{spd_col}_lead_{i}`` for i in 1..max_spd_leads.
    * ``acc_from_spd_g_lag{i}`` for i in 1..max_acc_lags, then
      ``acc_from_spd_g_lead{i}`` for i in 1..max_acc_leads.
      (Note: no underscore before the number, unlike the speed columns.)

    Args:
        df_group: DataFrame holding the rows of one group.
        spd_col: Name of the speed column (km/h).
        time_col: Name of the datetime column used for ordering and intervals.
        max_spd_lags: Number of lagged speed columns to create.
        max_acc_lags: Number of lagged acceleration columns to create.
        max_spd_leads: Number of lead (future) speed columns to create.
        max_acc_leads: Number of lead (future) acceleration columns to create.

    Returns:
        A new DataFrame, sorted by ``time_col``, with the feature columns
        added. Rows near the start/end of the group contain NaN wherever the
        requested lag/lead reaches past the group boundary.
    """
    df_group = df_group.sort_values(by=time_col).copy()

    # Calculate acceleration from speed
    df_group['timediff'] = df_group[time_col].diff().dt.total_seconds()
    # Convert speed diff from km/h to m/s, then divide by timediff and 9.81 m/s^2 for g's
    df_group['acc_from_spd_g'] = (df_group[spd_col].diff() * 1000 / 3600) / (df_group['timediff'] * 9.81)
    # A zero time interval (duplicate timestamps) yields +/-inf; treat it as missing
    df_group['acc_from_spd_g'] = df_group['acc_from_spd_g'].replace([np.inf, -np.inf], np.nan)

    # Lagged (past) and lead (future) speed values
    for i in range(1, max_spd_lags + 1):
        df_group[f'{spd_col}_lag_{i}'] = df_group[spd_col].shift(i)
    for i in range(1, max_spd_leads + 1):
        df_group[f'{spd_col}_lead_{i}'] = df_group[spd_col].shift(-i) # Negative shift for leads

    # Lagged and lead acceleration values. 'acc_from_spd_g' always exists at
    # this point (it is assigned above), so no fallback branch is needed.
    for i in range(1, max_acc_lags + 1):
        df_group[f'acc_from_spd_g_lag{i}'] = df_group['acc_from_spd_g'].shift(i)
    for i in range(1, max_acc_leads + 1):
        df_group[f'acc_from_spd_g_lead{i}'] = df_group['acc_from_spd_g'].shift(-i)

    return df_group

# Apply all feature engineering per (VIN, Trips) group.
# The groups are iterated explicitly instead of using `groupby(...).apply(...)`:
# `apply` operating on the grouping columns is deprecated since pandas 2.2, and
# once they are excluded the VIN / Trips columns would be missing from the
# result. Iterating a groupby always yields the full sub-frame of each group.
df_engineered = pd.concat([
    apply_all_feature_engineering(trip_df, 'VhclSpd', 'GPS_TimeStamp_Local',
                                  max_spd_lags, max_acc_lags, max_spd_leads, max_acc_leads)
    for _, trip_df in df.groupby(['VIN', 'Trips'])
])
print(df_engineered.columns)
print("Feature engineering complete.")
print("Sample of engineered data (showing some new columns):")
print(df_engineered[['VhclSpd', 'acc_from_spd_g', 'VhclSpd_lag_1', 'VhclSpd_lead_1', 'acc_from_spd_g_lag1', 'acc_from_spd_g_lead1']].head())


# Define feature columns for the model. The order below is the column order
# the scaler and the model are fitted on, so it must stay stable.
feature_cols = ['VhclSpd', 'acc_from_spd_g']
# Add all generated lag and lead columns
feature_cols += [f'VhclSpd_lag_{i}' for i in range(1, max_spd_lags + 1)]
feature_cols += [f'acc_from_spd_g_lag{i}' for i in range(1, max_acc_lags + 1)]
feature_cols += [f'VhclSpd_lead_{i}' for i in range(1, max_spd_leads + 1)]
feature_cols += [f'acc_from_spd_g_lead{i}' for i in range(1, max_acc_leads + 1)]

# Drop rows with any NaN values in the feature columns (due to lags/leads at group boundaries)
df_final = df_engineered.dropna(subset=feature_cols).reset_index(drop=True)
print(f"\nData after dropping NaNs: {df_final.shape[0]} rows.")

# Separate features (X) and target (y)
X = df_final[feature_cols]
y = df_final['Brake_Pressed']

# Split data into training and testing sets (70/30, stratified on the target
# so both splits keep the same class ratio).
# NOTE(review): this is a row-wise random split. Neighbouring rows of a trip
# share values through their lag/lead columns, so rows in the test set overlap
# with rows in the training set; splitting by trip would give a stricter
# estimate -- confirm whether that matters for the intended use.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")
print(f"Training target distribution: {Counter(y_train)}")
print(f"Test target distribution: {Counter(y_test)}")


# --- 4. Data Scaling ---
print("\n--- Starting Data Scaling ---")
# Learn the scaling statistics from the TRAINING split only, so that nothing
# about the test split leaks into the preprocessing step.
scaler = StandardScaler().fit(X_train)
# Apply that one fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Data scaling complete.")


# --- 5. Model Training & Hyperparameter Tuning (Random Forest with GridSearchCV) ---
print("\n--- Starting Model Training & Hyperparameter Tuning ---")

# Scorer that measures recall of the positive class (label 1) only
recall_class_1_scorer = make_scorer(recall_score, pos_label=1)

# Every candidate is scored on all three metrics; only 'recall_class_1'
# decides which candidate is refitted (see `refit` below).
scoring_metrics = dict(
    accuracy='accuracy',
    recall_class_1=recall_class_1_scorer,
    f1_score='f1',  # F1 reported alongside for a more complete picture
)

# Base estimator; class_weight='balanced' compensates for the rare positive class
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)

# Hyperparameter search space.
# This grid is a starting point and can be expanded or refined based on results.
param_grid_rf = dict(
    n_estimators=[50, 100, 150],   # Number of trees
    max_depth=[5, 10, 15],         # Max depth of each tree
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', None],   # Number of features to consider at each split
)

# Exhaustive search over the grid
grid_search_rf = GridSearchCV(
    rf_model,
    param_grid_rf,
    scoring=scoring_metrics,
    refit='recall_class_1',  # The winner is the candidate with the best class-1 recall
    cv=3,                    # 3 folds keeps the demo fast; consider 5 or more for production
    n_jobs=-1,               # Use all available CPU cores
    verbose=2,               # Detailed output during search
)

print("Starting GridSearchCV for Random Forest...")
grid_search_rf.fit(X_train_scaled, y_train)
print("GridSearchCV complete.")

# Report the winning configuration and its cross-validated score
print(f"\nBest parameters found: {grid_search_rf.best_params_}")
print(f"Best cross-validation 'recall_class_1' score: {grid_search_rf.best_score_:.4f}")

# Keep the refitted winner for evaluation and serialization
best_rf_model_tuned = grid_search_rf.best_estimator_

# --- 6. Model Evaluation ---
print("\n--- Evaluating Best Model on Test Set ---")

y_pred_proba_rf = best_rf_model_tuned.predict_proba(X_test_scaled)[:, 1] # Probabilities for class 1

# Evaluate different prediction thresholds.
# NOTE(review): the threshold is selected on the same test set the final
# metrics are reported on, so those metrics are optimistic; a separate
# validation split would be cleaner.
thresholds = np.arange(0.1, 0.9, 0.05)
recall_target = 0.85 # Minimum class-1 recall a threshold must reach to be preferred
best_recall_target = 0 # To track the best recall achieved
best_threshold_found = 0.5 # Default threshold
best_precision_at_threshold = 0

print("\nEvaluating different prediction thresholds:")
for t in thresholds:
    y_pred_tuned_threshold = (y_pred_proba_rf >= t).astype(int)
    # zero_division=0 keeps the score at 0 (the library default value) but
    # without a warning when a threshold produces no positive predictions.
    current_recall = recall_score(y_test, y_pred_tuned_threshold, pos_label=1, zero_division=0)
    current_precision = precision_score(y_test, y_pred_tuned_threshold, pos_label=1, zero_division=0)
    current_f1 = f1_score(y_test, y_pred_tuned_threshold, pos_label=1, zero_division=0)
    current_accuracy = accuracy_score(y_test, y_pred_tuned_threshold)

    print(f"Threshold: {t:.2f} | Recall (1): {current_recall:.4f} | Precision (1): {current_precision:.4f} | F1 (1): {current_f1:.4f} | Accuracy: {current_accuracy:.4f}")

    # Selection rule: among thresholds that reach the recall target, keep the
    # one with the highest precision; if none has reached it so far, keep the
    # one with the highest recall.
    if current_recall >= recall_target:
        if current_precision > best_precision_at_threshold: # Maximize precision among those meeting recall target
            best_recall_target = current_recall
            best_precision_at_threshold = current_precision
            best_threshold_found = t
    elif current_recall > best_recall_target and best_recall_target < recall_target: # If not yet at target, just find highest recall
        best_recall_target = current_recall
        best_precision_at_threshold = current_precision
        best_threshold_found = t

# Predictions at the selected threshold, computed once and reused below
final_y_pred_test = (y_pred_proba_rf >= best_threshold_found).astype(int)

if best_recall_target < recall_target:
    # Make it explicit that the reported threshold does not satisfy the target
    print(f"\nWarning: no threshold reached the recall target of {recall_target:.2f}; reporting the fallback threshold instead.")

print(f"\nOptimal Threshold for Recall >= {recall_target:.2f}: {best_threshold_found:.2f}")
print(f"Achieved Recall (Class 1) at this threshold: {recall_score(y_test, final_y_pred_test, pos_label=1, zero_division=0):.4f}")
print(f"Achieved Precision (Class 1) at this threshold: {precision_score(y_test, final_y_pred_test, pos_label=1, zero_division=0):.4f}")
print(f"Achieved F1-Score (Class 1) at this threshold: {f1_score(y_test, final_y_pred_test, pos_label=1, zero_division=0):.4f}")

print("\nFinal Classification Report on Test Set:")
print(confusion_matrix(y_test, final_y_pred_test))
print(f"Final Accuracy: {accuracy_score(y_test, final_y_pred_test):.4f}")


# --- 7. Model and Scaler Serialization ---
print("\n--- Saving Model and Scaler ---")
# Persist the tuned model together with the FITTED scaler: new data has to be
# transformed with exactly the statistics the model was trained on.
for artifact_path, artifact in (
    ('random_forest_model.pkl', best_rf_model_tuned),
    ('scaler.pkl', scaler),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("Model and Scaler saved as 'random_forest_model.pkl' and 'scaler.pkl'.")


# --- 8. Prediction on New Unseen Data (Deployment Simulation) ---
print("\n--- Simulating Prediction on New Unseen Data ---")

# Create a dummy new unseen DataFrame (similar structure to original raw data)
new_unseen_data = []
new_vin = 'V999'
new_trip_id = 1
new_start_time = datetime(2024, 6, 1, 10, 0)
new_elapsed_seconds = 0.0
for k in range(100): # 100 new data points
    current_time = new_start_time + timedelta(seconds=new_elapsed_seconds)
    # Accumulate jittered 0.9-1.1 s intervals so timestamps are strictly
    # increasing; scaling the jitter by k would produce out-of-order records.
    new_elapsed_seconds += np.random.uniform(0.9, 1.1)
    speed = max(0, 70 + 10 * np.cos(k / 20) + np.random.randn() * 5)
    new_unseen_data.append([new_vin, new_trip_id, current_time, speed, 0]) # Brake_Pressed is unknown initially

new_unseen_df_raw = pd.DataFrame(new_unseen_data, columns=['VIN', 'Trips', 'GPS_TimeStamp_Local', 'VhclSpd', 'Brake_Pressed_Actual'])

print("New unseen raw data sample:")
print(new_unseen_df_raw.head())

# Apply the same feature engineering steps to the new unseen data.
# Groups are iterated explicitly so that the VIN / Trips columns are always
# present in the result (`groupby(...).apply(...)` on the grouping columns is
# deprecated since pandas 2.2).
new_unseen_df_fe = pd.concat([
    apply_all_feature_engineering(trip_df, 'VhclSpd', 'GPS_TimeStamp_Local',
                                  max_spd_lags, max_acc_lags, max_spd_leads, max_acc_leads)
    for _, trip_df in new_unseen_df_raw.groupby(['VIN', 'Trips'])
])

# Keep only the rows that have a complete feature window. The row labels are
# deliberately NOT reset here: they identify which records survived, so the
# identifiers attached to each prediction come from the very same rows.
new_unseen_valid = new_unseen_df_fe.dropna(subset=feature_cols)
X_new_unseen_features = new_unseen_valid[feature_cols]
original_indices_for_mapping = X_new_unseen_features.index # Original row labels of the scored records

# Load the saved model and scaler.
# NOTE: unpickling executes arbitrary code; only load files this pipeline
# wrote itself or that come from a trusted source.
with open('random_forest_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
with open('scaler.pkl', 'rb') as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

# Scale the new unseen data using the *loaded* scaler
X_new_unseen_scaled = loaded_scaler.transform(X_new_unseen_features)

# Get probabilities from the loaded model
y_pred_proba_new_unseen = loaded_model.predict_proba(X_new_unseen_scaled)[:, 1]

# Apply the optimal threshold found during training
y_pred_new_unseen = (y_pred_proba_new_unseen >= best_threshold_found).astype(int)

# Attach the predictions to the identifiers of the rows that were scored.
# Both come from `new_unseen_valid` in the same row order, so they line up
# position by position.
predictions_output = new_unseen_valid[['VIN', 'Trips', 'GPS_TimeStamp_Local']].reset_index(drop=True)
predictions_output['Brake_Predicted'] = y_pred_new_unseen
predictions_output['Brake_Probability_Class1'] = y_pred_proba_new_unseen

print("\nNew Unseen Data with Predictions:")
print(predictions_output.head())
print(f"\nPredicted Brake_Pressed counts for new data: {Counter(predictions_output['Brake_Predicted'])}")