# **Road Accident Severity Analysis**
This notebook demonstrates the implementation of various machine learning models to predict road accident severity.

# Setup and Installation
First, let's install the required packages:

In [None]:
!pip install scikit-learn
!pip install imbalanced-learn
!pip install seaborn

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt

# Data Loading and Preprocessing
## Load the Dataset

In [None]:
# Load the dataset.
# The CSV is read from the working directory when it is already there, so re-running
# this cell (or running the notebook outside Colab with the file next to it) does not
# block on an upload prompt. The Colab upload dialog is only used as a fallback.
DATA_FILE = "RTADatasetE1.csv"
uploaded = {}  # stays empty when no upload was needed

try:
    df = pd.read_csv(DATA_FILE)
except FileNotFoundError:
    # google.colab only exists inside a Colab runtime, hence the lazy import
    from google.colab import files
    uploaded = files.upload()
    if DATA_FILE not in uploaded:
        raise FileNotFoundError(
            f"Expected an uploaded file named {DATA_FILE!r}, got: {list(uploaded)}"
        )
    df = pd.read_csv(DATA_FILE)

## Preprocess Time Feature

In [None]:
# Parse the 'Time' strings (HH:MM:SS) once, store the parsed values back in the
# column, and derive the hour of day from the same parsed series.
time_parsed = pd.to_datetime(df['Time'], format='%H:%M:%S')
df['Time'] = time_parsed
df['Hour'] = time_parsed.dt.hour

## Feature Encoding

In [None]:
def encode_categories(series, mapping, missing_code=-1):
    """Map category labels to integer codes without leaving NaN behind.

    A plain ``series.map(mapping)`` yields NaN for every value that is not a key of
    ``mapping``. That includes missing values: ``pd.read_csv`` parses the text ``NA``
    as NaN by default, so an ``'NA'`` key in the mapping never matches. Because the
    data-preparation step drops every column that contains NaN, a single such value
    would silently remove the whole feature.

    Parameters:
        series: column holding the raw category labels.
        mapping: dict from label to integer code.
        missing_code: code assigned to missing values and to labels absent from
            ``mapping`` (default -1, the code the mappings use for unknown values).

    Returns:
        Integer-coded series. A series that is already numeric is returned
        unchanged, so re-running the cell does not wipe the codes.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series

    encoded = series.map(mapping)
    # Labels that exist in the data but have no entry in the mapping
    unmapped = series[encoded.isna()].dropna().unique().tolist()
    if unmapped:
        print(f"Warning: {series.name!r} has labels without a mapping "
              f"(encoded as {missing_code}): {unmapped}")
    return encoded.fillna(missing_code).astype(int)


# Day of week encoding
day_map = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3,
           'Friday': 4, 'Saturday': 5, 'Sunday': 6, 'NA': -1}
df['Day_of_week'] = encode_categories(df['Day_of_week'], day_map)

# Age band encoding
age_map = {'Under 18': 0, '18-30': 1, '31-50': 2, 'Above 50': 3, 'NA': -1}
df['Age_band_of_driver'] = encode_categories(df['Age_band_of_driver'], age_map)
df['Age_band_of_casualty'] = encode_categories(df['Age_band_of_casualty'], age_map)

# Sex encoding using LabelEncoder
# `le` is refit for each column, so afterwards it only holds the classes of
# 'Sex_of_casualty'.
le = LabelEncoder()
df['Sex_of_driver'] = le.fit_transform(df['Sex_of_driver'])
df['Sex_of_casualty'] = le.fit_transform(df['Sex_of_casualty'])

# Education level encoding
# NOTE(review): 'Illiterate' and 'Writing & reading' share the code -1 that is also
# used for unknown values - confirm this is intended.
edu_map = {'Elementary school': 0, 'Junior high school': 1, 'High school': 2,
           'College': 3, 'Illiterate': -1, 'Writing & reading': -1}
df['Educational_level'] = encode_categories(df['Educational_level'], edu_map)

# Vehicle driver relation encoding
vdr_map = {'Employee': 1, 'Owner': 0, 'Unknown': -1, 'Other': 3}
df['Vehicle_driver_relation'] = encode_categories(df['Vehicle_driver_relation'], vdr_map)

# Add all other encoding mappings here (continuing with the same pattern)
# [Note: For brevity, only a subset of the encodings is shown.
#  Include all other mappings from the original code]

# Data Preparation

In [None]:
# Prepare features and target
X = df.drop(['Time', 'Accident_severity'], axis=1)
y = df['Accident_severity']

# Remove columns with missing data
X = X.dropna(axis=1, how='any')

# StandardScaler needs numeric input; fail early with the list of offending columns
non_numeric = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()
if non_numeric:
    raise ValueError(f"Encode these columns before scaling: {non_numeric}")

# Split BEFORE scaling and oversampling. Fitting the scaler or SMOTE on the full
# dataset leaks information about the test rows into training and puts synthetic
# samples into the test set, which inflates every reported metric.
# stratify=y keeps the class proportions of the imbalanced target in both parts.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features: statistics are learned from the training part only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Handle imbalanced data using SMOTE on the training part only; the test set keeps
# its real class distribution. The seed makes the synthetic samples reproducible.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_scaled, y_train_raw)

# Model Training and Evaluation
## Histogram-based Gradient Boosting

In [None]:
# HGB Model
# Grid search with 5-fold cross-validation over 27 parameter combinations.
hgb_params = {
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth': [3, 5, 7],
    'max_iter': [100, 200, 300]
}
# random_state fixes the estimator's internal randomness so that repeated runs
# select the same parameters and report the same metrics.
hgb_model = GridSearchCV(HistGradientBoostingClassifier(random_state=42), hgb_params, cv=5)
hgb_model.fit(X_train, y_train)
# predict() uses the estimator refit with the best parameters (GridSearchCV default)
hgb_pred = hgb_model.predict(X_test)

# Calculate metrics (macro average: every class counts equally)
hgb_metrics = {
    'accuracy': accuracy_score(y_test, hgb_pred),
    'precision': precision_score(y_test, hgb_pred, average='macro'),
    'recall': recall_score(y_test, hgb_pred, average='macro'),
    'f1': f1_score(y_test, hgb_pred, average='macro')
}

print("HGB Model Metrics:")
for metric, value in hgb_metrics.items():
    print(f"{metric.capitalize()}: {value:.4f}")

## Random Forest

In [None]:
# RF Model
# Grid search with 5-fold cross-validation over 27 parameter combinations.
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10]
}
# random_state makes the forest (bootstrap samples, feature subsets) reproducible;
# n_jobs=-1 runs the 135 cross-validation fits in parallel without changing results.
rf_model = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, n_jobs=-1)
rf_model.fit(X_train, y_train)
# predict() uses the estimator refit with the best parameters (GridSearchCV default)
rf_pred = rf_model.predict(X_test)

# Calculate metrics (macro average: every class counts equally)
rf_metrics = {
    'accuracy': accuracy_score(y_test, rf_pred),
    'precision': precision_score(y_test, rf_pred, average='macro'),
    'recall': recall_score(y_test, rf_pred, average='macro'),
    'f1': f1_score(y_test, rf_pred, average='macro')
}

print("\nRF Model Metrics:")
for metric, value in rf_metrics.items():
    print(f"{metric.capitalize()}: {value:.4f}")

## Support Vector Machine

In [None]:
# SVM Model
# Search over the regularisation strength C and the kernel coefficient gamma
# with 5-fold cross-validation, then predict on the held-out test set.
svm_params = dict(C=[1, 10, 100], gamma=['scale', 'auto'])
svm_model = GridSearchCV(SVC(), svm_params, cv=5)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Calculate metrics: accuracy first, then the macro-averaged scores in display order
svm_metrics = {'accuracy': accuracy_score(y_test, svm_pred)}
svm_metrics.update(
    (label, score_fn(y_test, svm_pred, average='macro'))
    for label, score_fn in [
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    ]
)

print("\nSVM Model Metrics:")
print("\n".join(
    f"{label.capitalize()}: {score:.4f}" for label, score in svm_metrics.items()
))

## K-Nearest Neighbors

In [None]:
# KNN Model
# Search over the neighbourhood size and the vote weighting with 5-fold
# cross-validation, then predict on the held-out test set.
knn_params = dict(n_neighbors=[3, 5, 7], weights=['uniform', 'distance'])
knn_model = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=knn_params, cv=5)
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

# Calculate metrics (precision, recall and F1 are macro-averaged over the classes)
knn_metrics = dict(
    accuracy=accuracy_score(y_test, knn_pred),
    precision=precision_score(y_test, knn_pred, average='macro'),
    recall=recall_score(y_test, knn_pred, average='macro'),
    f1=f1_score(y_test, knn_pred, average='macro'),
)

print("\nKNN Model Metrics:")
for label, score in knn_metrics.items():
    print("{}: {:.4f}".format(label.capitalize(), score))

# Visualization Functions

In [None]:
def plot_bar_graph(metric, values, algorithms):
    """
    Plot a bar graph comparing one metric across models.

    The bar with the highest value (the first one on ties) gets a red edge.

    Parameters:
        metric: metric name, used for the y-axis label and the title.
        values: one score per algorithm; any iterable (list, tuple, NumPy array,
            pandas Series). May be empty, in which case no bar is highlighted.
        algorithms: bar labels, in the same order as ``values``.
    """
    # Materialise once: arrays and Series have no list-style .index() method
    values = list(values)

    plt.figure(figsize=(10, 6))
    bars = plt.bar(algorithms, values, color='lightblue', edgecolor='black')
    if values:
        best = max(range(len(values)), key=values.__getitem__)
        bars[best].set_edgecolor('red')
    plt.xlabel('Algorithms')
    plt.ylabel(metric)
    plt.title(f'{metric} of Different Algorithms')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

def plot_confusion_matrix(conf_matrix, title):
    """
    Show a confusion matrix as an annotated heatmap.

    Parameters:
        conf_matrix: matrix of integer counts (cells are formatted with 'd').
        title: figure title.
    """
    _, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    plt.show()

In [None]:
# Prepare data for plotting: one list of scores per metric, ordered like `algorithms`
algorithms = ['HGB', 'RF', 'SVM', 'KNN']
model_metrics = [hgb_metrics, rf_metrics, svm_metrics, knn_metrics]
model_preds = [hgb_pred, rf_pred, svm_pred, knn_pred]

metrics = {
    label: [scores[key] for scores in model_metrics]
    for label, key in [
        ('Precision', 'precision'),
        ('Accuracy', 'accuracy'),
        ('F1-score', 'f1'),
        ('Recall', 'recall'),
    ]
}

# Plot performance metrics (one bar chart per metric)
for metric, values in metrics.items():
    plot_bar_graph(metric, values, algorithms)

# Plot confusion matrices (one heatmap per model, computed on the test set)
confusion_matrices = {
    algo: confusion_matrix(y_test, pred)
    for algo, pred in zip(algorithms, model_preds)
}

for name, matrix in confusion_matrices.items():
    plot_confusion_matrix(matrix, f'{name} Confusion Matrix')

# Save the Best Model (Optional)

In [None]:
import joblib

# Choose the model to persist from the measured test results (highest macro F1;
# the first one wins on ties) instead of assuming which model performed best.
candidates = {
    'HGB': (hgb_model, hgb_metrics),
    'RF': (rf_model, rf_metrics),
    'SVM': (svm_model, svm_metrics),
    'KNN': (knn_model, knn_metrics),
}
best_name = max(candidates, key=lambda name: candidates[name][1]['f1'])
best_model, best_scores = candidates[best_name]
print(f"Saving {best_name} (macro F1 = {best_scores['f1']:.4f})")

joblib.dump(best_model, 'best_model.joblib')
# The models were trained on standardized features, so the fitted scaler has to be
# stored as well; without it new data cannot be prepared for prediction.
joblib.dump(scaler, 'scaler.joblib')

# To load the model later:
# loaded_model = joblib.load('best_model.joblib')
# loaded_scaler = joblib.load('scaler.joblib')
# predictions = loaded_model.predict(loaded_scaler.transform(new_X))