In [None]:
# RTA Accident Severity Prediction – Linear Regression

# --- 1. Library Imports ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import numpy as np
import os

# --- 2. Load the Dataset ---
# The CSV is read from a path relative to the working directory. When the file
# is absent an upload hint is printed and everything under `else:` is skipped.
file_path = 'sample_data/RTA Dataset.csv'
dataset_missing = not os.path.exists(file_path)
if dataset_missing:
    print(f"Error: File not found at {file_path}. Please upload 'RTA Dataset.csv' to the 'sample_data' folder.")
else:
    df = pd.read_csv(file_path)

    # Quick sanity report: column names and the (rows, columns) shape.
    available_columns = df.columns.tolist()
    print("Dataset loaded successfully.")
    print("Columns available:", available_columns)
    print(f"Shape: {df.shape}\n")

    # --- 3. Encode Target Variable ---
    # Severity labels become ordinal scores (1-3). A label that is not a key of
    # the map turns into NaN under .map(), and those rows are then removed.
    severity_map = {
        'Slight Injury': 1,
        'Serious Injury': 2,
        'Fatal injury': 3,
    }
    severity_scores = df['Accident_severity'].map(severity_map)
    df['Accident_severity'] = severity_scores
    df.dropna(subset=['Accident_severity'], inplace=True)

    # --- 4. Drop Irrelevant or Missing Columns ---
    # Columns excluded from modelling. Names that are not present in the frame
    # are skipped silently because of errors='ignore'.
    cols_to_drop = [
        # Original exclusion list
        'Time', 'Day_of_week', 'Sex_of_driver', 'Vehicle_type',
        'Casualty_severity', 'Lanes_or_Medians', 'Work_of_casuality',
        'Service_year_of_vehicle', 'Defect_of_vehicle', 'Number_of_casualties',
        'Road_surface_conditions', 'Junction_Control', 'Type_of_collision',
        'Casualty_class', 'Casualty_sex', 'Casualty_age_band',
        # Categorical columns removed outright instead of being one-hot encoded
        'Age_band_of_driver', 'Educational_level', 'Type_of_vehicle',
        'Owner_of_vehicle', 'Area_accident_occured', 'Road_allignment',
        'Types_of_Junction', 'Road_surface_type', 'Weather_conditions',
        'Vehicle_movement', 'Sex_of_casualty', 'Fitness_of_casuality',
        'Pedestrian_movement', 'Cause_of_accident',
    ]
    df.drop(columns=cols_to_drop, errors='ignore', inplace=True)

    # Impute whatever is still missing with each column's most frequent value
    # (the first row returned by DataFrame.mode()).
    column_modes = df.mode().iloc[0]
    df.fillna(column_modes, inplace=True)

    # --- 5. Create Age Category Based on 'Age_band_of_casualty' ---
    def categorize_age(age):
        """Collapse an age-band label into one of three coarse groups.

        Returns 'Age_Young' for 'Under 18' / '18-30', 'Age_Adult' for
        '31-50' / '51-64', and 'Age_Old' for every other value (the catch-all
        therefore also receives unknown or unexpected labels).
        """
        band_groups = (
            ('Age_Young', ('Under 18', '18-30')),
            ('Age_Adult', ('31-50', '51-64')),
        )
        for group_label, bands in band_groups:
            if age in bands:
                return group_label
        return 'Age_Old'

    # Derive the coarse age group only when the source column is still present.
    if 'Age_band_of_casualty' not in df.columns:
        # Nothing to categorize, so no 'Age_Category' column is created.
        print("Warning: 'Age_band_of_casualty' column not found for categorization.")
    else:
        age_groups = df['Age_band_of_casualty'].apply(categorize_age)
        df['Age_Category'] = age_groups
        # The raw band is replaced by the derived category.
        df.drop(columns=['Age_band_of_casualty'], inplace=True)

    # --- 6. Feature Selection ---
    # Candidate predictors. Some of these names may no longer exist in `df`
    # at this point, so each list is filtered to the columns that are
    # actually present before it is used.
    all_categorical_cols = [
        'Age_band_of_driver', 'Educational_level', 'Vehicle_driver_relation',
        'Driving_experience', 'Type_of_vehicle', 'Owner_of_vehicle',
        'Area_accident_occured', 'Road_allignment', 'Types_of_Junction',
        'Road_surface_type', 'Light_conditions', 'Weather_conditions',
        'Vehicle_movement', 'Sex_of_casualty', 'Fitness_of_casuality',
        'Pedestrian_movement', 'Cause_of_accident', 'Age_Category'
    ]
    all_numerical_cols = ['Number_of_vehicles_involved']

    # Report every selected column (including the target) that is absent.
    for col in all_categorical_cols + all_numerical_cols + ['Accident_severity']:
        if col not in df.columns:
            print(f"Warning: Selected column '{col}' is missing from the DataFrame.")

    # `categorical_cols` and `numerical_cols` are the names the encoding and
    # scaling steps read. They are built here explicitly (candidate order
    # preserved) so the cell no longer depends on a `categorical_cols`
    # variable left over from an earlier kernel session.
    categorical_cols = [col for col in all_categorical_cols if col in df.columns]
    numerical_cols = [col for col in all_numerical_cols if col in df.columns]

    # Predictors are every remaining column except the target.
    X = df.drop('Accident_severity', axis=1, errors='ignore')
    Y = df['Accident_severity']

    # --- 7. One-Hot Encoding ---
    # Expand the categorical columns into indicator columns; drop_first=True
    # omits the first level of each column so it acts as the baseline.
    X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
    X_features = X_encoded.columns.tolist()  # Column order expected at prediction time

    # --- 8. Train/Test Split ---
    # Split BEFORE scaling so the scaler never sees the held-out rows.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_encoded, Y, test_size=0.2, random_state=42
    )

    # --- 9. Scale Numerical Features ---
    # The scaler is fitted on the training rows only and then applied to the
    # test rows; fitting on the full dataset would leak test-set statistics
    # into training. `X_encoded` itself is left unscaled.
    scaler = StandardScaler()
    if numerical_cols:
        # Work on copies so the assignments below never write into a view.
        X_train = X_train.copy()
        X_test = X_test.copy()
        X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
        X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

    # --- 10. Train Model ---
    # Ordinary least squares on the training split (fit() returns the estimator).
    model = LinearRegression().fit(X_train, Y_train)

    # --- 11. Evaluate Model ---
    # Score the held-out split with MSE and R-squared.
    Y_pred = model.predict(X_test)
    mse = mean_squared_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)

    banner = "="*30
    report_lines = (
        "\n" + banner,
        "Model Training Complete",
        f"Mean Squared Error (MSE): {mse:.4f}",
        f"R-squared Score: {r2:.4f}",
        banner + "\n",
    )
    for report_line in report_lines:
        print(report_line)

    # --- 12. Save Model and Artifacts ---
    # Three files are written: the fitted model, the ordered list of encoded
    # feature names, and the scaler object.
    model_path = '/content/accident_severity_linear_model.joblib'
    features_path = '/content/model_features.joblib'
    scaler_path = '/content/model_scaler.joblib'

    artifacts = (
        (model, model_path),
        (X_features, features_path),
        (scaler, scaler_path),
    )
    for artifact, destination in artifacts:
        joblib.dump(artifact, destination)

    print("Model, features list, and scaler saved in /content directory.")
    print("Next step: you can load and test predictions using the saved files.")

Dataset loaded successfully.
Columns available: ['Time', 'Day_of_week', 'Age_band_of_driver', 'Sex_of_driver', 'Educational_level', 'Vehicle_driver_relation', 'Driving_experience', 'Type_of_vehicle', 'Owner_of_vehicle', 'Service_year_of_vehicle', 'Defect_of_vehicle', 'Area_accident_occured', 'Lanes_or_Medians', 'Road_allignment', 'Types_of_Junction', 'Road_surface_type', 'Road_surface_conditions', 'Light_conditions', 'Weather_conditions', 'Type_of_collision', 'Number_of_vehicles_involved', 'Number_of_casualties', 'Vehicle_movement', 'Casualty_class', 'Sex_of_casualty', 'Age_band_of_casualty', 'Casualty_severity', 'Work_of_casuality', 'Fitness_of_casuality', 'Pedestrian_movement', 'Cause_of_accident', 'Accident_severity']
Shape: (12316, 32)


Model Training Complete
Mean Squared Error (MSE): 0.1741
R-squared Score: 0.0106

Model, features list, and scaler saved in /content directory.
Next step: you can load and test predictions using the saved files.


In [None]:
# Tool Code Block 2: Prediction Example
#
# Loads the artifacts written by the training cell and scores one hand-built
# row. The cell is self-contained: it reads everything it needs from disk
# instead of relying on variables left in the kernel.

import joblib
import pandas as pd
import numpy as np

# --- Load the saved components ---
# NOTE: joblib.load unpickles arbitrary Python objects; only load files that
# were produced by a trusted training run.
model_path = '/content/accident_severity_linear_model.joblib'
features_path = '/content/model_features.joblib'
scaler_path = '/content/model_scaler.joblib'

loaded_model = joblib.load(model_path)
X_features = joblib.load(features_path)
loaded_scaler = joblib.load(scaler_path)

# --- Build one example row in the encoded feature space ---
hypothetical_data = pd.DataFrame({
    'Number_of_vehicles_involved': [2],
    'Driving_experience_1-2yr': [1],
    'Driving_experience_2-5yr': [0],
    # ... other Driving_experience columns set to 0
    'Vehicle_driver_relation_Owner': [1],
    # ... other Vehicle_driver_relation columns set to 0
    'Light_conditions_Darkness - lights unlit': [0],
    'Light_conditions_Darkness - lights lit': [0],
    'Light_conditions_Daylight': [1],
    'Age_Category_Age_Old': [0],
    'Age_Category_Age_Young': [0] # Implies Age_Adult is the baseline
})

# Columns that are not trained features are discarded by the alignment below.
# Say so explicitly: a misspelled or baseline dummy name would otherwise be
# ignored without any visible effect on the prediction.
unknown_cols = [col for col in hypothetical_data.columns if col not in X_features]
if unknown_cols:
    print(f"Warning: ignoring columns that are not model features: {unknown_cols}")

# Align to the training layout: same columns, same order, missing ones set to 0.
hypothetical_data = hypothetical_data.reindex(columns=X_features, fill_value=0)

# Scale numerical columns with the scaler saved at training time. The guard
# covers a training run in which no numerical column was available.
numerical_cols = [col for col in ['Number_of_vehicles_involved'] if col in X_features]
if numerical_cols:
    hypothetical_data[numerical_cols] = loaded_scaler.transform(hypothetical_data[numerical_cols])

# Make the prediction
predicted_severity = loaded_model.predict(hypothetical_data)

print(f"Predicted Accident Severity Score: {predicted_severity[0]:.2f}")

Predicted Accident Severity Score: 1.16
