In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
import joblib

In [2]:
df = pd.read_csv(r"C:\Users\joelf\Downloads\cleaned.csv")

# Data Cleaning: Dropping unnecessary columns and handling missing values
df_cleaned = df.drop(columns=['Unnamed: 0']).dropna()

In [3]:
df_cleaned.head()

Unnamed: 0,age_band_of_driver,vehicle_type,age_of_vehicle,weather_conditions,day_of_week,road_surface_conditions,light_conditions,sex_of_driver,season,speed_limit,accident_seriousness
0,16-25,Car,7.0,Fine no high winds,Tuesday,Wet or damp,Darkness - no lighting,Male,Winter,60.0,Fatal
1,26-45,Motorcycle,7.0,Fine no high winds,Thursday,Dry,Daylight,Male,Autumn,40.0,Fatal
2,26-45,Motorcycle,7.0,Fine no high winds,Saturday,Dry,Darkness - no lighting,Male,Summer,70.0,Fatal
3,46-65,Other Vehicle,7.0,Snowing no high winds,Tuesday,Snow,Darkness - lights lit,Male,Rainy,50.0,Fatal
4,46-65,Other Vehicle,7.0,Snowing no high winds,Tuesday,Snow,Darkness - lights lit,Male,Autumn,50.0,Fatal


In [4]:
label_encoders = {}
for column in df_cleaned.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df_cleaned[column] = le.fit_transform(df_cleaned[column])
    label_encoders[column] = le

# Splitting data into features and target
X = df_cleaned.drop(columns=['accident_seriousness'])
y = df_cleaned['accident_seriousness']

In [5]:
X

Unnamed: 0,age_band_of_driver,vehicle_type,age_of_vehicle,weather_conditions,day_of_week,road_surface_conditions,light_conditions,sex_of_driver,season,speed_limit
0,0,1,7.0,1,5,5,2,1,3,60.0
1,1,3,7.0,1,4,1,3,1,0,40.0
2,1,3,7.0,1,2,1,2,1,2,70.0
3,2,4,7.0,7,5,4,0,1,1,50.0
4,2,4,7.0,7,5,4,0,1,0,50.0
...,...,...,...,...,...,...,...,...,...,...
25917,0,1,1.0,1,6,1,3,1,1,30.0
25918,0,3,1.0,1,5,1,3,1,1,30.0
25919,0,3,1.0,1,4,1,3,1,0,30.0
25920,0,1,1.0,1,6,1,3,1,1,30.0


In [6]:
y

0        0
1        0
2        0
3        0
4        0
        ..
25917    1
25918    1
25919    1
25920    1
25921    1
Name: accident_seriousness, Length: 15149, dtype: int32

In [8]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Training the model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Testing the model
y_pred = rf_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.36      0.45       388
           1       0.80      0.93      0.86      1369
           2       0.78      0.74      0.76      1273

    accuracy                           0.78      3030
   macro avg       0.73      0.67      0.69      3030
weighted avg       0.76      0.78      0.76      3030



In [10]:
model_path = 'accident_severity_model.pkl'
joblib.dump(rf_model, model_path)
print(f"Model saved to: {model_path}")

# Exporting the scaler for future use
scaler_path = 'scaler.pkl'
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to: {scaler_path}")

Model saved to: accident_severity_model.pkl
Scaler saved to: scaler.pkl


In [13]:
model_path = 'accident_severity_model.pkl'
scaler_path = 'scaler.pkl'
rf_model = joblib.load(model_path)
scaler = joblib.load(scaler_path)

# Load label encoders (if saved separately; otherwise, use them as defined in training)
label_encoders = {
    "age_band_of_driver": LabelEncoder().fit(["16-25", "26-45", "46-65", "65+", "Under 16"]),
    "vehicle_type": LabelEncoder().fit(["Car", "Motorcycle", "Other Vehicle"]),
    "weather_conditions": LabelEncoder().fit(["Fine no high winds", "Raining no high winds", "Snowing no high winds"]),
    "day_of_week": LabelEncoder().fit(["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]),
    "road_surface_conditions": LabelEncoder().fit(["Dry", "Wet or damp", "Snow"]),
    "light_conditions": LabelEncoder().fit(["Daylight", "Darkness - no lighting", "Darkness - lights lit"]),
    "sex_of_driver": LabelEncoder().fit(["Male", "Female"]),
    "season": LabelEncoder().fit(["Winter", "Summer", "Rainy", "Autumn"]),
}

# New input data (in strings)
new_data = pd.DataFrame([{
    "age_band_of_driver": "65+",
    "vehicle_type": "Car",
    "age_of_vehicle": 100,
    "weather_conditions": "Fine no high winds",
    "day_of_week": "Friday",
    "road_surface_conditions": "Dry",
    "light_conditions": "Daylight",
    "sex_of_driver": "Female",
    "season": "Summer",
    "speed_limit": 20
}])

# Encode categorical columns using label encoders
for column, le in label_encoders.items():
    if column in new_data:
        new_data[column] = le.transform(new_data[column])

# Scale numerical features
new_data_scaled = scaler.transform(new_data)

# Predicting
prediction = rf_model.predict(new_data_scaled)
print("Predicted Accident Severity:", prediction)

Predicted Accident Severity: [0]
