In [38]:
!pip install pandas
!pip install numpy




[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [39]:
import pandas as pd
import numpy as np

def preprocess_for_xgboost(acc_path, cas_path, veh_path):
    # 1. Load data and remove the trailing empty rows found in these files
    acc = pd.read_csv('C:\\Users\\kunal\\OneDrive\\Desktop\\Coding\\ai-road-risk-intelligence\\backend\\archive (2)\\AccidentsBig.csv').dropna(subset=['Accident_Index'])
    cas = pd.read_csv('C:\\Users\\kunal\\OneDrive\\Desktop\\Coding\\ai-road-risk-intelligence\\backend\\archive (2)\\CasualtiesBig.csv').dropna(subset=['Accident_Index'])
    veh = pd.read_csv('C:\\Users\\kunal\\OneDrive\\Desktop\\Coding\\ai-road-risk-intelligence\\backend\\archive (2)\\VehiclesBig.csv').dropna(subset=['Accident_Index'])

    # 2. Aggregate Vehicle Data (Many Vehicles -> One Accident)
    # We summarize vehicle info so we don't duplicate accident rows during merge
    veh_agg = veh.groupby('Accident_Index').agg({
        'Vehicle_Type': lambda x: x.mode()[0] if not x.mode().empty else np.nan,
        'Age_of_Driver': 'mean',
        'Engine_Capacity_(CC)': 'mean',
        'Age_of_Vehicle': 'mean',
        'Sex_of_Driver': lambda x: (x == 1).sum() # Count of male drivers as a feature
    }).reset_index()
    
    veh_agg.rename(columns={
        'Vehicle_Type': 'Primary_Vehicle_Type',
        'Age_of_Driver': 'Avg_Driver_Age',
        'Engine_Capacity_(CC)': 'Avg_Engine_CC',
        'Age_of_Vehicle': 'Avg_Vehicle_Age',
        'Sex_of_Driver': 'Male_Driver_Count'
    }, inplace=True)

    # 3. Aggregate Casualty Data (Many Casualties -> One Accident)
    cas_agg = cas.groupby('Accident_Index').agg({
        'Age_of_Casualty': 'mean',
        'Sex_of_Casualty': 'mean'
    }).reset_index()
    cas_agg.rename(columns={'Age_of_Casualty': 'Avg_Casualty_Age'}, inplace=True)

    # 4. Merge all into the main Accidents table
    df = acc.merge(veh_agg, on='Accident_Index', how='left')
    df = df.merge(cas_agg, on='Accident_Index', how='left')

    # 5. Handle Missing Values: Convert '-1' to NaN
    # XGBoost handles NaNs natively by learning the best default branch direction
    df.replace(-1, np.nan, inplace=True)

    # 6. Feature Engineering: Time & Date
    df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y', errors='coerce')
    df['Month'] = df['Date'].dt.month
    df['Hour'] = pd.to_datetime(df['Time'], format='%H:%M', errors='coerce').dt.hour
    
    # 7. Drop non-numeric / ID / Leaky columns
    # We drop 'Casualty_Severity' (if present) because it leaks the answer for Accident_Severity
    cols_to_drop = ['Accident_Index', 'LSOA_of_Accident_Location', 
                    'Local_Authority_(Highway)', 'Time', 'Date']
    df_final = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

    # 8. Target Alignment
    # XGBoost multiclass requires labels to start at 0 (Original is 1, 2, 3)
    if 'Accident_Severity' in df_final.columns:
        df_final['Accident_Severity'] = df_final['Accident_Severity'] - 1

    return df_final

# Execute processing
df_train = preprocess_for_xgboost('AccidentsBig.csv', 'CasualtiesBig.csv', 'VehiclesBig.csv')

# Note: You can now pass df_train directly to xgb.DMatrix
print(f"Dataset ready for XGBoost. Shape: {df_train.shape}")


  acc = pd.read_csv('C:\\Users\\kunal\\OneDrive\\Desktop\\Coding\\ai-road-risk-intelligence\\backend\\archive (2)\\AccidentsBig.csv').dropna(subset=['Accident_Index'])


Dataset ready for XGBoost. Shape: (59998, 34)


In [40]:
df_train.head()

ModuleNotFoundError: No module named 'pandas.io.formats.string'

ModuleNotFoundError: No module named 'pandas.io.formats.html'

In [41]:
!pip install xgboost




[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [42]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


In [43]:
# Target column
y = df_train['Accident_Severity']

# Features
X = df_train.drop(columns=['Accident_Severity'])


In [44]:
X = pd.get_dummies(X, drop_first=True)


In [45]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [46]:
model = XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    tree_method='hist',
    random_state=42
)


In [47]:
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=True
)


[0]	validation_0-mlogloss:0.42830
[1]	validation_0-mlogloss:0.42548
[2]	validation_0-mlogloss:0.42287
[3]	validation_0-mlogloss:0.42093
[4]	validation_0-mlogloss:0.41880
[5]	validation_0-mlogloss:0.41715
[6]	validation_0-mlogloss:0.41535
[7]	validation_0-mlogloss:0.41362
[8]	validation_0-mlogloss:0.41216
[9]	validation_0-mlogloss:0.41085
[10]	validation_0-mlogloss:0.40945
[11]	validation_0-mlogloss:0.40834
[12]	validation_0-mlogloss:0.40719
[13]	validation_0-mlogloss:0.40602
[14]	validation_0-mlogloss:0.40502
[15]	validation_0-mlogloss:0.40427
[16]	validation_0-mlogloss:0.40340
[17]	validation_0-mlogloss:0.40265
[18]	validation_0-mlogloss:0.40183
[19]	validation_0-mlogloss:0.40102
[20]	validation_0-mlogloss:0.40036
[21]	validation_0-mlogloss:0.39988
[22]	validation_0-mlogloss:0.39925
[23]	validation_0-mlogloss:0.39866
[24]	validation_0-mlogloss:0.39814
[25]	validation_0-mlogloss:0.39772
[26]	validation_0-mlogloss:0.39719
[27]	validation_0-mlogloss:0.39673
[28]	validation_0-mlogloss:0.3

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\kunal\\OneDrive\\Desktop\\Coding\\ai-road-risk-intelligence\\venv\\Lib\\site-packages\\xgboost\\VERSION'

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\kunal\\OneDrive\\Desktop\\Coding\\ai-road-risk-intelligence\\venv\\Lib\\site-packages\\xgboost\\VERSION'

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.8, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, feature_weights=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=300, n_jobs=None, num_class=3, ...)

In [48]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.867

Classification Report:

              precision    recall  f1-score   support

         0.0       1.00      0.01      0.02       129
         1.0       0.49      0.02      0.05      1476
         2.0       0.87      1.00      0.93     10395

    accuracy                           0.87     12000
   macro avg       0.79      0.34      0.33     12000
weighted avg       0.82      0.87      0.81     12000



In [49]:
print(confusion_matrix(y_test, y_pred))


[[    1     9   119]
 [    0    36  1440]
 [    0    28 10367]]


In [50]:
joblib.dump(model, "accident_risk_xgb_model.pkl")
joblib.dump(X.columns, "model_features.pkl")

['model_features.pkl']

In [51]:
!pip install "numpy<2.4"




[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [52]:
import shap
import pandas as pd
import numpy as np

explainer = shap.TreeExplainer(model)

sample = X_test.iloc[[0]]

# SHAP values
shap_values = explainer.shap_values(sample)

# Convert to numpy
shap_array = np.array(shap_values)

# Get predicted class
pred_class = model.predict(sample)[0]

# Select SHAP values for predicted class
shap_values_1d = np.abs(shap_array[0, :, pred_class])

print(len(X_test.columns), len(shap_values_1d))

# Create dataframe
shap_df = pd.DataFrame({
    "feature": X_test.columns,
    "impact": shap_values_1d
})

EXCLUDED_FEATURES=["lattitude", 'longitude']

top_factors = [
    f for f in shap_df.sort_values(
        by="impact",
        ascending=False
    )["feature"].tolist()
    if f not in EXCLUDED_FEATURES
][:5]

print(top_factors)


33 33
['Number_of_Vehicles', 'Avg_Driver_Age', 'Number_of_Casualties', 'Junction_Detail', 'Primary_Vehicle_Type']


In [53]:
print("Columns:", len(X_test.columns))
print("SHAP shape:", np.array(shap_values).shape)


Columns: 33
SHAP shape: (1, 33, 3)


In [54]:

explainer = shap.TreeExplainer(model)


In [55]:
df_train.to_csv("model/processed_dataset.csv", index=False)