In [2]:
!pip install pandas
!pip install numpy




[notice] A new release of pip is available: 24.0 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np

def preprocess_for_xgboost(acc_path, cas_path, veh_path):
    # 1. Load data and remove the trailing empty rows found in these files
    acc = pd.read_csv('C:\\Users\\VEDANTI\\OneDrive\\Desktop\\projects\\techathon\\archive (2)\\AccidentsBig.csv').dropna(subset=['Accident_Index'])
    cas = pd.read_csv('C:\\Users\\VEDANTI\\OneDrive\\Desktop\\projects\\techathon\\archive (2)\\CasualtiesBig.csv').dropna(subset=['Accident_Index'])
    veh = pd.read_csv('C:\\Users\\VEDANTI\\OneDrive\\Desktop\\projects\\techathon\\archive (2)\VehiclesBig.csv').dropna(subset=['Accident_Index'])

    # 2. Aggregate Vehicle Data (Many Vehicles -> One Accident)
    # We summarize vehicle info so we don't duplicate accident rows during merge
    veh_agg = veh.groupby('Accident_Index').agg({
        'Vehicle_Type': lambda x: x.mode()[0] if not x.mode().empty else np.nan,
        'Age_of_Driver': 'mean',
        'Engine_Capacity_(CC)': 'mean',
        'Age_of_Vehicle': 'mean',
        'Sex_of_Driver': lambda x: (x == 1).sum() # Count of male drivers as a feature
    }).reset_index()
    
    veh_agg.rename(columns={
        'Vehicle_Type': 'Primary_Vehicle_Type',
        'Age_of_Driver': 'Avg_Driver_Age',
        'Engine_Capacity_(CC)': 'Avg_Engine_CC',
        'Age_of_Vehicle': 'Avg_Vehicle_Age',
        'Sex_of_Driver': 'Male_Driver_Count'
    }, inplace=True)

    # 3. Aggregate Casualty Data (Many Casualties -> One Accident)
    cas_agg = cas.groupby('Accident_Index').agg({
        'Age_of_Casualty': 'mean',
        'Sex_of_Casualty': 'mean'
    }).reset_index()
    cas_agg.rename(columns={'Age_of_Casualty': 'Avg_Casualty_Age'}, inplace=True)

    # 4. Merge all into the main Accidents table
    df = acc.merge(veh_agg, on='Accident_Index', how='left')
    df = df.merge(cas_agg, on='Accident_Index', how='left')

    # 5. Handle Missing Values: Convert '-1' to NaN
    # XGBoost handles NaNs natively by learning the best default branch direction
    df.replace(-1, np.nan, inplace=True)

    # 6. Feature Engineering: Time & Date
    df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y', errors='coerce')
    df['Month'] = df['Date'].dt.month
    df['Hour'] = pd.to_datetime(df['Time'], format='%H:%M', errors='coerce').dt.hour
    
    # 7. Drop non-numeric / ID / Leaky columns
    # We drop 'Casualty_Severity' (if present) because it leaks the answer for Accident_Severity
    cols_to_drop = ['Accident_Index', 'LSOA_of_Accident_Location', 
                    'Local_Authority_(Highway)', 'Time', 'Date']
    df_final = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

    # 8. Target Alignment
    # XGBoost multiclass requires labels to start at 0 (Original is 1, 2, 3)
    if 'Accident_Severity' in df_final.columns:
        df_final['Accident_Severity'] = df_final['Accident_Severity'] - 1

    return df_final

# Execute processing
df_train = preprocess_for_xgboost('AccidentsBig.csv', 'CasualtiesBig.csv', 'VehiclesBig.csv')

# Note: You can now pass df_train directly to xgb.DMatrix
print(f"Dataset ready for XGBoost. Shape: {df_train.shape}")


  acc = pd.read_csv('C:\\Users\\VEDANTI\\OneDrive\\Desktop\\projects\\techathon\\archive (2)\\AccidentsBig.csv').dropna(subset=['Accident_Index'])


Dataset ready for XGBoost. Shape: (59998, 34)


In [4]:
df_train.head()

Unnamed: 0,longitude,latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Day_of_Week,Local_Authority_(District),1st_Road_Class,1st_Road_Number,...,Did_Police_Officer_Attend_Scene_of_Accident,Primary_Vehicle_Type,Avg_Driver_Age,Avg_Engine_CC,Avg_Vehicle_Age,Male_Driver_Count,Avg_Casualty_Age,Sex_of_Casualty,Month,Hour
0,78.610393,14.724026,1.0,1.0,1.0,1.0,3.0,12.0,3.0,3218.0,...,1.0,9.0,74.0,,,0,37.0,1.0,1,17.0
1,78.534042,14.762353,1.0,2.0,1.0,1.0,4.0,12.0,4.0,450.0,...,1.0,11.0,42.0,8268.0,3.0,1,37.0,1.0,1,17.0
2,78.470877,14.745606,1.0,2.0,2.0,1.0,5.0,12.0,5.0,0.0,...,1.0,11.0,35.0,8300.0,5.0,1,62.0,1.0,1,0.0
3,78.557994,14.667128,1.0,2.0,1.0,1.0,6.0,12.0,3.0,3220.0,...,1.0,9.0,62.0,1762.0,6.0,1,30.0,1.0,1,10.0
4,78.576431,14.703443,1.0,2.0,1.0,1.0,2.0,12.0,6.0,0.0,...,1.0,9.0,49.0,1769.0,4.0,0,49.0,1.0,1,21.0


In [5]:
!pip install xgboost




[notice] A new release of pip is available: 24.0 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


In [7]:
# Target column
y = df_train['Accident_Severity']

# Features
X = df_train.drop(columns=['Accident_Severity'])


In [8]:
X = pd.get_dummies(X, drop_first=True)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [10]:
model = XGBClassifier(
    objective='multi:softprob',
    num_class=3,
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='mlogloss',
    tree_method='hist',
    random_state=42
)


In [11]:
model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=True
)


[0]	validation_0-mlogloss:0.72184
[1]	validation_0-mlogloss:0.70213
[2]	validation_0-mlogloss:0.68371
[3]	validation_0-mlogloss:0.66676
[4]	validation_0-mlogloss:0.65075
[5]	validation_0-mlogloss:0.63595
[6]	validation_0-mlogloss:0.62182
[7]	validation_0-mlogloss:0.60862
[8]	validation_0-mlogloss:0.59655
[9]	validation_0-mlogloss:0.58513
[10]	validation_0-mlogloss:0.57428
[11]	validation_0-mlogloss:0.56408
[12]	validation_0-mlogloss:0.55448
[13]	validation_0-mlogloss:0.54561
[14]	validation_0-mlogloss:0.53734
[15]	validation_0-mlogloss:0.52960
[16]	validation_0-mlogloss:0.52226
[17]	validation_0-mlogloss:0.51539
[18]	validation_0-mlogloss:0.50884
[19]	validation_0-mlogloss:0.50264
[20]	validation_0-mlogloss:0.49682
[21]	validation_0-mlogloss:0.49141
[22]	validation_0-mlogloss:0.48633
[23]	validation_0-mlogloss:0.48150
[24]	validation_0-mlogloss:0.47698
[25]	validation_0-mlogloss:0.47287
[26]	validation_0-mlogloss:0.46880
[27]	validation_0-mlogloss:0.46504
[28]	validation_0-mlogloss:0.4

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'multi:softprob'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.8
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [12]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.8665833333333334

Classification Report:

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       129
         1.0       0.46      0.02      0.04      1476
         2.0       0.87      1.00      0.93     10395

    accuracy                           0.87     12000
   macro avg       0.44      0.34      0.32     12000
weighted avg       0.81      0.87      0.81     12000



In [13]:
print(confusion_matrix(y_test, y_pred))


[[    0     9   120]
 [    1    31  1444]
 [    0    27 10368]]


In [14]:
joblib.dump(model, "accident_risk_xgb_model.pkl")
joblib.dump(X.columns, "model_features.pkl")

['model_features.pkl']

In [15]:
!pip install "numpy<2.4"




[notice] A new release of pip is available: 24.0 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
import shap
import pandas as pd
import numpy as np

explainer = shap.TreeExplainer(model)

sample = X_test.iloc[[0]]

# SHAP values
shap_values = explainer.shap_values(sample)

# Convert to numpy
shap_array = np.array(shap_values)

# Get predicted class
pred_class = model.predict(sample)[0]

# Select SHAP values for predicted class
shap_values_1d = np.abs(shap_array[0, :, pred_class])

print(len(X_test.columns), len(shap_values_1d))

# Create dataframe
shap_df = pd.DataFrame({
    "feature": X_test.columns,
    "impact": shap_values_1d
})

EXCLUDED_FEATURES=["lattitude", 'longitude']

top_factors = [
    f for f in shap_df.sort_values(
        by="impact",
        ascending=False
    )["feature"].tolist()
    if f not in EXCLUDED_FEATURES
][:5]

print(top_factors)


33 33
['Number_of_Vehicles', 'Junction_Detail', 'Number_of_Casualties', 'Local_Authority_(District)', 'Did_Police_Officer_Attend_Scene_of_Accident']


In [17]:
print("Columns:", len(X_test.columns))
print("SHAP shape:", np.array(shap_values).shape)


Columns: 33
SHAP shape: (1, 33, 3)


In [22]:

explainer = shap.TreeExplainer(model)


In [19]:
df_train.to_csv("model/processed_dataset.csv", index=False)