In [2]:
import pandas as pd
from arcgis.features import GeoAccessor

df = pd.read_parquet("../data/processed/paris_accidents_engineerd_3.parquet")

# Tell ArcGIS which column is the geometry
sdf = GeoAccessor.from_df(df, geometry_column="SHAPE")

sdf.head()

Unnamed: 0,id_accident,annee,age,sexe_victime,victime_type,categorie,gravite,milieu,longitude,latitude,SHAPE
0,676640,2018,-0.102618,Feminin,4 Roues,Passager,0,En-Agg,2.30158,48.8918,"{""rings"": [[[2.303774362, 48.894153779], [2.30..."
1,83423,2021,-1.201027,Masculin,2 Roues Motorisées,Conducteur,0,En-Agg,2.359433,48.819161,"{""rings"": [[[2.366087726, 48.844967843], [2.37..."
2,683716,2018,1.227035,Masculin,Piéton,Piéton,0,En-Agg,2.3704,48.843,"{""rings"": [[[2.467319402, 48.839099389], [2.46..."
3,684688,2018,-0.276051,Masculin,4 Roues,Conducteur,0,En-Agg,2.29774,48.8437,"{""rings"": [[[2.289407656, 48.828333842], [2.28..."
4,686001,2018,-0.391673,Masculin,2 Roues Motorisées,Conducteur,0,En-Agg,2.2769,48.8459,"{""rings"": [[[2.289407656, 48.828333842], [2.28..."


In [3]:
# make sure gravite is numeric first
sdf['gravite'] = sdf['gravite'].astype(int)

# 0 stays 0; 1 and 2 become 1
sdf['gravite_bin'] = (sdf['gravite'] > 0).astype(int)

sdf['gravite_bin'].value_counts()


gravite_bin
0    38040
1     3171
Name: count, dtype: int64

In [4]:
sdf['gravite'] = sdf['gravite'].astype(str)


In [5]:
sdf.dtypes

id_accident       object
annee              Int64
age              float64
sexe_victime      object
victime_type      object
categorie         object
gravite           object
milieu            object
longitude        float64
latitude         float64
SHAPE           geometry
gravite_bin        int64
dtype: object

In [6]:
sdf['gravite_bin'].value_counts()

gravite_bin
0    38040
1     3171
Name: count, dtype: int64

In [7]:
# 1. Split majority / minority
non_severe = sdf[sdf['gravite_bin'] == 0]
severe     = sdf[sdf['gravite_bin'] == 1]

print(len(non_severe), len(severe))  # just to see the counts


38040 3171


In [8]:
import pandas as pd

n_major = len(non_severe)
n_minor = len(severe)

# 2. Oversample severe accidents up to the same size as non_severe
severe_oversampled = severe.sample(
    n=n_major,
    replace=True,
    random_state=42
)

# 3. Concatenate and shuffle
sdf_balanced = pd.concat([non_severe, severe_oversampled]) \
                 .sample(frac=1, random_state=42) \
                 .reset_index(drop=True)

# Check new class balance
print(sdf_balanced['gravite_bin'].value_counts())


gravite_bin
1    38040
0    38040
Name: count, dtype: int64


In [9]:
from arcgis.learn import prepare_tabulardata

# continuous + categorical
X = [
    'annee',
    'age',
    ('sexe_victime', True),   # categorical
    ('victime_type', True),   # categorical
    ('categorie', True),      # categorical
    ('milieu', True),         # categorical
    'longitude',
    'latitude'
]

data = prepare_tabulardata(
    input_features=sdf_balanced,          # your SEDF (Spatially Enabled DataFrame)
    variable_predict='gravite_bin',  # target column
    explanatory_variables=X,
)



  import pkg_resources
  return torch._C._cuda_getDeviceCount() > 0
invalid escape sequence '\s'
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [11]:
from arcgis.learn import FullyConnectedNetwork

fcn = FullyConnectedNetwork(data)      # deep NN with categorical embeddings


In [12]:
fcn.fit(epochs=30)


epoch,train_loss,valid_loss,time
0,0.686464,0.687477,00:06
1,0.681704,0.682588,00:10
2,0.66421,0.667895,00:10
3,0.664288,0.662935,00:09
4,0.657291,0.662967,00:06
5,0.658839,0.659736,00:07
6,0.65536,0.659519,00:10
7,0.655136,0.656827,00:10
8,0.653912,0.65399,00:08
9,0.654422,0.651665,00:06


In [13]:
from fastai.basic_data import DatasetType
import numpy as np

# preds: probabilities / logits
# y_true: true labels as tensor
preds, y_true = fcn.learn.get_preds(ds_type=DatasetType.Valid)

y_true = y_true.numpy()
y_pred = preds.argmax(dim=1).numpy()   # predicted class index


In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification report:\n", classification_report(y_true, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_true, y_pred))


Accuracy: 0.6585173501577287

Classification report:
               precision    recall  f1-score   support

           0       0.69      0.57      0.62      3792
           1       0.64      0.75      0.69      3816

    accuracy                           0.66      7608
   macro avg       0.66      0.66      0.66      7608
weighted avg       0.66      0.66      0.66      7608


Confusion matrix:
 [[2148 1644]
 [ 954 2862]]


In [77]:
import os

os.makedirs("../models", exist_ok=True)

# use your trained model object name (fcn_bal if that's the one)
fcn.save(
    "../models/accident_severity_model",
    compute_metrics=False   # <- important
)


WindowsPath('C:/Git Projects/RoadRisk AI Predicting Traffic Accident Hotspots with GeoAI/RoadRisk-AI-Predicting-Traffic-Accident-Hotspots-with-GeoAI/models/accident_severity_model')

In [78]:
df.head()

Unnamed: 0,id_accident,annee,age,sexe_victime,victime_type,categorie,gravite,milieu,longitude,latitude,SHAPE,gravite_bin
0,676640,2018,-0.102618,Feminin,4 Roues,Passager,0,En-Agg,2.30158,48.8918,"{""rings"": [[[2.303774362, 48.894153779], [2.30...",0
1,83423,2021,-1.201027,Masculin,2 Roues Motorisées,Conducteur,0,En-Agg,2.359433,48.819161,"{""rings"": [[[2.366087726, 48.844967843], [2.37...",0
2,683716,2018,1.227035,Masculin,Piéton,Piéton,0,En-Agg,2.3704,48.843,"{""rings"": [[[2.467319402, 48.839099389], [2.46...",0
3,684688,2018,-0.276051,Masculin,4 Roues,Conducteur,0,En-Agg,2.29774,48.8437,"{""rings"": [[[2.289407656, 48.828333842], [2.28...",0
4,686001,2018,-0.391673,Masculin,2 Roues Motorisées,Conducteur,0,En-Agg,2.2769,48.8459,"{""rings"": [[[2.289407656, 48.828333842], [2.28...",0


In [20]:
# Check if xgboost is installed
try:
    import xgboost as xgb
    print(f"✓ XGBoost version: {xgb.__version__}")
except ImportError:
    print("Installing XGBoost...")
    !pip install xgboost
    import xgboost as xgb

✓ XGBoost version: 3.0.1


In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Use your balanced dataset
X = sdf_balanced[['annee', 'age', 'sexe_victime', 'victime_type', 
                  'categorie', 'milieu', 'longitude', 'latitude']].copy()
y = sdf_balanced['gravite_bin'].copy()

# Convert categorical columns to 'category' dtype for XGBoost
cat_columns = ['sexe_victime', 'victime_type', 'categorie', 'milieu']
for col in cat_columns:
    X[col] = X[col].astype('category')

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"\nClass distribution in training:")
print(y_train.value_counts())

Training set: 60864 samples
Test set: 15216 samples

Class distribution in training:
gravite_bin
0    30432
1    30432
Name: count, dtype: int64


In [22]:
import xgboost as xgb

# Create XGBoost classifier
model = xgb.XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=200,
    objective='binary:logistic',
    eval_metric='logloss',
    enable_categorical=True,  # Handle categorical features natively
    random_state=42
)

# Train the model
print("Training XGBoost model...")
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=10  # Print every 10 iterations
)

print("\n✓ Training complete!")

Training XGBoost model...
[0]	validation_0-logloss:0.68614
[10]	validation_0-logloss:0.65027
[20]	validation_0-logloss:0.63202
[30]	validation_0-logloss:0.61817
[40]	validation_0-logloss:0.60822
[50]	validation_0-logloss:0.59915
[60]	validation_0-logloss:0.59200
[70]	validation_0-logloss:0.58505
[80]	validation_0-logloss:0.57772
[90]	validation_0-logloss:0.57034
[100]	validation_0-logloss:0.56441
[110]	validation_0-logloss:0.56102
[120]	validation_0-logloss:0.55648
[130]	validation_0-logloss:0.55070
[140]	validation_0-logloss:0.54425
[150]	validation_0-logloss:0.53785
[160]	validation_0-logloss:0.53178
[170]	validation_0-logloss:0.52672
[180]	validation_0-logloss:0.52208
[190]	validation_0-logloss:0.51688
[199]	validation_0-logloss:0.51293

✓ Training complete!


In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Make predictions
y_pred = model.predict(X_test)

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, 
                          target_names=['Minor', 'Serious/Fatal']))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7698

Classification Report:
               precision    recall  f1-score   support

        Minor       0.80      0.72      0.76      7608
Serious/Fatal       0.74      0.82      0.78      7608

     accuracy                           0.77     15216
    macro avg       0.77      0.77      0.77     15216
 weighted avg       0.77      0.77      0.77     15216


Confusion Matrix:
[[5457 2151]
 [1351 6257]]


In [24]:
import pickle

# Create complete model package with ALL metadata
model_package = {
    'model': model,  # The trained XGBoost model
    'feature_names': X_train.columns.tolist(),
    'categorical_columns': cat_columns,
    'continuous_columns': ['annee', 'age', 'longitude', 'latitude'],
    'categorical_mappings': {
        col: X_train[col].cat.categories.tolist()
        for col in cat_columns
    },
    'model_type': 'XGBoost',
    'accuracy': accuracy
}

# Save with pickle
with open('../models/xgboost_accident_model.pkl', 'wb') as f:
    pickle.dump(model_package, f)

print("✓ Saved complete XGBoost model package!")
print(f"  Location: models/xgboost_accident_model.pkl")
print(f"  Test Accuracy: {accuracy:.2%}")

✓ Saved complete XGBoost model package!
  Location: models/xgboost_accident_model.pkl
  Test Accuracy: 76.98%
