In [2]:
import pandas as pd
from arcgis.features import GeoAccessor

# Read the processed accident table from disk.
accidents_path = "../data/processed/paris_accidents_engineerd_3.parquet"
df = pd.read_parquet(accidents_path)

# Build the spatially enabled frame, pointing ArcGIS at the SHAPE geometry column.
sdf = GeoAccessor.from_df(df, geometry_column="SHAPE")

# Preview the first rows.
sdf.head()

Unnamed: 0,id_accident,annee,age,sexe_victime,victime_type,categorie,gravite,milieu,longitude,latitude,SHAPE
0,676640,2018,-0.102618,Feminin,4 Roues,Passager,0,En-Agg,2.30158,48.8918,"{""rings"": [[[2.303774362, 48.894153779], [2.30..."
1,83423,2021,-1.201027,Masculin,2 Roues Motorisées,Conducteur,0,En-Agg,2.359433,48.819161,"{""rings"": [[[2.366087726, 48.844967843], [2.37..."
2,683716,2018,1.227035,Masculin,Piéton,Piéton,0,En-Agg,2.3704,48.843,"{""rings"": [[[2.467319402, 48.839099389], [2.46..."
3,684688,2018,-0.276051,Masculin,4 Roues,Conducteur,0,En-Agg,2.29774,48.8437,"{""rings"": [[[2.289407656, 48.828333842], [2.28..."
4,686001,2018,-0.391673,Masculin,2 Roues Motorisées,Conducteur,0,En-Agg,2.2769,48.8459,"{""rings"": [[[2.289407656, 48.828333842], [2.28..."


In [3]:
# Coerce the severity code to integers so the comparison below is numeric.
gravite_int = sdf['gravite'].astype(int)
sdf['gravite'] = gravite_int

# Binary target: 0 stays 0, every value above 0 (codes 1 and 2) becomes 1.
sdf['gravite_bin'] = gravite_int.gt(0).astype(int)

# Class counts of the new binary target.
sdf['gravite_bin'].value_counts()


gravite_bin
0    38040
1     3171
Name: count, dtype: int64

In [4]:
# Cast the severity code to strings (an earlier cell had converted it to int in
# order to derive gravite_bin).
# NOTE(review): presumably so it is treated as a label rather than a number
# downstream — confirm; no visible cell uses `gravite` as a model feature.
sdf['gravite'] = sdf['gravite'].astype(str)


In [5]:
# Inspect the column dtypes after the casts applied to `gravite`.
sdf.dtypes

id_accident       object
annee              Int64
age              float64
sexe_victime      object
victime_type      object
categorie         object
gravite           object
milieu            object
longitude        float64
latitude         float64
SHAPE           geometry
gravite_bin        int64
dtype: object

In [6]:
# Class counts of the binary target before any rebalancing is applied.
sdf['gravite_bin'].value_counts()

gravite_bin
0    38040
1     3171
Name: count, dtype: int64

In [7]:
# 1. Partition the rows by the binary severity target.
gravite_flag = sdf['gravite_bin']
non_severe = sdf[gravite_flag.eq(0)]   # majority class
severe = sdf[gravite_flag.eq(1)]       # minority class

# Show the size of each partition.
print(f"{len(non_severe)} {len(severe)}")


38040 3171


In [8]:
import pandas as pd

# Class sizes before balancing.
n_major = len(non_severe)
n_minor = len(severe)

# 2. Resample the severe rows with replacement until they match the majority count.
# NOTE(review): duplicating rows before any train/validation split means copies
# of the same accident can land on both sides of a later random split —
# confirm how the validation set is drawn.
severe_oversampled = severe.sample(n=n_major, replace=True, random_state=42)

# 3. Stack both classes, shuffle every row, and renumber the index from zero.
sdf_balanced = (
    pd.concat([non_severe, severe_oversampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Confirm the resulting class balance.
print(sdf_balanced['gravite_bin'].value_counts())


gravite_bin
1    38040
0    38040
Name: count, dtype: int64


In [None]:
from arcgis.learn import prepare_tabulardata

# Explanatory variables: bare names are continuous, (name, True) marks a
# categorical field. The original ordering of the list is kept.
categorical_fields = ['sexe_victime', 'victime_type', 'categorie', 'milieu']
X = [
    'annee',
    'age',
    *((field, True) for field in categorical_fields),
    'longitude',
    'latitude',
]

# Assemble the tabular data object from the balanced SEDF, with gravite_bin
# as the column to predict.
data = prepare_tabulardata(
    input_features=sdf_balanced,
    variable_predict='gravite_bin',
    explanatory_variables=X,
)



  import pkg_resources
  return torch._C._cuda_getDeviceCount() > 0
invalid escape sequence '\s'
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [None]:
from arcgis.learn import FullyConnectedNetwork

# Instantiate the model on the prepared tabular data object.
fcn = FullyConnectedNetwork(data)      # deep NN with categorical embeddings


In [15]:
# Train the network, requesting 30 epochs.
# NOTE(review): the recorded output lists only epochs 0-9, so it may be stale
# or the run may have stopped early — re-run to confirm.
fcn.fit(epochs=30)


epoch,train_loss,valid_loss,time
0,0.678801,0.678603,00:07
1,0.664245,0.670799,00:06
2,0.667105,0.659794,00:12
3,0.657754,0.658575,00:10
4,0.664581,0.655271,00:10
5,0.655007,0.649909,00:10
6,0.650477,0.648797,00:07
7,0.646563,0.642382,00:07
8,0.644499,0.637502,00:09
9,0.631981,0.634842,00:11


In [16]:
from fastai.basic_data import DatasetType
import numpy as np

# Collect the model outputs and the true labels (both tensors) for the
# validation set.
preds, targets = fcn.learn.get_preds(ds_type=DatasetType.Valid)

# Predicted class = index of the largest output along the class axis.
y_pred = preds.argmax(dim=1).numpy()

# True labels as a NumPy array for the scikit-learn metrics.
y_true = targets.numpy()


In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Compute the validation metrics first, then print them.
# NOTE(review): the recorded class supports are nearly equal, so the
# validation rows appear to come from the oversampled frame — scores may be
# optimistic if duplicated rows also sit in the training split; confirm.
accuracy = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred)
matrix = confusion_matrix(y_true, y_pred)

print("Accuracy:", accuracy)
print("\nClassification report:\n", report)
print("\nConfusion matrix:\n", matrix)


Accuracy: 0.7239747634069401

Classification report:
               precision    recall  f1-score   support

           0       0.80      0.60      0.68      3792
           1       0.68      0.85      0.76      3816

    accuracy                           0.72      7608
   macro avg       0.74      0.72      0.72      7608
weighted avg       0.74      0.72      0.72      7608


Confusion matrix:
 [[2261 1531]
 [ 569 3247]]


In [77]:
import os

# Make sure the output folder exists; nothing happens if it is already there.
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)

# Persist the trained network with metric computation switched off.
# NOTE(review): the original author flagged compute_metrics=False as
# important without saying why — confirm before changing it.
save_target = "../models/accident_severity_model"
saved_path = fcn.save(save_target, compute_metrics=False)

# Display the location returned by save().
saved_path


WindowsPath('C:/Git Projects/RoadRisk AI Predicting Traffic Accident Hotspots with GeoAI/RoadRisk-AI-Predicting-Traffic-Accident-Hotspots-with-GeoAI/models/accident_severity_model')

In [78]:
# Preview the originally loaded frame.
# NOTE(review): the recorded output already contains gravite_bin, which was
# assigned through `sdf` — `df` and `sdf` appear to share data; confirm.
df.head()

Unnamed: 0,id_accident,annee,age,sexe_victime,victime_type,categorie,gravite,milieu,longitude,latitude,SHAPE,gravite_bin
0,676640,2018,-0.102618,Feminin,4 Roues,Passager,0,En-Agg,2.30158,48.8918,"{""rings"": [[[2.303774362, 48.894153779], [2.30...",0
1,83423,2021,-1.201027,Masculin,2 Roues Motorisées,Conducteur,0,En-Agg,2.359433,48.819161,"{""rings"": [[[2.366087726, 48.844967843], [2.37...",0
2,683716,2018,1.227035,Masculin,Piéton,Piéton,0,En-Agg,2.3704,48.843,"{""rings"": [[[2.467319402, 48.839099389], [2.46...",0
3,684688,2018,-0.276051,Masculin,4 Roues,Conducteur,0,En-Agg,2.29774,48.8437,"{""rings"": [[[2.289407656, 48.828333842], [2.28...",0
4,686001,2018,-0.391673,Masculin,2 Roues Motorisées,Conducteur,0,En-Agg,2.2769,48.8459,"{""rings"": [[[2.289407656, 48.828333842], [2.28...",0


In [1]:
import pickle

print("Saving complete model package with encodings...")

# NOTE(review): this cell relies on `sdf_balanced` (the training dataframe) and
# `fcn` (the trained model) from the training cells. Run it in the same kernel
# session after training; on a fresh kernel it fails with NameError.

# Get categorical encodings from the TRAINING data.
# The feature columns are stored as plain object dtype, so each one is cast to
# `category` before its categories are read; using `.cat` directly on an object
# column raises AttributeError. The cast is a no-op for columns that are
# already categorical.
# NOTE(review): assumes the training pipeline orders categories the same way
# pandas does (sorted unique labels) — verify against the fitted encoder.
cat_encodings = {}
for col in ['sexe_victime', 'victime_type', 'categorie', 'milieu']:
    categories = sdf_balanced[col].astype('category').cat.categories
    # Indices start at 1 because fastai uses 0 for unknown.
    cat_encodings[col] = {
        cat: idx for idx, cat in enumerate(categories, start=1)
    }

# Create complete model package: weights, variable lists, and label encodings.
model_package = {
    'pytorch_state_dict': fcn.learn.model.state_dict(),
    'categorical_variables': fcn._data._categorical_variables,
    'continuous_variables': fcn._data._continuous_variables,
    'category_encodings': cat_encodings,
}

# Save with pickle (only load this file back from a trusted source).
with open('../models/accident_model_complete.pkl', 'wb') as f:
    pickle.dump(model_package, f)

print("✓ Saved complete model!")
print("\nEncodings:")
for col, enc in cat_encodings.items():
    print(f"  {col}: {enc}")

Saving complete model package with encodings...


NameError: name 'sdf_balanced' is not defined