In [2]:
import pandas as pd
from arcgis.features import GeoAccessor

# Load the version-3 engineered accidents table from the processed-data folder.
accidents_v3_path = "../data/processed/paris_accidents_engineerd_3.parquet"
df = pd.read_parquet(accidents_v3_path)

# Hand the frame to ArcGIS, naming SHAPE as the geometry column.
sdf = GeoAccessor.from_df(df, geometry_column="SHAPE")



In [3]:
# Cast the severity code to integers before any comparison is made on it.
severity_codes = sdf["gravite"].astype(int)
sdf["gravite"] = severity_codes

# Binary target: codes above zero (1 and 2) collapse to 1, zero stays 0.
sdf["gravite_bin"] = severity_codes.gt(0).astype(int)

# Show how many rows fall into each class of the binary target.
sdf["gravite_bin"].value_counts()

gravite_bin
0    38040
1     3171
Name: count, dtype: int64

In [4]:
# Preview the first five rows of the frame.
sdf.head(n=5)

Unnamed: 0,id_accident,annee,age,sexe_victime,victime_type,categorie,gravite,milieu,longitude,latitude,SHAPE,gravite_bin
0,676640,2018,-0.102618,Feminin,4 Roues,Passager,0,En-Agg,2.30158,48.8918,"{""rings"": [[[2.303774362, 48.894153779], [2.30...",0
1,83423,2021,-1.201027,Masculin,2 Roues Motorisées,Conducteur,0,En-Agg,2.359433,48.819161,"{""rings"": [[[2.366087726, 48.844967843], [2.37...",0
2,683716,2018,1.227035,Masculin,Piéton,Piéton,0,En-Agg,2.3704,48.843,"{""rings"": [[[2.467319402, 48.839099389], [2.46...",0
3,684688,2018,-0.276051,Masculin,4 Roues,Conducteur,0,En-Agg,2.29774,48.8437,"{""rings"": [[[2.289407656, 48.828333842], [2.28...",0
4,686001,2018,-0.391673,Masculin,2 Roues Motorisées,Conducteur,0,En-Agg,2.2769,48.8459,"{""rings"": [[[2.289407656, 48.828333842], [2.28...",0


In [5]:
# Cast the listed columns to pandas' category dtype and report, for each one,
# how many distinct categories it ends up with.
categorical_cols = ['sexe_victime', 'victime_type', 'categorie', 'milieu']

print("Converting categorical columns to category dtype...")
for col in categorical_cols:
    # A listed column that is absent from the frame is skipped, not an error.
    if col not in sdf.columns:
        continue
    sdf[col] = sdf[col].astype('category')
    # \N{CHECK MARK} keeps the source ASCII-only, so the symbol cannot be
    # corrupted by a file being decoded with the wrong character encoding.
    print(f"  \N{CHECK MARK} {col}: {len(sdf[col].cat.categories)} unique categories")

# Full dtype listing, to confirm the conversions took effect.
print("\nColumn dtypes after conversion:")
print(sdf.dtypes)

Converting categorical columns to category dtype...
  ✓ sexe_victime: 2 unique categories
  ✓ victime_type: 6 unique categories
  ✓ categorie: 3 unique categories
  ✓ milieu: 2 unique categories

Column dtypes after conversion:
id_accident       object
annee              Int64
age              float64
sexe_victime    category
victime_type    category
categorie       category
gravite            int64
milieu          category
longitude        float64
latitude         float64
SHAPE           geometry
gravite_bin        int64
dtype: object


In [6]:
# Write the preprocessed frame to disk as version 4, then print a short
# summary: the destination path, the row count and the target class balance.
output_path = '../data/processed/paris_accidents_engineerd_4.parquet'
sdf.to_parquet(output_path)

banner = "=" * 60
print(banner)
# \N{CHECK MARK} keeps the source ASCII-only, so the symbol cannot be
# corrupted by a file being decoded with the wrong character encoding.
print(f"\N{CHECK MARK} SAVED: {output_path}")
print(banner)
print(f"Total records: {len(sdf):,}")
print("\ngravite_bin distribution:")
# sort_index() orders the counts by class label (0, then 1) instead of by size.
print(sdf['gravite_bin'].value_counts().sort_index())

✓ SAVED: ../data/processed/paris_accidents_engineerd_4.parquet
Total records: 41,211

gravite_bin distribution:
gravite_bin
0    38040
1     3171
Name: count, dtype: int64


In [4]:

# Keep only the severe accidents (gravite_bin == 1).
severe_accidents = sdf[sdf['gravite_bin'] == 1]

# Share of those severe-accident rows per victim gender, as a percentage:
# normalize=True yields fractions that sum to 1, hence the * 100.
gender_proportions = severe_accidents['sexe_victime'].value_counts(normalize=True) * 100

# \N{BAR CHART} keeps the source ASCII-only, so the emoji cannot be corrupted
# by a file being decoded with the wrong character encoding.
print("\N{BAR CHART} Proportion of Severe Accidents by Gender:")
print("-" * 40)
print(gender_proportions)

📊 Proportion of Severe Accidents by Gender:
----------------------------------------
sexe_victime
Masculin    74.676758
Feminin     25.323242
Name: proportion, dtype: float64
