In [1]:
import pandas as pd

In [2]:
carac = pd.read_csv("caract-2023.csv", sep=";", encoding="latin1")
usagers = pd.read_csv("usagers-2023.csv", sep=";", encoding="latin1")
vehicules = pd.read_csv("vehicules-2023.csv", sep=";", encoding="latin1")
carac.head()

Unnamed: 0,Num_Acc,jour,mois,an,hrmn,lum,dep,com,agg,int,atm,col,adr,lat,long
0,202300000001,7,5,2023,06:00,1,75,75101,2,4,2,7,RUE DE RIVOLI,4886638600,232347100
1,202300000002,7,5,2023,05:30,5,94,94080,2,1,3,6,Avenue de Paris,4884547782,242868146
2,202300000003,7,5,2023,20:50,1,94,94022,2,3,2,1,Avenue du GÃ©nÃ©ral Leclerc,4876240000,240655000
3,202300000004,6,5,2023,23:57,5,94,94078,2,1,3,5,Rue de Paris,4873248432,244687575
4,202300000005,7,5,2023,00:50,5,94,94068,2,2,3,3,56bis Avenue Raspail,4878581000,249217000


In [3]:
carac["date"] = pd.to_datetime(
    dict(year=carac["an"], month=carac["mois"], day=carac["jour"]),
    errors="coerce"
)

In [4]:
if "hrmn" in carac.columns:
        carac["heure"] = pd.to_datetime(
        carac["hrmn"].astype(str).str.zfill(4), format="%H%M", errors="coerce"
    ).dt.time
else:
    carac["heure"] = None


In [5]:
carac = carac[carac["date"].notna()]


In [6]:
conditions_dict = {
    1: "Temps normal", 2: "Pluie légère", 3: "Pluie forte", 4: "Neige",
    5: "Brouillard", 6: "Vent fort", 7: "Éblouissement", 8: "Temps couvert", 9: "Inconnu"
}


In [7]:
carac["conditions_meteo"] = carac["atm"].map(conditions_dict)

In [8]:
if "grav" in carac.columns:
    carac = carac[carac["grav"] >= 1]


In [9]:
merged = pd.merge(carac, usagers, on="Num_Acc", how="left")


In [10]:
final = pd.merge(merged, vehicules, on=["Num_Acc", "num_veh"], how="left")


In [11]:
print(carac[["an", "mois", "jour", "date"]].head())


     an  mois  jour       date
0  2023     5     7 2023-05-07
1  2023     5     7 2023-05-07
2  2023     5     7 2023-05-07
3  2023     5     6 2023-05-06
4  2023     5     7 2023-05-07


In [12]:
print("Dates manquantes :", carac["date"].isna().sum())


Dates manquantes : 0


In [13]:

print(carac["hrmn"].head())


0    06:00
1    05:30
2    20:50
3    23:57
4    00:50
Name: hrmn, dtype: object


In [14]:

print(carac[["atm", "conditions_meteo"]].drop_duplicates())




      atm conditions_meteo
0       2     Pluie légère
1       3      Pluie forte
8       8    Temps couvert
13      1     Temps normal
36      5       Brouillard
40      7    Éblouissement
457     9          Inconnu
1018    6        Vent fort
1548    4            Neige
6434   -1              NaN


In [15]:
print(final.head())
print(final.shape)


        Num_Acc  jour  mois    an   hrmn  lum dep    com  agg  int  ...  \
0  202300000001     7     5  2023  06:00    1  75  75101    2    4  ...   
1  202300000002     7     5  2023  05:30    5  94  94080    2    1  ...   
2  202300000002     7     5  2023  05:30    5  94  94080    2    1  ...   
3  202300000003     7     5  2023  20:50    1  94  94022    2    3  ...   
4  202300000003     7     5  2023  20:50    1  94  94022    2    3  ...   

   etatp  id_vehicule_y senc catv obs obsm choc manv motor occutc  
0     -1  155Â 680Â 557    1   30   0    0    5    1     1    NaN  
1     -1  155Â 680Â 556    2    7   0    1    1    1     1    NaN  
2      1  155Â 680Â 556    2    7   0    1    1    1     1    NaN  
3     -1  155Â 680Â 554    1    2   0    2    1   16     1    NaN  
4     -1  155Â 680Â 555    2    7   0    2    2   15     1    NaN  

[5 rows x 42 columns]
(125789, 42)


In [24]:
print("Nombre total d'accident different cette annee est de :",final["Num_Acc"].nunique())


Nombre total d'accident different cette annee est de : 54822


In [26]:
final["mois"] = final["date"].dt.month
final.groupby("mois")["Num_Acc"].nunique()


mois
1     4053
2     3682
3     3998
4     4162
5     4767
6     5452
7     4754
8     4121
9     5161
10    5389
11    4833
12    4450
Name: Num_Acc, dtype: int64

In [28]:
final.groupby("dep")["Num_Acc"].nunique().sort_values(ascending=False)


dep
75     4763
93     2650
92     2563
94     2204
13     2057
       ... 
90       37
978      30
986      10
977       7
975       2
Name: Num_Acc, Length: 107, dtype: int64

In [31]:
final["atm"].value_counts()
final["atm"].value_counts()


 1    98463
 2    14974
 8     5017
 3     3437
 7     2180
 5      509
 9      475
 6      432
 4      295
-1        7
Name: atm, dtype: int64

In [32]:
final["lum"].value_counts()


 1    82988
 5    19367
 3    13826
 2     8169
 4     1436
-1        3
Name: lum, dtype: int64

In [34]:
final["agg"].value_counts()


2    76061
1    49728
Name: agg, dtype: int64