# Exercice : Analyse des Retards des Trains SNCF (Données Mensuelles par Lignes) #
Objectif : Analyser les retards des trains à partir des données agrégées mensuellement par ligne ferroviaire et identifier les causes principales des perturbations.

In [2]:
import pandas as pd
import requests
import numpy as np
import datetime
import urllib.parse
import matplotlib.pyplot as plt

## Récupéation du dataframe via l'API ##

In [3]:
# Initialisation du dataframe 
df = pd.DataFrame()

# Paramètres de l'API
limit = 100
offset = 0
max_requests = 500

# Création de la boucle :
request_count = 0


while request_count < max_requests :
    url = f"https://ressources.data.sncf.com/api/explore/v2.1/catalog/datasets/regularite-mensuelle-tgv-aqst/records?order_by=date%20&limit={limit}&offset={offset}&refine=date%3A%222023%22"

    # Requête à l'API
    response = requests.get(url)
    data = response.json()

    # Vérification si des données sont retournées
    if "results" not in data or not data["results"]:
        break  # On sort de la boucle si plus de données disponibles

    # Création d'un DataFrame temporaire
    df_temp = pd.DataFrame(data["results"])

    df = pd.concat([df, df_temp], ignore_index=True)

    # Mise à jour de l'offset et compteur de requêtes
    offset += limit
    request_count += 1

df.columns


Index(['date', 'service', 'gare_depart', 'gare_arrivee', 'duree_moyenne',
       'nb_train_prevu', 'nb_annulation', 'commentaire_annulation',
       'nb_train_depart_retard', 'retard_moyen_depart',
       'retard_moyen_tous_trains_depart', 'commentaire_retards_depart',
       'nb_train_retard_arrivee', 'retard_moyen_arrivee',
       'retard_moyen_tous_trains_arrivee', 'commentaires_retard_arrivee',
       'nb_train_retard_sup_15', 'retard_moyen_trains_retard_sup15',
       'nb_train_retard_sup_30', 'nb_train_retard_sup_60',
       'prct_cause_externe', 'prct_cause_infra', 'prct_cause_gestion_trafic',
       'prct_cause_materiel_roulant', 'prct_cause_gestion_gare',
       'prct_cause_prise_en_charge_voyageurs'],
      dtype='object')

In [4]:
# Nettoyage du DataFrame

df.drop(columns= ["commentaire_annulation","commentaire_retards_depart", "commentaires_retard_arrivee"])

Unnamed: 0,date,service,gare_depart,gare_arrivee,duree_moyenne,nb_train_prevu,nb_annulation,nb_train_depart_retard,retard_moyen_depart,retard_moyen_tous_trains_depart,...,nb_train_retard_sup_15,retard_moyen_trains_retard_sup15,nb_train_retard_sup_30,nb_train_retard_sup_60,prct_cause_externe,prct_cause_infra,prct_cause_gestion_trafic,prct_cause_materiel_roulant,prct_cause_gestion_gare,prct_cause_prise_en_charge_voyageurs
0,2023-01,International,PARIS LYON,LAUSANNE,234,184,9,30,14.786667,2.198381,...,16,44.717708,6,4,12.500000,25.000000,12.500000,18.750000,18.750000,12.500000
1,2023-01,International,STUTTGART,PARIS EST,194,145,14,113,6.216372,4.859561,...,18,37.319444,9,2,11.111111,11.111111,55.555556,5.555556,16.666667,0.000000
2,2023-01,National,AIX EN PROVENCE TGV,PARIS LYON,189,433,16,276,6.640278,4.496755,...,40,51.522083,25,11,35.000000,20.000000,20.000000,10.000000,10.000000,5.000000
3,2023-01,National,ANGOULEME,PARIS MONTPARNASSE,125,334,20,162,8.697531,4.775474,...,32,54.828646,17,11,27.027027,5.405405,16.216216,13.513514,10.810811,27.027027
4,2023-01,National,AVIGNON TGV,PARIS LYON,166,433,17,188,6.515691,2.873638,...,38,42.589912,24,7,28.070175,15.789474,36.842105,7.017544,5.263158,7.017544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1441,2023-12,National,PARIS LYON,VALENCE ALIXAN TGV,130,425,0,38,13.823684,0.898196,...,29,31.864368,13,2,6.382979,21.276596,25.531915,19.148936,4.255319,23.404255
1442,2023-12,National,PARIS MONTPARNASSE,ANGOULEME,123,309,10,40,20.002500,2.316388,...,33,38.055051,16,6,28.888889,11.111111,22.222222,13.333333,24.444444,0.000000
1443,2023-12,National,ST PIERRE DES CORPS,PARIS MONTPARNASSE,62,523,17,209,8.559011,3.689229,...,38,31.957456,12,3,30.000000,30.000000,21.250000,12.500000,2.500000,3.750000
1444,2023-12,National,STRASBOURG,PARIS EST,114,676,6,346,11.083381,5.801849,...,89,30.883146,28,6,5.263158,0.751880,54.887218,19.548872,11.278195,8.270677


In [5]:
# Le nombre total de trains ayant subi un retard ne doit pas dépasser le nombre total de trains prévus (nb_train_prevu).
print(((df.drop(columns= ["commentaire_annulation","commentaire_retards_depart", "commentaires_retard_arrivee"])
.assign(Vérif = df["nb_annulation"] + df["nb_train_retard_sup_15"] + df["nb_train_retard_sup_30"] + df["nb_train_retard_sup_60"] - df["nb_train_prevu"])
)["Vérif"] > 0 ).value_counts())
print("\n On a que des FALSE donc c'est bon, on a pas plus de retards que de trains prévus ")

Vérif
False    1446
Name: count, dtype: int64

 On a que des FALSE donc c'est bon, on a pas plus de retards que de trains prévus 


### A. Analyse des Retards

**Distribution des retards :**

- Visualise la répartition des retards moyens au départ et à l’arrivée.
- Compare les retards des trains partis en retard (retard_moyen_depart) avec la moyenne de tous les trains (retard_moyen_tous_trains_depart).
- Analyse la proportion de trains ayant eu un retard supérieur à 15, 30 et 60 minutes.


In [55]:
(df.drop(columns=["commentaire_annulation", "commentaire_retards_depart", "commentaires_retard_arrivee",'prct_cause_externe', 'prct_cause_infra', 'prct_cause_gestion_trafic',
       'prct_cause_materiel_roulant', 'prct_cause_gestion_gare',
       'prct_cause_prise_en_charge_voyageurs'])
 .groupby(['gare_depart','gare_arrivee'])
 .agg({
    'duree_moyenne': 'mean',
    'nb_train_prevu': 'sum',
    'nb_annulation': 'sum',
    'nb_train_depart_retard': 'sum',
    'retard_moyen_depart': 'mean',
    'retard_moyen_tous_trains_depart': 'mean',
    'nb_train_retard_arrivee': 'sum',
    'retard_moyen_arrivee': 'mean',
    'retard_moyen_tous_trains_arrivee': 'mean',
    'nb_train_retard_sup_15': 'sum',
    'retard_moyen_trains_retard_sup15': 'mean',
    'nb_train_retard_sup_30': 'sum',
    'nb_train_retard_sup_60': 'sum'
}).reset_index()
.assign(prc_retard = (df["nb_annulation"] + df["nb_train_retard_sup_15"] + df["nb_train_retard_sup_30"] + df["nb_train_retard_sup_60"])*100 / df["nb_train_prevu"])
.sort_values("prc_retard", ascending=False)
)



Unnamed: 0,gare_depart,gare_arrivee,duree_moyenne,nb_train_prevu,nb_annulation,nb_train_depart_retard,retard_moyen_depart,retard_moyen_tous_trains_depart,nb_train_retard_arrivee,retard_moyen_arrivee,retard_moyen_tous_trains_arrivee,nb_train_retard_sup_15,retard_moyen_trains_retard_sup15,nb_train_retard_sup_30,nb_train_retard_sup_60,prc_retard
46,NANCY,PARIS EST,95.916667,3350,65,778,10.745979,2.365419,352,31.431220,3.784798,232,41.418378,119,37,52.666667
22,LAVAL,PARIS MONTPARNASSE,81.750000,3526,69,1152,12.308991,4.045386,629,33.679255,5.397516,309,54.028936,166,81,50.000000
30,LYON PART DIEU,MARSEILLE ST CHARLES,105.416667,5749,225,3088,12.399498,7.045982,1225,36.072237,9.440121,966,42.049938,433,174,39.000000
8,BESANCON FRANCHE COMTE TGV,PARIS LYON,133.833333,2070,71,550,9.178160,2.263000,190,44.991675,5.019611,137,55.355888,77,50,38.775510
44,MONTPELLIER,PARIS LYON,210.000000,2943,53,1173,10.711284,4.332756,486,45.090814,9.190937,486,45.090814,242,101,38.150289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,PARIS NORD,DOUAI,70.166667,1590,67,152,15.033107,1.196458,224,23.067456,4.346178,97,42.727750,46,14,10.344828
120,ZURICH,PARIS LYON,248.666667,1905,52,623,1.705202,1.038520,150,46.505452,5.501793,150,46.505452,73,37,10.204082
82,PARIS MONTPARNASSE,ANGERS SAINT LAUD,93.416667,6233,205,814,17.812692,2.117745,808,31.396417,4.759245,460,48.115244,237,91,9.134615
20,LA ROCHELLE VILLE,PARIS MONTPARNASSE,172.000000,2552,161,212,14.084499,1.306214,301,37.616127,5.568594,225,45.177854,97,40,7.216495
