In [1]:
import json
import os
import pathlib
import boto3
import pandas as pd
import pendulum
import requests
import xmlschema
import re
import logging

from pathlib import Path
from airflow import AirflowException
from botocore.exceptions import ClientError
from airflow.decorators import dag, task
from airflow.models import Variable
from kafka import KafkaProducer
from requests.exceptions import HTTPError


In [17]:
class Train:
    """
    Cette classe définit un train conformément au schéma de l'API Transilien.
    Elle est générique pour tout les trains de ce réseau.
    La ligne A et B étant en double exploitation SNCF/RATP, elle hérite de cette classe
    """

    def __init__(self, num: str, miss: str, date: str, mode: str, term: str, time_recorded: pendulum.DateTime,
                 etat: str = ""):
        self.num = num
        self.miss = miss
        self.date = date.replace('/', '-')
        self.mode = mode
        self.direction: str = ""
        self.term = int(term)
        self.time_recorded: pendulum.DateTime = time_recorded
        self.etat = etat

    def get_num(self):
        """
        Retourne le numéro de la mission du train
        """
        return self.num

    def get_date(self):
        """
        Retourne la date et l'heure du train de départ
        """
        return self.date

    def get_mode(self):
        """
        Retourne le mode de l'horraire de départ.
        R pour Réel
        T pour théorique
        """
        return self.mode

    def get_term(self):
        """
        Retourne l'identifiant du terminus du train
        """
        return self.term

    def get_miss(self):
        """
        Retourne la mission du train
        """
        return self.miss

    def get_etat(self):
        """
        Retourne l'état du train, s'il y existe
        """
        return self.etat

    def get_time_recorded(self):
        return self.time_recorded

    def get_train(self):
        """
        Retourne les caractéristiques d'un train
        """
        return self.time_recorded.to_datetime_string(), self.num, self.miss, self.date, self.mode, self.term, self.direction, self.etat

    def to_dict(self):
        """
        Convertie la class train en un dictionnaire
        """
        a = ["time_recorded", self.time_recorded.to_datetime_string(),
             "num", self.num,
             "miss", self.miss,
             "date", self.date,
             "mode", self.mode,
             "term", self.term,
             "direction", self.direction,
             "etat", self.etat]
        it = iter(a)
        return dict(zip(it, it))


class RERB(Train):
    """
    Définit un train de la Ligne B et par extension, la ligne A en raison de leur numéro de mission de type ABCD01
    """

    def __init__(self, num: str, miss: str, date: str, mode: str, term: str, time_recorded: pendulum.DateTime,
                 etat: str = ""):
        Train.__init__(self, num, miss, date, mode, term, time_recorded, etat)
        self.direction = int(int(self.get_num()[-2]) % 2)


class Station:
    def __init__(self, xml_df: dict, date: pendulum.DateTime):
        assert ("@gare" in xml_df and "train" in xml_df)
        size = len(xml_df['train'])
        d: list[RERB] = []
        _pattern = r'[A-Z]{4}[0-9]{2}'
        if size != 0:
            for i in xml_df['train']:
                if re.match(_pattern, i["num"]):
                    if "etat" in i:
                        t = RERB(i["num"], i["miss"], i["date"]["$"], i["date"]["@mode"], i["term"], date,
                                 i["etat"])
                    else:
                        t = RERB(i["num"], i["miss"], i["date"]["$"], i["date"]["@mode"], i["term"], date)
                    d.append(t)

        self.gare = int(xml_df["@gare"])
        self.train = d

    def get_station(self):
        """
        Retourne l'identifiant de la gare
        """
        return self.gare

    def get_train(self):
        """
        Retourne la liste des trains de la stations
        """
        return self.train

In [18]:
def grab_data(gares: list[str]) -> list[str]:
    """
    Grab the SNCF Transilien Opendata next departures of list 'gares' and stores into a lists of xml responses
    """
    payload = {}
    headers = {
        'Authorization': 'Basic ' + 'dG5odG4xMjY0Olc1UXpiM2Q4'
    }
    url = "https://api.transilien.com/gare/"
    response = []
    for u in gares:
        try:
            feed = requests.request("GET", url + u + '/depart/', headers=headers, data=payload)
            feed.raise_for_status()
        except HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')  # Python 3.6
            exit(0)
        except Exception as err:
            print(f'Other error occurred: {err}')  # Python 3.6
            exit(0)
        else:
            response.append(feed.content)
    return response

def xml_to_df(list_xml: list[str], xml_schema: str) -> list[dict]:
    """
    Méthode qui va prendre la liste des XML récuéré auprès de Transilien SNCF, renvoie une liste de dictionnaire.
    Les XML sont validés par un schéma, puis converties en dictionnaire.
    """
    df = []
    schema = xmlschema.XMLSchema(xml_schema)
    for l in list_xml:
        df.append(xmlschema.to_dict(l, schema=schema, preserve_root=False))
    return df

def df_to_station(df_list: list[dict], date: pendulum.DateTime) -> list[Station]:
    """
    Méthode qui va prendre une liste des dictionnaires pour le convertir en une liste Station.
    Chaque Station va comporter une liste de train. Cette liste de train représente la liste des
    prochains départs
    """
    stations = []
    for l in df_list:
        stations.append(Station(l, date))
    return stations

def stations_to_json(stations_list: list[Station]) -> list[list[dict]]:
    """
    Méthode qui va convertir chaque stations en liste de liste dictionnaire JSON
    """
    json_lists = []
    for s in stations_list:
        train_list = []
        for t in s.get_train():
            train_list.append(t.to_dict())
        json_lists.append(train_list)
    return json_lists

In [26]:
def save_df_upload_s3(data: list[list[dict]], gares: list[str], date: pendulum.DateTime):
    """
    Méthode qui va sauvegarder les données sur un CSV en local, en préparation de upload vers S3
    """
    s3 = boto3.client("s3")
    for i in range(len(data)):
        output_dir = Path('data/processed/'
                          + str(date.month)
                          + '/'
                          + str(date.day))
        output_file = 'data-reel-' \
                      + gares[i] \
                      + '.csv'
        path = Path(output_dir / output_file)
        s3_path = 'data/processed/' \
                  + str(date.month) + '/' \
                  + str(date.day)
        df = pd.DataFrame.from_records(jsons[i])
        if path.is_file():
            df.to_csv(path, index=False, mode='a', header=False)
        else:
            output_dir.mkdir(parents=True, exist_ok=True)
            df.to_csv(path, index=False, mode='a', header=True)
        try:
            s3.upload_file(str(path), 'sncf-rer-b', s3_path + '/' + output_file)
        except ClientError as e:
            print(logging.error(e))
            exit(-1)

In [5]:
def save_to_s3(gares: list[str], date: pendulum.DateTime):
    """
    Méthode qui va sauvegarder les données sur un bucket S3
    """

    s3 = boto3.client("s3")

    for i in range(len(gares)):
        # Bucket S3

        output_dir = Path(
            'data/processed/' + str(date.month) + '/' + str(date.day))
        output_file = 'data-reel-' + gares[i] + '.csv'
        path = Path(output_dir / output_file)
        s3_path = 'data/processed/' \
                  + str(date.month) + '/' \
                  + str(date.day) + '/'
        if path.is_file():
            try:
                s3.upload_file(str(path), 'sncf-rer-b', s3_path + '/' + output_file)
            except ClientError as e:
                print(logging.error(e))
                exit(-1)

    return True

In [7]:
xsd = """<?xml version="1.0" encoding="UTF-8"?>
        <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
            <xsd:element name="passages">
                <xsd:complexType mixed="true">
                    <xsd:sequence>
                        <xsd:element name="train" type="trainType" minOccurs="0" maxOccurs="unbounded"/>
                    </xsd:sequence>
                    <xsd:attribute name="gare" type="xsd:string" use="required"/>
                </xsd:complexType>
            </xsd:element>
            <xsd:complexType name="trainType">
                <xsd:all>
                    <xsd:element name="date">
                        <xsd:complexType>
                            <xsd:simpleContent>
                                <xsd:extension base="xsd:string">
                                    <xsd:attribute name="mode" use="required">
                                        <xsd:simpleType>
                                            <xsd:restriction base="xsd:string">
                                                <xsd:enumeration value="R"/>
                                                <xsd:enumeration value="T"/>
                                            </xsd:restriction>
                                        </xsd:simpleType>
                                    </xsd:attribute>
                                </xsd:extension>
                            </xsd:simpleContent>
                        </xsd:complexType>
                    </xsd:element>
                    <xsd:element name="num" maxOccurs="1" type="xsd:string"/>
                    <xsd:element name="miss" maxOccurs="1" type="xsd:string"/>
                    <xsd:element name="term" maxOccurs="1" minOccurs="0" type="xsd:string"/>
                    <xsd:element name="etat" maxOccurs="1" minOccurs="0">
                        <xsd:simpleType>
                            <xsd:restriction base="xsd:string">
                                <xsd:enumeration value="Retardé"/>
                                <xsd:enumeration value="Supprimé"/>
                            </xsd:restriction>
                        </xsd:simpleType>
                    </xsd:element>
                </xsd:all>
            </xsd:complexType>
        </xsd:schema>
"""

In [19]:
gares = [
    "87001479",  # Charles de Gaulles 2
    "87271460",  # Charles de Gaulles 1
    "87271486",  # Parc des expositions
    "87271452",  # Villepinte
    "87271445",  # Sevran Beaudottes

    "87271528",  # Mitry Clay
    "87271510",  # Villeparisis Mitry-le-Neuf
    "87271437",  # Vert Galant
    "87271429",  # Sevran Livry

    "87271411",  # Aulnay Sous bois
    "87271478",  # Le Blanc Mesnil
    "87271403",  # Drancy
    "87271395",  # Le Bourget
    "87271304",  # La Courneuve - Aubervilliers
    "87164798",  # La Plaine Stade-de-France
    "87271007"  # Paris Gare-du-Nord
]

datetime_obj: pendulum.DateTime = pendulum.now("Europe/Paris")

data = grab_data(gares)
xml = xml_to_df(data, xsd)
stations = df_to_station(xml, datetime_obj)
jsons = stations_to_json(stations)

In [27]:
pd.DataFrame.from_records(jsons[1])

Unnamed: 0,time_recorded,num,miss,date,mode,term,direction,etat
0,2022-06-24 10:06:43,KONI34,KONI,24-06-2022 10:08,R,87393579,1,Supprimé
1,2022-06-24 10:06:43,PITO32,PITO,24-06-2022 10:08,R,87758789,1,
2,2022-06-24 10:06:43,ECHO13,ECHO,24-06-2022 10:14,R,87001479,1,
3,2022-06-24 10:06:43,PITO38,PITO,24-06-2022 10:23,R,87758896,1,
4,2022-06-24 10:06:43,EPIR15,EPIR,24-06-2022 10:28,R,87001479,1,
5,2022-06-24 10:06:43,KONI40,KONI,24-06-2022 10:29,R,87393579,0,
6,2022-06-24 10:06:43,ECHO19,ECHO,24-06-2022 10:37,R,87001479,1,
7,2022-06-24 10:06:43,EPIR21,EPIR,24-06-2022 10:39,R,87001479,0,
8,2022-06-24 10:06:43,PITO44,PITO,24-06-2022 10:44,R,87758896,0,
9,2022-06-24 10:06:43,ECHO25,ECHO,24-06-2022 10:49,R,87758623,0,Supprimé


In [28]:
save_df_upload_s3(jsons, gares, datetime_obj)