This notebook is maded for exploration purposes of the GTFS Dataset.
It is also used to moount an Data Model 

In [1]:
import pandas as pd

In [2]:
def import_gtfs(gtfs_path, busiest_date = True):
    import warnings
    warnings.filterwarnings("ignore")
    import os
    import pandas as pd
    import zipfile

    try:
        import partridge as ptg 
    except ImportError as e:
        os.system('pip install partridge')
        import partridge as ptg

    try:
        import geopandas as gpd
    except ImportError as e:
        os.system('pip install geopandas')
        import geopandas as gpd
    # Partridge to read the feed
    # service_ids = pd.read_csv(gtfs_path + '/trips.txt')['service_id'].unique()
    # service_ids = frozenset(tuple(service_ids))
        
    if busiest_date:
        service_ids = ptg.read_busiest_date(gtfs_path)[1]
    else:
        with zipfile.ZipFile(gtfs_path) as myzip:
            myzip.extract("trips.txt")
        service_ids = pd.read_csv('trips.txt')['service_id'].unique()
        service_ids = frozenset(tuple(service_ids))
        os.remove('trips.txt')
        
    view = {'trips.txt': {'service_id': service_ids}}
    
    feed = ptg.load_geo_feed(gtfs_path, view)
    
    routes = feed.routes
    trips = feed.trips
    stop_times = feed.stop_times
    stops = feed.stops
    shapes = feed.shapes
    
    # Get routes info in trips
    # The GTFS feed might be missing some of the keys, e.g. direction_id or shape_id.
    # To allow processing incomplete GTFS data, we must reindex instead:
    # https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
    # This will add NaN for any missing columns.
    trips = pd.merge(trips, routes, how='left').reindex(columns=['trip_id', 'route_id',
                                                        'service_id','trip_headsign', 'trip_short_name', 'direction_id','shape_id'])
#    ["route_id","service_id","trip_id","trip_headsign","trip_short_name","direction_id","block_id","shape_id","wheelchair_accessible","bikes_allowed"]
    
    
    # Get trips, routes and stops info in stop_times
    stop_times = pd.merge(stop_times, trips, how='left') 
    stop_times = pd.merge(stop_times, stops, how='left')
    # stop_times needs to be geodataframe if we want to do geometry operations
    stop_times = gpd.GeoDataFrame(stop_times, geometry='geometry')
    
    return routes, stops, stop_times, trips, shapes

In [3]:
# Import GTFS
gtfs_path = "../data/external/transilien-gtfs.zip"

In [4]:
routes, stops, stop_times, trips, shapes = import_gtfs(gtfs_path)

In [5]:
trips

Unnamed: 0,trip_id,route_id,service_id,trip_headsign,trip_short_name,direction_id,shape_id
0,IDFM:TN:SNCF:52c63590-6112-44af-8e30-a50676e0291f,IDFM:C01727,IDFM:TN:00c39053-c1be-30e2-ad5c-7c7ee9a80ffb,ROMI,148101,0,
1,IDFM:TN:SNCF:58c81c89-786e-4786-a777-634c9b9dfd1c,IDFM:C01727,IDFM:TN:00c39053-c1be-30e2-ad5c-7c7ee9a80ffb,ZORA,147100,0,
2,IDFM:TN:SNCF:069613e8-f76b-4a90-aef3-c3ca04399a91,IDFM:C01727,IDFM:TN:00c39053-c1be-30e2-ad5c-7c7ee9a80ffb,JILL,147213,0,
3,IDFM:TN:SNCF:dfcd1c09-5d46-40d0-9113-4b336019b401,IDFM:C01727,IDFM:TN:00c39053-c1be-30e2-ad5c-7c7ee9a80ffb,ROMI,140189,0,
4,IDFM:TN:SNCF:52754d1f-f621-465d-a6f2-557421393d83,IDFM:C01727,IDFM:TN:00c39053-c1be-30e2-ad5c-7c7ee9a80ffb,ZORA,143546,0,
...,...,...,...,...,...,...,...
7157,IDFM:TN:SNCF:5a2e232d-c7e0-4f84-b569-b377c386a621,IDFM:C01741,IDFM:TN:9062,VERI,164303,0,
7158,IDFM:TN:SNCF:abf7198b-2123-4b7e-8dc4-ff25e1938560,IDFM:C01741,IDFM:TN:9096,DEFI,164350,0,
7159,IDFM:TN:SNCF:a6bbbe4e-6c25-4923-893d-446c196f35a0,IDFM:C01727,IDFM:TN:dd4edddc-bd73-346a-bde7-9d1aedaee37f,ROMI,140183,0,
7160,IDFM:TN:SNCF:10d16abb-d2c4-4159-92cd-4c2e6c1fab65,IDFM:C01851,IDFM:TN:ea9dc28e-9a32-35d3-a46e-d3e3154a0eba,Paris Austerlitz,7917002,0,


In [6]:
routes_line = "IDFM:C01743"

In [7]:
rer_b_trips = trips[trips.route_id == routes_line]

In [8]:
rer_b_trips = rer_b_trips.reset_index(drop=True)
rer_b_trips[rer_b_trips.trip_id == "IDFM:TN:SNCF:4794d582-f8b6-4f86-b86c-64df751bc3c7"]

Unnamed: 0,trip_id,route_id,service_id,trip_headsign,trip_short_name,direction_id,shape_id
131,IDFM:TN:SNCF:4794d582-f8b6-4f86-b86c-64df751bc3c7,IDFM:C01743,IDFM:TN:14244,EPAF,EPOI81,0,


In [9]:
pd.set_option('display.max_columns', None)

In [10]:
df = stop_times.merge(rer_b_trips, on=['trip_id', 'route_id', 'service_id', 'trip_headsign', 'trip_short_name', 'direction_id', 'shape_id'])

In [11]:
df["temps_darret"] = df["departure_time"] - df["arrival_time"]

In [12]:
a = df["trip_id"].unique()

In [13]:
d = 0
for trip in a:
    if d == 500:
        b = df[df.trip_id==trip]
        break
    d = d + 1

In [14]:
b

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,local_zone_id,stop_headsign,timepoint,route_id,service_id,trip_headsign,trip_short_name,direction_id,shape_id,stop_code,stop_name,stop_desc,zone_id,stop_url,location_type,parent_station,stop_timezone,level_id,wheelchair_boarding,platform_code,geometry,temps_darret
13083,IDFM:TN:SNCF:1c49e953-0d64-4dbe-90b1-2a7abc0d6243,53459.0,53459.0,IDFM:monomodalStopPlace:58774,0,0,1,,,1,IDFM:C01743,IDFM:TN:32272,ERIC,ERIO79,0,,,Massy - Palaiseau,,4,,0,IDFM:63244,,,0,,POINT (2.25877 48.72552),0.0
13084,IDFM:TN:SNCF:1c49e953-0d64-4dbe-90b1-2a7abc0d6243,53564.0,53584.0,IDFM:monomodalStopPlace:47940,1,0,0,,,1,IDFM:C01743,IDFM:TN:32272,ERIC,ERIO79,0,,,Massy - Verrières,,4,,0,IDFM:63320,,,0,,POINT (2.27395 48.73499),20.0
13085,IDFM:TN:SNCF:1c49e953-0d64-4dbe-90b1-2a7abc0d6243,53672.0,53692.0,IDFM:monomodalStopPlace:43228,2,0,0,,,1,IDFM:C01743,IDFM:TN:32272,ERIC,ERIO79,0,,,Les Baconnets,,4,,0,IDFM:69622,,,0,,POINT (2.28811 48.73985),20.0
13086,IDFM:TN:SNCF:1c49e953-0d64-4dbe-90b1-2a7abc0d6243,53757.0,53777.0,IDFM:monomodalStopPlace:43124,3,0,0,,,1,IDFM:C01743,IDFM:TN:32272,ERIC,ERIO79,0,,,Fontaine Michalon,,4,,0,IDFM:69647,,,0,,POINT (2.29596 48.74305),20.0
13087,IDFM:TN:SNCF:1c49e953-0d64-4dbe-90b1-2a7abc0d6243,53873.0,53913.0,IDFM:monomodalStopPlace:43066,4,0,0,,,1,IDFM:C01743,IDFM:TN:32272,ERIC,ERIO79,0,,,Antony,,3,,0,IDFM:69759,,,0,,POINT (2.30089 48.75463),40.0
13088,IDFM:TN:SNCF:1c49e953-0d64-4dbe-90b1-2a7abc0d6243,53984.0,54014.0,IDFM:monomodalStopPlace:46007,5,0,0,,,1,IDFM:C01743,IDFM:TN:32272,ERIC,ERIO79,0,,,La Croix de Berny,,3,,0,IDFM:69813,,,0,,POINT (2.30421 48.76172),30.0
13089,IDFM:TN:SNCF:1c49e953-0d64-4dbe-90b1-2a7abc0d6243,54092.0,54112.0,IDFM:monomodalStopPlace:43177,6,0,0,,,1,IDFM:C01743,IDFM:TN:32272,ERIC,ERIO79,0,,,Parc de Sceaux,,3,,0,IDFM:69906,,,0,,POINT (2.30993 48.76972),20.0
13090,IDFM:TN:SNCF:1c49e953-0d64-4dbe-90b1-2a7abc0d6243,54201.0,54251.0,IDFM:monomodalStopPlace:43097,7,0,0,,,1,IDFM:C01743,IDFM:TN:32272,ERIC,ERIO79,0,,,Bourg-la-Reine,,3,,0,IDFM:70033,,,0,,POINT (2.31228 48.78026),50.0
13091,IDFM:TN:SNCF:1c49e953-0d64-4dbe-90b1-2a7abc0d6243,54370.0,54400.0,IDFM:monomodalStopPlace:44493,8,0,0,,,1,IDFM:C01743,IDFM:TN:32272,ERIC,ERIO79,0,,,Bagneux,,3,,0,IDFM:70210,,,0,,POINT (2.32201 48.79334),30.0
13092,IDFM:TN:SNCF:1c49e953-0d64-4dbe-90b1-2a7abc0d6243,54470.0,54500.0,IDFM:monomodalStopPlace:43067,9,0,0,,,1,IDFM:C01743,IDFM:TN:32272,ERIC,ERIO79,0,,,Arcueil - Cachan,,3,,0,IDFM:70263,,,0,,POINT (2.32809 48.79870),30.0


In [15]:
relation = pd.read_csv("../data/processed/relation_ordre_RER_B.csv", sep=';', names=["Station_name", "Station_Order"])
d = relation.set_index('Station_name').T.to_dict('index')
d = d["Station_Order"]

In [16]:
def find_direction(relation: dict, d1: str, d2:str) -> int:
    if (relation[d1] < relation[d2]):
        return 0 # Sud
    else:
        return 1 # Nord

In [17]:
column_names = ["trip_id", "trip_line", "trip_headsign", "trip_short_name", "destination", "destination_id" ,"origin", "origin_id", "time_departure", "arrival", "arrival_id", "time_arrival", "time_travelled"] # Time difference
cleaned_df = pd.DataFrame(columns = column_names)
import time

In [18]:
cleaned_df

Unnamed: 0,trip_id,trip_line,trip_headsign,trip_short_name,destination,destination_id,origin,origin_id,time_departure,arrival,arrival_id,time_arrival,time_travelled


In [19]:
a = df["trip_id"].unique()
start_time = time.time()
current = 0
for journeys in a:
    test = df[df.trip_id==journeys].reset_index(drop=True)
    for i in range(1, len(test)):
        if i == 1:
            start = test.loc[i-1, 'stop_name']
            destination = test.loc[len(test)-1, 'stop_name']
            destination_id = test.loc[len(test)-1, 'parent_station']
            direction = find_direction(d, test.loc[i-1, 'stop_name'], destination)
            
        trip_id = test.loc[i, 'trip_id']
        trip_line = test.loc[i,'route_id']
        trip_headsign = test.loc[i-1, "trip_headsign"]
        trip_short_name =test.loc[i-1, "trip_short_name"]
        origin = test.loc[i-1, 'stop_name']
        origin_id = test.loc[i-1, 'parent_station']
        time_departure = test.loc[i-1, 'departure_time']
        
        arrival = test.loc[i, 'stop_name']
        arrival_id = test.loc[i, 'parent_station']
        time_arrival = test.loc[i, 'arrival_time']
        time_travelled = time_arrival - time_departure

        cleaned_df = cleaned_df.append({
            "trip_id": trip_id, 
            "trip_line":trip_line,
            "trip_headsign": trip_headsign,
            "trip_short_name": trip_short_name,
            "destination": destination,
            "destination_id": destination_id,
            "origin": origin,
            "origin_id": origin_id,
            "time_departure":time_departure,
            "arrival": arrival,
            "arrival_id": arrival_id,
            "time_arrival": time_arrival,
            "time_travelled":time_travelled,
            "direction":direction},
            ignore_index=True)

print("--- %s seconds ---" % (time.time() - start_time))

--- 44.016666889190674 seconds ---


In [20]:
cleaned_df.tail(10)

Unnamed: 0,trip_id,trip_line,trip_headsign,trip_short_name,destination,destination_id,origin,origin_id,time_departure,arrival,arrival_id,time_arrival,time_travelled,direction
13824,IDFM:TN:SNCF:088751b1-4273-4ad5-b680-84af98980599,IDFM:C01743,PIER,PILE96,Saint-Rémy-lès-Chevreuse,IDFM:62978,Massy - Palaiseau,IDFM:63244,80212.0,Palaiseau,IDFM:63175,80314.0,102.0,0.0
13825,IDFM:TN:SNCF:088751b1-4273-4ad5-b680-84af98980599,IDFM:C01743,PIER,PILE96,Saint-Rémy-lès-Chevreuse,IDFM:62978,Palaiseau,IDFM:63175,80344.0,Palaiseau - Villebon,IDFM:63067,80424.0,80.0,0.0
13826,IDFM:TN:SNCF:088751b1-4273-4ad5-b680-84af98980599,IDFM:C01743,PIER,PILE96,Saint-Rémy-lès-Chevreuse,IDFM:62978,Palaiseau - Villebon,IDFM:63067,80454.0,Lozère,IDFM:63029,80582.0,128.0,0.0
13827,IDFM:TN:SNCF:088751b1-4273-4ad5-b680-84af98980599,IDFM:C01743,PIER,PILE96,Saint-Rémy-lès-Chevreuse,IDFM:62978,Lozère,IDFM:63029,80612.0,Le Guichet,IDFM:63025,80708.0,96.0,0.0
13828,IDFM:TN:SNCF:088751b1-4273-4ad5-b680-84af98980599,IDFM:C01743,PIER,PILE96,Saint-Rémy-lès-Chevreuse,IDFM:62978,Le Guichet,IDFM:63025,80738.0,Orsay Ville,IDFM:62892,80831.0,93.0,0.0
13829,IDFM:TN:SNCF:088751b1-4273-4ad5-b680-84af98980599,IDFM:C01743,PIER,PILE96,Saint-Rémy-lès-Chevreuse,IDFM:62978,Orsay Ville,IDFM:62892,80871.0,Bures-sur-Yvette,IDFM:62851,80958.0,87.0,0.0
13830,IDFM:TN:SNCF:088751b1-4273-4ad5-b680-84af98980599,IDFM:C01743,PIER,PILE96,Saint-Rémy-lès-Chevreuse,IDFM:62978,Bures-sur-Yvette,IDFM:62851,80988.0,La Hacquinière,IDFM:62825,81070.0,82.0,0.0
13831,IDFM:TN:SNCF:088751b1-4273-4ad5-b680-84af98980599,IDFM:C01743,PIER,PILE96,Saint-Rémy-lès-Chevreuse,IDFM:62978,La Hacquinière,IDFM:62825,81100.0,Gif-sur-Yvette,IDFM:62890,81183.0,83.0,0.0
13832,IDFM:TN:SNCF:088751b1-4273-4ad5-b680-84af98980599,IDFM:C01743,PIER,PILE96,Saint-Rémy-lès-Chevreuse,IDFM:62978,Gif-sur-Yvette,IDFM:62890,81213.0,Courcelle-sur-Yvette,IDFM:62951,81371.0,158.0,0.0
13833,IDFM:TN:SNCF:088751b1-4273-4ad5-b680-84af98980599,IDFM:C01743,PIER,PILE96,Saint-Rémy-lès-Chevreuse,IDFM:62978,Courcelle-sur-Yvette,IDFM:62951,81401.0,Saint-Rémy-lès-Chevreuse,IDFM:62978,81543.0,142.0,0.0


In [21]:
cleaned_df = cleaned_df.astype({"time_departure":'int', "time_arrival":'int', 'time_travelled':'int', 'direction':'int'})

In [22]:
cleaned_df[cleaned_df.trip_headsign=="USER"]

Unnamed: 0,trip_id,trip_line,trip_headsign,trip_short_name,destination,destination_id,origin,origin_id,time_departure,arrival,arrival_id,time_arrival,time_travelled,direction
9793,IDFM:TN:SNCF:bd30df92-db61-4136-9a3d-e8a5a28b79d4,IDFM:C01743,USER,USER32,Laplace,IDFM:70427,Aéroport Charles de Gaulle 2 (Terminal 2),IDFM:73699,31010,Aéroport CDG 1 (Terminal 3) - RER,IDFM:73596,31140,130,0
9794,IDFM:TN:SNCF:bd30df92-db61-4136-9a3d-e8a5a28b79d4,IDFM:C01743,USER,USER32,Laplace,IDFM:70427,Aéroport CDG 1 (Terminal 3) - RER,IDFM:73596,31480,Parc des Expositions,IDFM:73568,31760,280,0
9795,IDFM:TN:SNCF:bd30df92-db61-4136-9a3d-e8a5a28b79d4,IDFM:C01743,USER,USER32,Laplace,IDFM:70427,Parc des Expositions,IDFM:73568,31790,Villepinte,IDFM:73547,31880,90,0
9796,IDFM:TN:SNCF:bd30df92-db61-4136-9a3d-e8a5a28b79d4,IDFM:C01743,USER,USER32,Laplace,IDFM:70427,Villepinte,IDFM:73547,31910,Sevran Beaudottes,IDFM:73491,32030,120,0
9797,IDFM:TN:SNCF:bd30df92-db61-4136-9a3d-e8a5a28b79d4,IDFM:C01743,USER,USER32,Laplace,IDFM:70427,Sevran Beaudottes,IDFM:73491,32080,Aulnay-sous-Bois,IDFM:72646,32240,160,0
9798,IDFM:TN:SNCF:bd30df92-db61-4136-9a3d-e8a5a28b79d4,IDFM:C01743,USER,USER32,Laplace,IDFM:70427,Aulnay-sous-Bois,IDFM:72646,32290,Le Blanc-Mesnil,IDFM:72648,32390,100,0
9799,IDFM:TN:SNCF:bd30df92-db61-4136-9a3d-e8a5a28b79d4,IDFM:C01743,USER,USER32,Laplace,IDFM:70427,Le Blanc-Mesnil,IDFM:72648,32420,Drancy,IDFM:72652,32520,100,0
9800,IDFM:TN:SNCF:bd30df92-db61-4136-9a3d-e8a5a28b79d4,IDFM:C01743,USER,USER32,Laplace,IDFM:70427,Drancy,IDFM:72652,32550,Le Bourget,IDFM:72641,32680,130,0
9801,IDFM:TN:SNCF:bd30df92-db61-4136-9a3d-e8a5a28b79d4,IDFM:C01743,USER,USER32,Laplace,IDFM:70427,Le Bourget,IDFM:72641,32710,La Courneuve - Aubervilliers,IDFM:72598,32890,180,0
9802,IDFM:TN:SNCF:bd30df92-db61-4136-9a3d-e8a5a28b79d4,IDFM:C01743,USER,USER32,Laplace,IDFM:70427,La Courneuve - Aubervilliers,IDFM:72598,32920,La Plaine Stade de France,IDFM:72211,33040,120,0


In [23]:
cleaned_df.to_csv("../data/processed/Calculated_fields_theorique_.csv", encoding='utf-8-sig', sep=";", index=False)