In [8]:
import os
# Directory on the mounted Drive that holds the raw data files.
raw_dir = "/content/drive/My Drive/Data/Dataroots_nmbs/raw"
list_files = os.listdir(raw_dir)

# Show the directory entries one per line, in sorted order.
list_files_sorted = sorted(list_files)
for file in list_files_sorted:
  print(file)

calendar.txt
calendar_dates.txt
raw_nmbs_20190111_144433.csv
raw_nmbs_20190111_145146.csv
raw_nmbs_20190111_145954.csv
raw_owm_20190107_091523.jsonl
raw_owm_20190107_100024.jsonl
routes.txt
stop_time_overrides.txt
stop_times.txt
stops.txt
transfers.txt
translations.txt
trips.txt


# Exploring the NMBS data
Now let us explore and understand the NMBS data.
![alt text](https://images.vrt.be/width1280/2018/09/25/6783b2f2-c09e-11e8-abcc-02b7b76bf47f.jpg)
We start by selecting only the trips going through Leuven (and not departing from Leuven).

















To select only the data going through Leuven, we first need the Leuven stops. This information is available in the *stops* data set.

In [9]:
import pandas as pd

# Load the stops table and preview its first rows.
stops_path = "/content/drive/My Drive/Data/Dataroots_nmbs/raw/stops.txt"
df_stops = pd.read_csv(stops_path)
df_stops.head(n=5)

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,platform_code
0,8015345,,Aachen Hbf (DE),,50.77083,6.105277,,,0,,
1,8200100,,Luxembourg (LU),,49.6,6.133333,,,0,,
2,8200101,,Dommeldange (LU),,49.6339,6.136765,,,0,,
3,8200102,,Pfaffenthal-Kirchberg (LU),,49.61913,6.132853,,,0,,
4,8200110,,Mersch (LU),,49.74889,6.106111,,,0,,


With an exact string match, we can look for Leuven (named *Louvain* in the data) in the *stop_name* column and get the corresponding *stop_id* values.

In [10]:
import re


# Label a stop name as the city of interest (Leuven is named "Louvain" in the data).
def return_leuven(text, city="Louvain"):
  """Label a stop name as ``city`` or "other".

  The comparison is an exact, case-sensitive equality test; no regular
  expression or substring search is performed.

  params text: stop name to classify; any value that is not equal to
    ``city`` (including a missing value such as NaN) is labelled "other"
  params city: string - stop name to look for, defaults to "Louvain"
  return : ``city`` if ``text`` equals ``city`` and "other" otherwise
  """
  return city if text == city else "other"


# Classify every stop by its name. Only the stop_name column is needed, so the
# function is mapped over that column instead of being applied row by row
# across the whole frame.
df_stops["selected"] = df_stops["stop_name"].map(return_leuven)

# Preview the stops that were labelled as something other than "other".
df_stops[df_stops["selected"] != "other"].head()

Unnamed: 0,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,platform_code,selected
874,S8833001,,Louvain,,50.88228,4.715868,,,1,,,Louvain
875,8833001_4,,Louvain,,50.88228,4.715868,,,0,S8833001,4.0,Louvain
876,8833001_1,,Louvain,,50.88228,4.715868,,,0,S8833001,1.0,Louvain
877,8833001_5,,Louvain,,50.88228,4.715868,,,0,S8833001,5.0,Louvain
878,8833001_3,,Louvain,,50.88228,4.715868,,,0,S8833001,3.0,Louvain


In [11]:
# Distinct stop_id values of every stop that was not labelled "other".
# The raw values are kept verbatim (no whitespace normalisation is done here).
is_selected_stop = df_stops["selected"] != "other"
stops_selected = list(set(df_stops.loc[is_selected_stop, "stop_id"]))
stops_selected

['8833001_1       ',
 '8833001_5       ',
 '8833001_9       ',
 '8833001_C       ',
 '8833001_7       ',
 '8833001_2       ',
 '8833001_6       ',
 '8833001_B       ',
 '8833001',
 '8833001_A       ',
 '8833001_D       ',
 'S8833001',
 '8833001_8       ',
 '8833001_4       ',
 '8833001_3       ']

Now let's see if we can find all the trips going through Leuven. The *stop_times* data gives, for every *stop_id*, the *trip_id* of each trip calling at that stop. Hence, based on the stop ids that we just gathered, we can keep only the trips going through Leuven.

In [12]:
# Load the stop_times table and preview its first rows.
stop_times_path = "/content/drive/My Drive/Data/Dataroots_nmbs/raw/stop_times.txt"
df_stops_times = pd.read_csv(stop_times_path)
df_stops_times.head(n=5)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
0,88____:A71::8821402:8400526:3:650:20181208,06:43:00,06:43:00,8821402,1,,0,1,
1,88____:A71::8821402:8400526:3:650:20181208,06:44:00,06:44:00,8829009,2,,1,1,
2,88____:A71::8821402:8400526:3:650:20181208,06:50:00,06:50:00,8400526,3,,1,0,
3,84____:A71::8400526:8829009:2:727:20181208,07:21:00,07:21:00,8400526,1,,0,1,
4,84____:A71::8400526:8829009:2:727:20181208,07:27:00,07:27:00,8829009,2,,1,1,


In [13]:

# Cast stop_id to string so it can be compared with the string IDs collected
# in stops_selected.
df_stops_times["stop_id"] = df_stops_times["stop_id"].astype("str")

# Several IDs in stops_selected carry trailing spaces (e.g. '8833001_1       ').
# An exact comparison would never match such an ID against an unpadded one, so
# both sides are compared with surrounding whitespace stripped. The stored
# stop_id column itself is left unchanged.
leuven_stop_ids = {str(stop_id).strip() for stop_id in stops_selected}
df_stops_times["trips_selected"] = (
    df_stops_times["stop_id"].str.strip().isin(leuven_stop_ids)
)

# trips_selected is already boolean, so it is used directly as the row mask.
df_stops_times[df_stops_times["trips_selected"]].head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,trips_selected
1248,88____:049::8833001:8833605:6:820:20190210,07:00:00,07:00:00,8833001,1,,0,1,,True
1260,88____:049::8833001:8833605:6:920:20190210,08:00:00,08:00:00,8833001,1,,0,1,,True
1277,88____:049::8833001:8833605:6:1020:20190210,09:00:00,09:00:00,8833001,1,,0,1,,True
1289,88____:049::8833001:8833605:6:1120:20190210,10:00:00,10:00:00,8833001,1,,0,1,,True
1306,88____:049::8833001:8833605:6:1220:20190210,11:00:00,11:00:00,8833001,1,,0,1,,True


In [14]:
# Distinct trip_id values among the stop_times rows flagged in trips_selected.
flagged_stop_times = df_stops_times.loc[df_stops_times["trips_selected"]]
trips_selected = set(flagged_stop_times["trip_id"])
print("number of trips going through Leuven is", len(trips_selected))

number of trips going through Leuven is 2204


We now want more information on the trips: what do these numbers mean? We find more information in the *routes* and *trips* data.

In [15]:
# Load the trips table and preview its last rows.
trips_path = "/content/drive/My Drive/Data/Dataroots_nmbs/raw/trips.txt"
df_trips = pd.read_csv(trips_path)
df_trips.tail(n=5)

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,trip_type
13968,470,616,88____:005::8814001:8727100:3:1850:20191213,Paris Nord (FR),9606,,11751,,1
13969,470,48,87____:005::8727100:8814308:2:2247:20181207,Bruxelles-Midi,9615,,11752,,1
13970,470,48,88____:005::8814308:8814001:2:2254:20181207,Bruxelles-Midi,9615,,11752,,1
13971,470,617,87____:005::8727100:8814308:2:2205:20191213,Bruxelles-Midi,9615,,11753,,1
13972,470,617,88____:005::8814308:8814001:2:2212:20191213,Bruxelles-Midi,9615,,11753,,1


In [16]:
# Flag the trips whose trip_id is in trips_selected, then collect the distinct
# route_id values those trips belong to.
df_trips["trips_selected"] = df_trips["trip_id"].isin(trips_selected)
flagged_trips = df_trips.loc[df_trips["trips_selected"]]
routes_selected = set(flagged_trips["route_id"])
print("number of routes going through Leuven", len(routes_selected))


number of routes going through Leuven 56


Finally, the *routes* data gives us some information about each route.

In [17]:
# Load the routes table and preview its first rows.
routes_path = "/content/drive/My Drive/Data/Dataroots_nmbs/raw/routes.txt"
df_routes = pd.read_csv(routes_path)
df_routes.head(n=5)

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
0,1,NMBS/SNCB,S32,Roosendaal (NL) -- Essen,,100,,,
1,10,NMBS/SNCB,BUS,Hasselt -- Mol,,700,,,
2,100,NMBS/SNCB,IC,Tongres -- Gand-Saint-Pierre,,103,,,
3,101,NMBS/SNCB,IC,Gand-Saint-Pierre -- Lokeren,,103,,,
4,102,NMBS/SNCB,IC,Hasselt -- Gand-Saint-Pierre,,103,,,


We now have all the routes passing through Leuven.

In [18]:
# Flag the routes whose route_id is in routes_selected and preview them.
df_routes["routes_selected"] = df_routes["route_id"].isin(routes_selected)

df_routes.loc[df_routes["routes_selected"]].head(n=5)

Unnamed: 0,route_id,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color,routes_selected
17,114,NMBS/SNCB,L,Hasselt -- Louvain,,100,,,,True
31,127,NMBS/SNCB,IC,Anvers-Central -- Louvain,,103,,,,True
32,128,NMBS/SNCB,IC,Anvers-Central -- Hasselt,,103,,,,True
33,129,NMBS/SNCB,IC,Anvers-Central -- Tongres,,103,,,,True
34,13,NMBS/SNCB,BUS,Louvain -- Landen,,700,,,,True


We can reconstruct one trip, for example on the *IC* route *Anvers-Central -- Tongres* (*route_id* = 129).

Take the first trip from that route and find all the stops of that trip. Then join with the stop names.



In [19]:
# trip_id values of the example route (route_id 129), in file order.
route_trip_ids = df_trips.loc[df_trips.route_id == 129, "trip_id"]
trips = set(route_trip_ids)

# Take the first trip in file order. Indexing into list(set(...)) returns an
# arbitrary element, and for strings that choice can differ between kernel
# restarts because of hash randomisation, so the cell was not reproducible.
trip = route_trip_ids.iloc[0]

# Stop_times rows of that trip, with the stop names joined in on stop_id.
df_stops_times[df_stops_times.trip_id == trip].merge(
    df_stops[["stop_id", "stop_name"]], on="stop_id"
)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,trips_selected,stop_name
0,88____:007::8831310:8821006:28:812:20191213,06:12:00,06:12:00,8831310,1,,0,1,,False,Tongres
1,88____:007::8831310:8821006:28:812:20191213,06:21:00,06:21:00,8831138,2,,0,0,,False,Bilzen
2,88____:007::8831310:8821006:28:812:20191213,06:28:00,06:28:00,8831112,3,,0,0,,False,Diepenbeek
3,88____:007::8831310:8821006:28:812:20191213,06:35:00,06:38:00,8831005,4,,0,0,,False,Hasselt
4,88____:007::8831310:8821006:28:812:20191213,06:46:00,06:46:00,8831088,5,,1,1,,False,Schulen
5,88____:007::8831310:8821006:28:812:20191213,06:52:00,06:53:00,8831401,6,,0,0,,False,Diest
6,88____:007::8831310:8821006:28:812:20191213,06:57:00,06:57:00,8833274,7,,1,1,,False,Zichem
7,88____:007::8831310:8821006:28:812:20191213,06:58:00,06:58:00,8833266,8,,1,1,,False,Testelt
8,88____:007::8831310:8821006:28:812:20191213,07:01:00,07:01:00,8833258,9,,1,1,,False,Langdorp
9,88____:007::8831310:8821006:28:812:20191213,07:05:00,07:07:00,8833209,10,,0,0,,False,Aarschot



Now that we understand the data, we can join its interesting parts into a final schedule that can serve as the basis for checking train delays.



In [0]:
# Columns kept from each table for the final schedule.
stop_columns = ["stop_id", "stop_name", "stop_lat", "stop_lon"]
trip_columns = ["route_id", "service_id", "trip_id", "trip_headsign"]
route_columns = ["route_id", "route_short_name", "route_long_name"]
stop_time_columns = [
    "trip_id", "arrival_time", "departure_time",
    "stop_id", "stop_sequence", "pickup_type",
    "drop_off_type",
]

df_stops = df_stops[stop_columns]
df_trips = df_trips[trip_columns]
df_routes = df_routes[route_columns]
df_stops_times = df_stops_times[stop_time_columns]

# Chain the joins. After the column selection above, each pair of frames shares
# exactly one column, which is spelled out here as the join key.
df = (
    df_stops_times
    .merge(df_stops, on="stop_id")
    .merge(df_trips, on="trip_id")
    .merge(df_routes, on="route_id")
)

In [21]:
# Preview the first rows of the joined schedule.
df.head(n=5)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,stop_name,stop_lat,stop_lon,route_id,service_id,trip_headsign,route_short_name,route_long_name
0,88____:A71::8821402:8400526:3:650:20181208,06:43:00,06:43:00,8821402,1,0,1,Essen,51.46276,4.451318,1,1,Roosendaal (NL),S32,Roosendaal (NL) -- Essen
1,88____:A71::8821402:8400526:3:650:20181208,06:44:00,06:44:00,8829009,2,1,1,Essen-Grens,51.46909,4.44893,1,1,Roosendaal (NL),S32,Roosendaal (NL) -- Essen
2,88____:A71::8821402:8400526:3:650:20181208,06:50:00,06:50:00,8400526,3,1,0,Roosendaal (NL),51.54083,4.458692,1,1,Roosendaal (NL),S32,Roosendaal (NL) -- Essen
3,88____:A71::8829009:8821402:2:2259:20181207,22:59:00,22:59:00,8821402,3,1,0,Essen,51.46276,4.451318,1,4,Essen,S32,Roosendaal (NL) -- Essen
4,88____:A71::8829009:8821402:2:2259:20181207,22:56:00,22:56:00,8829009,2,1,1,Essen-Grens,51.46909,4.44893,1,4,Essen,S32,Roosendaal (NL) -- Essen
