In [1]:
%%local
import os
username = os.environ['RENKU_USERNAME']
get_ipython().run_cell_magic('configure', line="-f", cell='{ "name":"%s-final-istaden", "executorMemory":"4G", "executorCores":4, "numExecutors":10, "driverMemory": "4G" }' % username)

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
6324,application_1652960972356_1733,pyspark,idle,Link,Link,,
6328,application_1652960972356_1739,pyspark,idle,Link,Link,,
6329,application_1652960972356_1740,pyspark,idle,Link,Link,,
6330,application_1652960972356_1742,pyspark,idle,Link,Link,,
6331,application_1652960972356_1743,pyspark,idle,Link,Link,,
6332,application_1652960972356_1744,pyspark,idle,Link,Link,,
6334,application_1652960972356_1746,pyspark,idle,Link,Link,,
6335,application_1652960972356_1747,pyspark,idle,Link,Link,,
6336,application_1652960972356_1748,pyspark,busy,Link,Link,,
6338,application_1652960972356_1750,pyspark,idle,Link,Link,,


In [2]:
%%send_to_spark -i username -t str -n username

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
6346,application_1652960972356_1760,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Successfully passed 'username' as 'username' to Spark kernel

In [3]:
import pyspark.sql.functions as F

# Visualizing delay distributions
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt

# Apply the style sheet first: style.use() overwrites every rcParam the sheet
# defines (the fivethirtyeight sheet sets its own font.size), so notebook-level
# overrides only take effect when they are set afterwards.
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (30, 8)
plt.rcParams['font.size'] = 12

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Location of the ORC files holding the raw SBB "istdaten" records.
istdaten_path = '/data/sbb/orc/istdaten'

# (source column, English alias) pairs, listed in the output column order.
column_aliases = [
    ('betriebstag', 'date'),
    ('produkt_id', 'transport_type'),
    ('haltestellen_name', 'stop_name'),
    ('ankunftszeit', 'arrival_scheduled'),
    ('an_prognose', 'arrival_actual'),
    ('an_prognose_status', 'delay_type'),
    ('durchfahrt_tf', 'stop_skip'),
    ('bpuic', 'stop_id'),
    ('linien_text', 'line_name'),
    ('verkehrsmittel_text', 'line_type'),
    ('linien_id', 'line_id'),
]

# Load the data and keep only the columns above, renamed to their aliases.
df = spark.read.orc(istdaten_path).selectExpr(
    *['%s as %s' % pair for pair in column_aliases]
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Reachable stops within a 15km radius of Zurich HB
reachable_stops_path = "/user/tshen/final-assn/parquet/stop_times"
stop_id_reachable = (
    spark.read.parquet(reachable_stops_path)
    # Keep only the part of the id before the first ':'
    # (presumably this strips a platform/sector suffix -- TODO confirm).
    .withColumn("stop_id", F.split(F.col("stop_id"), ':')[0])
    .drop("stop_name")
    # Truncating the id can make several rows share the same stop_id. This table
    # is later inner-joined on stop_id purely as a filter, so keep one row per
    # stop; otherwise the join would multiply the matching rows and bias every
    # aggregate computed afterwards.
    .dropDuplicates(["stop_id"])
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# NOTE(review): unexecuted cell that only echoes the path string -- looks like a
# debugging leftover and can probably be deleted.
reachable_stops_path

In [6]:
# Keep only rows whose arrival-forecast status is "GESCHAETZT".
# The status column was renamed to `delay_type` when the columns were selected,
# so filter on the name that actually exists in this DataFrame's schema.
df = df.filter(F.col('delay_type') == 'GESCHAETZT')

# Parse the operating day and the scheduled / actual arrival strings into timestamps
df = (
    df.withColumn('date', F.to_timestamp('date', "dd.MM.yyyy"))
      .withColumn('arrival_scheduled', F.to_timestamp('arrival_scheduled', "dd.MM.yyyy HH:mm"))
      .withColumn('arrival_actual', F.to_timestamp('arrival_actual', 'dd.MM.yyyy HH:mm:ss'))
)

# keep only rows for stops that are not skipped
df = df.filter(F.col('stop_skip') == False)

# keep only rows for stops during the week
# (dayofweek: 1 = Sunday ... 7 = Saturday, so 2..6 is Monday..Friday)
df = df.withColumn("day_of_week", F.dayofweek(F.col("date")))
df = df.filter(F.col("day_of_week").between(2, 6))

# hours between 8am and 8pm
# NOTE: between() is inclusive on both ends, so arrivals scheduled from
# 08:00 up to and including 20:59 are kept.
min_day_hour, max_day_hour = 8, 20
df = df.filter(F.hour(F.col('arrival_scheduled')).between(min_day_hour, max_day_hour))

# only keep stops within the 15km radius
df = df.join(stop_id_reachable, on="stop_id")

# add hour column
df = df.withColumn("hour", F.hour(F.col("arrival_scheduled")))

# Keep only tram, train ("Zug") and bus rows; this also drops rows whose
# transport_type is null or any other value.
transport_types_kept = ["Tram", "Zug", "Bus"]
df = df.where(F.col("transport_type").isin(transport_types_kept)).cache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
# Helper function to remove negative delays
def relu(x):
    """Clamp the negative values of a numeric column to zero.

    Built from native Spark expressions rather than a Python UDF: a UDF declared
    with a bare ``@F.udf`` returns a string column, and ``max(None, 0)`` raises
    ``TypeError`` on Python 3 when the input is null.

    Args:
        x: a Column, or the name of a column, holding numeric values.

    Returns:
        Column: 0 where ``x`` is negative, ``x`` otherwise. Null inputs stay
        null (the comparison is null, so the ``otherwise`` branch is taken).
    """
    value = F.col(x) if isinstance(x, str) else x
    return F.when(value < 0, F.lit(0)).otherwise(value)

# Delay = actual minus scheduled arrival, both cast to "long", with negative
# values clamped to zero by relu().
scheduled_arrival = F.col("arrival_scheduled").cast("long")
actual_arrival = F.col("arrival_actual").cast("long")
df = df.withColumn("delay", relu(actual_arrival - scheduled_arrival))

# Drop the rows for which no delay could be computed.
df = df.where(F.col('delay').isNotNull()).cache()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# Average delay for each transport type
transport_avg_delays = (
    df.groupBy("transport_type")
      .agg(F.avg("delay"))
      .cache()
      .withColumnRenamed("avg(delay)", "avg_delay")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Average delay for each scheduled-arrival hour
hour_avg_delays = (
    df.groupBy("hour")
      .agg(F.avg("delay"))
      .cache()
      .withColumnRenamed("avg(delay)", "avg_delay")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Average-delay tables, ordered from the most general grouping to the most specific one
def _avg_delay_by(*keys):
    """Return the cached mean of `delay` (as `avg_delay`) per combination of `keys`."""
    return df.groupBy(*keys).agg(F.avg('delay').alias('avg_delay')).cache()

# Single-row table: mean delay over all rows.
default = df.select(F.avg('delay').alias('avg_delay')).cache()
t_type_line = _avg_delay_by('transport_type', 'line_name')
t_type_line_hour = _avg_delay_by('transport_type', 'line_name', 'hour')
t_type_line_hour_stop = _avg_delay_by('transport_type', 'line_name', 'hour', 'stop_id')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…