In [29]:
from geopy.distance import distance as geo_dist

In [30]:
import getpass
import pyspark
from pyspark.sql import SparkSession

conf = pyspark.conf.SparkConf()
conf.setMaster('yarn')
conf.setAppName('final_project-{0}'.format(getpass.getuser()))
conf.set('spark.executor.memory', '4g')
conf.set('spark.executor.instances', '6')
conf.set('spark.port.maxRetries', '100')
sc = pyspark.SparkContext.getOrCreate(conf)
conf = sc.getConf()
sc


In [31]:
spark = SparkSession(sc)

In [32]:
zurich_geo = (47.378177, 8.540192)

In [33]:
stops = sc.textFile("/datasets/project/metadata/BFKOORD_GEO")
stops = stops.map(lambda x: x.split()).map(lambda x: (int(x[0]), float(x[2]), float(x[1]), int(x[3]), " ".join(x[5:])))
stops = stops.map(lambda x: (*x, geo_dist(zurich_geo, (x[1], x[2])).km ))
stops = stops.filter(lambda x: x[5] <= 10)

In [64]:
stops_df = stops.toDF().toPandas()

In [65]:
stops_df.columns = ['id', 'latitude', 'longitude', 'elevation', 'station_name', 'dist_to_zurich_HB']

In [66]:
stops_df.to_csv("zurich_hb_stops.csv", index=False, encoding='utf-8')

In [70]:
stops_df.loc[stops_df.station_name == 'Hochschulen'].head()

Unnamed: 0,id,latitude,longitude,elevation,station_name,dist_to_zurich_HB
106,8530471,47.376858,8.546402,446,Hochschulen,0.491363


In [71]:
stops_df.loc[stops_df.station_name == 'Zürich Polyterrasse'].head()

Unnamed: 0,id,latitude,longitude,elevation,station_name,dist_to_zurich_HB
77,8503500,47.376858,8.546402,446,Zürich Polyterrasse,0.491363


In [72]:
x = stops.map(lambda x : (x[0], x[1], x[2],x[3],x[4]))\
    .toDF(("id", "latitude", "longitude",'elevation','station_name'))

In [73]:
x = x.filter(x.station_name != 'Hochschulen')

In [74]:
x2 = stops.map(lambda x : (x[0], x[1], x[2],x[3],x[4]))\
    .toDF(("id2", "latitude2", "longitude2",'elevation2','station_name2'))

In [76]:
x2 = x2.filter(x2.station_name2 != 'Hochschulen')

In [55]:
x.select('id','latitude','longitude','elevation','station_name').show()

+-------+---------+---------+---------+--------------------+
|     id| latitude|longitude|elevation|        station_name|
+-------+---------+---------+---------+--------------------+
|    176|47.351679| 8.521961|        0|Zimmerberg-Basist...|
|8502220|47.390882| 8.434713|      442|              Urdorf|
|8502221|47.357432| 8.437543|      488|      Birmensdorf ZH|
|8502222|47.325896| 8.468175|      528| Bonstetten-Wettswil|
|8502229|47.380971|  8.43033|      456|   Urdorf Weihermatt|
|8502559|47.368305| 8.463472|      588|Waldegg, Birmensd...|
|8502572|47.370293| 8.513918|      421|Zürich, Goldbrunn...|
|8502876|47.338209| 8.438705|      537|Aesch ZH, Gemeind...|
|8502885|47.315088| 8.467781|      528|Bonstetten, Dorfp...|
|8502950|47.353936| 8.437173|      468|Birmensdorf ZH, Z...|
|8503000|47.378177| 8.540192|      408|           Zürich HB|
|8503001|47.391481|  8.48894|      399|   Zürich Altstetten|
|8503003|47.366611| 8.548466|      411|  Zürich Stadelhofen|
|8503004|47.350124| 8.56

In [77]:
y = x.crossJoin(x2)
y.select('id',
         'latitude',
         'longitude',
         'elevation',
         'station_name',
         'id2',
         'latitude2',
         'longitude2',
         'elevation2',
         'station_name2').show()

+---+---------+---------+---------+--------------------+-------+---------+----------+----------+--------------------+
| id| latitude|longitude|elevation|        station_name|    id2|latitude2|longitude2|elevation2|       station_name2|
+---+---------+---------+---------+--------------------+-------+---------+----------+----------+--------------------+
|176|47.351679| 8.521961|        0|Zimmerberg-Basist...|    176|47.351679|  8.521961|         0|Zimmerberg-Basist...|
|176|47.351679| 8.521961|        0|Zimmerberg-Basist...|8502220|47.390882|  8.434713|       442|              Urdorf|
|176|47.351679| 8.521961|        0|Zimmerberg-Basist...|8502221|47.357432|  8.437543|       488|      Birmensdorf ZH|
|176|47.351679| 8.521961|        0|Zimmerberg-Basist...|8502222|47.325896|  8.468175|       528| Bonstetten-Wettswil|
|176|47.351679| 8.521961|        0|Zimmerberg-Basist...|8502229|47.380971|   8.43033|       456|   Urdorf Weihermatt|
|176|47.351679| 8.521961|        0|Zimmerberg-Basist...|

In [78]:
z = y.filter(y.id!=y.id2)

In [79]:
w = z.withColumn('elevation_diff', ((z['elevation2'])-(z['elevation']))/1000)


In [18]:
w.count()

1078482

In [80]:
slope = w.rdd.map(lambda x:(x['id'],x['station_name'],x['id2'],x['station_name2'],x['elevation_diff'],( geo_dist((x['latitude'],x['longitude']),(x['latitude2'],x['longitude2'])).km)))


In [96]:
slope.take(2)

[(176, 'Zimmerberg-Basistunnel', 8502220, 'Urdorf', 0.442, 7.900620499193736),
 (176,
  'Zimmerberg-Basistunnel',
  8502221,
  'Birmensdorf ZH',
  0.488,
  6.409889233734277)]

In [81]:
slope_df = slope.map(lambda x:(x[0],x[1],x[2],x[3],x[4],x[5])).toDF(('id','station_name','id2','station_name2','elevation_diff','horizontal_distance'))

In [98]:
slope_df.show()

+---+--------------------+-------+--------------------+--------------+-------------------+
| id|        station_name|    id2|       station_name2|elevation_diff|horizontal_distance|
+---+--------------------+-------+--------------------+--------------+-------------------+
|176|Zimmerberg-Basist...|8502220|              Urdorf|         0.442|  7.900620499193736|
|176|Zimmerberg-Basist...|8502221|      Birmensdorf ZH|         0.488|  6.409889233734277|
|176|Zimmerberg-Basist...|8502222| Bonstetten-Wettswil|         0.528|  4.973881682664052|
|176|Zimmerberg-Basist...|8502229|   Urdorf Weihermatt|         0.456|   7.64919276254611|
|176|Zimmerberg-Basist...|8502559|Waldegg, Birmensd...|         0.588|  4.789534136228712|
|176|Zimmerberg-Basist...|8502572|Zürich, Goldbrunn...|         0.421| 2.1568139712470535|
|176|Zimmerberg-Basist...|8502876|Aesch ZH, Gemeind...|         0.537|  6.467032600536777|
|176|Zimmerberg-Basist...|8502885|Bonstetten, Dorfp...|         0.528|   5.77221601567293|

In [82]:
import pyspark.sql.functions as functions
w = slope_df.withColumn('slope_angle', functions.toDegrees(functions.tanh(slope_df['elevation_diff']/slope_df['horizontal_distance'])))

In [100]:
w.show()

+---+--------------------+-------+--------------------+--------------+-------------------+------------------+
| id|        station_name|    id2|       station_name2|elevation_diff|horizontal_distance|       slope_angle|
+---+--------------------+-------+--------------------+--------------+-------------------+------------------+
|176|Zimmerberg-Basist...|8502220|              Urdorf|         0.442|  7.900620499193736|3.2020708745119815|
|176|Zimmerberg-Basist...|8502221|      Birmensdorf ZH|         0.488|  6.409889233734277| 4.353654749320563|
|176|Zimmerberg-Basist...|8502222| Bonstetten-Wettswil|         0.528|  4.973881682664052| 6.059461876618016|
|176|Zimmerberg-Basist...|8502229|   Urdorf Weihermatt|         0.456|   7.64919276254611| 3.411597787356277|
|176|Zimmerberg-Basist...|8502559|Waldegg, Birmensd...|         0.588|  4.789534136228712| 6.998942810880068|
|176|Zimmerberg-Basist...|8502572|Zürich, Goldbrunn...|         0.421| 2.1568139712470535|11.043960418569181|
|176|Zimme

In [83]:
def return_speed(theta):
    if (theta >=0 and theta <0.1):
        return 4.5
    elif (theta>=0.1 and theta<2.5):
        return 4
    elif (theta>=2.5 and theta<5):
        return 3
    elif (theta>=5 and theta<7.5):
        return 2.5
    elif (theta>=7.5 and theta<10):
        return 2
    elif (theta>=10):
        return 0.4
    else:
        return 6



In [84]:
w.filter(w.slope_angle.isNull()).show()

+---+------------+---+-------------+--------------+-------------------+-----------+
| id|station_name|id2|station_name2|elevation_diff|horizontal_distance|slope_angle|
+---+------------+---+-------------+--------------+-------------------+-----------+
+---+------------+---+-------------+--------------+-------------------+-----------+



In [85]:
from pyspark.sql.functions import udf
speed_udf = udf(return_speed)
w = w.withColumn('speed',speed_udf(w.slope_angle))

In [86]:
w = w.withColumn('walking_time',w['horizontal_distance']/w['speed'])

In [87]:
w = w.withColumn('walk_minutes', functions.round(w.walking_time * 60))

In [48]:
w.show(5)

+---+--------------------+-------+--------------------+--------------+-------------------+------------------+-----+------------------+------------+
| id|        station_name|    id2|       station_name2|elevation_diff|horizontal_distance|       slope_angle|speed|      walking_time|walk_minutes|
+---+--------------------+-------+--------------------+--------------+-------------------+------------------+-----+------------------+------------+
|176|Zimmerberg-Basist...|8502220|              Urdorf|         0.442|  7.900620499193736|3.2020708745119815|    3| 2.633540166397912|       158.0|
|176|Zimmerberg-Basist...|8502221|      Birmensdorf ZH|         0.488|  6.409889233734277| 4.353654749320563|    3| 2.136629744578092|       128.0|
|176|Zimmerberg-Basist...|8502222| Bonstetten-Wettswil|         0.528|  4.973881682664052| 6.059461876618016|  2.5|1.9895526730656208|       119.0|
|176|Zimmerberg-Basist...|8502229|   Urdorf Weihermatt|         0.456|   7.64919276254611| 3.411597787356277|   

In [46]:
w.count()

1078482

In [88]:
w_pandas = w.toPandas()

In [89]:
w_pandas.head()

Unnamed: 0,id,station_name,id2,station_name2,elevation_diff,horizontal_distance,slope_angle,speed,walking_time,walk_minutes
0,176,Zimmerberg-Basistunnel,8502220,Urdorf,0.442,7.90062,3.202071,3.0,2.63354,158.0
1,176,Zimmerberg-Basistunnel,8502221,Birmensdorf ZH,0.488,6.409889,4.353655,3.0,2.13663,128.0
2,176,Zimmerberg-Basistunnel,8502222,Bonstetten-Wettswil,0.528,4.973882,6.059462,2.5,1.989553,119.0
3,176,Zimmerberg-Basistunnel,8502229,Urdorf Weihermatt,0.456,7.649193,3.411598,3.0,2.549731,153.0
4,176,Zimmerberg-Basistunnel,8502559,"Waldegg, Birmensdorferstrasse",0.588,4.789534,6.998943,2.5,1.915814,115.0


In [90]:
w_pandas.to_csv('walking_time_table.csv')

In [91]:
import pandas as pd

In [93]:
walking_pandas = pd.DataFrame.from_csv('walking_time_table.csv')

In [94]:
walking_pandas.head()

Unnamed: 0,id,station_name,id2,station_name2,elevation_diff,horizontal_distance,slope_angle,speed,walking_time,walk_minutes
0,176,Zimmerberg-Basistunnel,8502220,Urdorf,0.442,7.90062,3.202071,3.0,2.63354,158.0
1,176,Zimmerberg-Basistunnel,8502221,Birmensdorf ZH,0.488,6.409889,4.353655,3.0,2.13663,128.0
2,176,Zimmerberg-Basistunnel,8502222,Bonstetten-Wettswil,0.528,4.973882,6.059462,2.5,1.989553,119.0
3,176,Zimmerberg-Basistunnel,8502229,Urdorf Weihermatt,0.456,7.649193,3.411598,3.0,2.549731,153.0
4,176,Zimmerberg-Basistunnel,8502559,"Waldegg, Birmensdorferstrasse",0.588,4.789534,6.998943,2.5,1.915814,115.0


In [28]:
sc.stop()