In [0]:
import pyspark.sql.functions as F
import pickle
from elasticsearch import Elasticsearch
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.types as T
import json
import pandas as pd
import seaborn as sns
from pyspark.sql.functions import isnan, when, count, col
import datetime
from pyspark.sql import Window
from pyspark.sql.functions import monotonically_increasing_id 

In [0]:
def _field(name, data_type):
    """Shorthand for a nullable StructField."""
    return StructField(name, data_type, True)


def _string_wrapper(key):
    """Struct holding a single nullable string under `key` (e.g. '$oid', '$numberLong')."""
    return StructType([_field(key, StringType())])


# Explicit schema handed to spark.read.json for the input file; every field is nullable.
schema = StructType([
    _field('_id', _string_wrapper('$oid')),
    _field('actualDelay', LongType()),
    _field('angle', DoubleType()),
    _field('anomaly', BooleanType()),
    _field('areaId', LongType()),
    _field('areaId1', LongType()),
    _field('areaId2', LongType()),
    _field('areaId3', LongType()),
    _field('atStop', BooleanType()),
    _field('busStop', LongType()),
    _field('calendar', _string_wrapper('$numberLong')),
    _field('congestion', BooleanType()),
    _field('currentHour', LongType()),
    _field('dateType', LongType()),
    _field('dateTypeEnum', StringType()),
    _field('delay', LongType()),
    _field('direction', LongType()),
    _field('distanceCovered', DoubleType()),
    _field('ellapsedTime', LongType()),
    _field('filteredActualDelay', LongType()),
    _field('gridID', StringType()),
    _field('journeyPatternId', StringType()),
    _field('justLeftStop', BooleanType()),
    _field('justStopped', BooleanType()),
    _field('latitude', DoubleType()),
    _field('lineId', StringType()),
    _field('loc', StructType([
        _field('coordinates', ArrayType(DoubleType(), True)),
        _field('type', StringType()),
    ])),
    _field('longitude', DoubleType()),
    _field('poiId', LongType()),
    _field('poiId2', LongType()),
    _field('probability', DoubleType()),
    _field('systemTimestamp', DoubleType()),
    _field('timestamp', _string_wrapper('$numberLong')),
    _field('vehicleId', LongType()),
    _field('vehicleSpeed', LongType()),
])

In [0]:
# The input location is supplied through the "file path" notebook widget.
file_path = dbutils.widgets.get("file path")
# Read the JSON input with the explicit schema instead of inferring one.
df = spark.read.schema(schema).json(file_path)

In [0]:
# Evaluate all timestamp functions in Dublin local time.
spark.conf.set("spark.sql.session.timeZone", "Europe/Dublin")

# Use the F.-qualified functions so this cell does not depend on bare
# star-imported names (such as `hour`) that may be rebound elsewhere in the
# notebook, which would make a re-run of this cell fail.
# The first 10 characters of `timestamp.$numberLong` are treated as epoch
# seconds. Spark's substring() is 1-based (a start of 0 behaves like 1), so the
# start position is written as 1 explicitly.
_epoch_seconds = F.substring(df["timestamp.$numberLong"], 1, 10)
data = df.select(
    df["*"],
    F.to_timestamp(F.from_unixtime(_epoch_seconds)).alias("timeAndDate"),
)

# Add the abbreviated day of week and the hour of each observation.
data = data.select(
    data["*"],
    F.date_format(data["timeAndDate"], "E").alias("DOW"),
    F.hour(data["timeAndDate"]).cast("long").alias("real_hour"),
)

In [0]:
# Build a one-row DataFrame holding the timestamp entered in the "date & time"
# widget. Dedicated names are used here so the notebook-level `schema` and `df`
# (the input-file schema and the raw input frame) are not overwritten, which
# would break a later re-run of the cell that loads the input file.
current_time_schema = StructType([StructField("current_time_1", StringType(), False)])
current_time_input = Row("current_time_1")(dbutils.widgets.get("date & time"))
current_time_raw_df = spark.createDataFrame([current_time_input], current_time_schema)

# Cast the widget string to a timestamp and keep only the typed column.
current_time_df = (
    current_time_raw_df
    .withColumn("current_time", F.col("current_time_1").cast(TimestampType()))
    .drop("current_time_1")
)
display(current_time_df)

current_time
2017-07-03T17:00:00.000+0100


In [0]:
# Derive the day-of-week abbreviation and the hour of the requested time.
# F.hour is used rather than the bare star-imported `hour`, so this cell still
# runs if `hour` has been rebound to a plain value elsewhere in the notebook.
time_settings = (
    current_time_df
    .withColumn("DOW", F.date_format(F.col("current_time"), "E"))
    .withColumn("hour", F.hour(F.col("current_time")).cast("long"))
)

In [0]:
# time_settings holds a single row; collect it once rather than once per field,
# and read the fields by name instead of by position.
_settings_row = time_settings.collect()[0]
DOW = _settings_row["DOW"]
# The hour is stored as `current_hour`: binding it to `hour` would shadow the
# pyspark `hour` function brought in by the star import and break any re-run of
# the cells that call it.
current_hour = _settings_row["hour"]

# Bucket the hour into the four day parts used to filter the estimates table.
if current_hour < 6:
  part_of_the_day = 'early_morning'
elif current_hour < 12:
  part_of_the_day = 'morning'
elif current_hour < 18:
  part_of_the_day = 'noon'
else:
  part_of_the_day = 'evening'

In [0]:
# Keep observations from the 3 hours up to the requested time where the bus was
# at a stop, and record how many seconds before the requested time each one is.
_window_start = F.col('current_time') - F.expr('INTERVAL 3 HOURS')
_in_window = (F.col('timeAndDate') <= F.col('current_time')) & (F.col('timeAndDate') >= _window_start)
_at_stop = F.col("atStop") == 'true'
_seconds_before_now = F.col("current_time").cast("long") - F.col("timeAndDate").cast("long")

data_current_time = (
    data.crossJoin(current_time_df)
    .filter(_in_window & _at_stop)
    .withColumn("datetime_diff_sec", _seconds_before_now)
)

In [0]:
# Register the filtered observations as a temp view so they can be queried with SQL.
temp_table_name = "Dublin_BUS"
data_current_time.createOrReplaceTempView(name=temp_table_name)

In [0]:
# Latest observation time per journey pattern in the Dublin_BUS view.
_max_journey_query = '''
SELECT journeyPatternId, MAX(timeAndDate) AS timeAndDate
FROM Dublin_BUS
GROUP BY journeyPatternId'''
max_journey = spark.sql(_max_journey_query)

In [0]:
# Register the per-journey latest timestamps as a temp view for the SQL join below.
temp_table_name = "max_journey"
max_journey.createOrReplaceTempView(name=temp_table_name)

In [0]:
# For each journey pattern keep the row(s) of Dublin_BUS whose timestamp equals
# that pattern's latest timestamp, exposing the bus position columns.
_latest_location_query = '''
SELECT t1.journeyPatternId, t1.busStop, t1.timeAndDate, t1.datetime_diff_sec, t1.longitude as bus_current_longitude, t1.latitude as bus_current_latitude
FROM
(SELECT *
FROM Dublin_BUS) t1
INNER JOIN
(SELECT *
FROM max_journey) t2
ON t1.journeyPatternId=t2.journeyPatternId AND t1.timeAndDate=t2.timeAndDate
'''
real_time_bus_locations = spark.sql(_latest_location_query)

In [0]:
# Elasticsearch node address, also used by the Spark connector options.
IP = '10.0.0.21'
es = Elasticsearch(hosts=[{'host': IP}])

In [0]:
def read_from_elastic(index, query="", scroll_size="10000", array_field=""):
    """Load an Elasticsearch index as a Spark DataFrame through the ES Spark SQL source.

    Args:
        index: Index name handed to ``load``.
        query: Value for the ``es.query`` option (empty string by default).
        scroll_size: Value for the ``es.scroll.size`` option.
        array_field: Value for the ``es.read.field.as.array.include`` option.

    Returns:
        The DataFrame returned by the reader's ``load(index)``.
    """
    reader_options = {
        "es.nodes.wan.only": "true",
        "es.port": "9200",
        "es.nodes": IP,
        "es.nodes.client.only": "false",
        "pushdown": "true",
        "es.query": query,
        "es.scroll.size": scroll_size,
        "es.scroll.keepalive": "120m",
        "es.read.field.as.array.include": array_field,
    }
    reader = spark.read.format("org.elasticsearch.spark.sql").options(**reader_options)
    return reader.load(index)

In [0]:
# Stop-to-stop travel-time estimates stored in Elasticsearch.
bus_stops_estimated_time = read_from_elastic(index='bus_stops_estimated_time_arrival')

In [0]:
# Keep only the estimates for the requested day of week and part of the day.
_matches_requested_slot = (F.col('DOW') == DOW) & (F.col('part_of_the_day') == part_of_the_day)
bus_stops_estimated_time = bus_stops_estimated_time.where(_matches_requested_slot)

In [0]:
# Show the travel-time estimates kept for the requested day / part of the day.
display(bus_stops_estimated_time)

DOW,from_busStop,journeyPatternId,line,median_time_sec,part_of_the_day,to_busStop
Mon,829,1201001,120,512,noon,80
Mon,829,1201001,120,639,noon,82
Mon,829,1201001,120,798,noon,817
Mon,829,1201001,120,832,noon,819
Mon,829,1201001,120,918,noon,263
Mon,829,1201001,120,958,noon,818
Mon,829,1201001,120,3819,noon,284
Mon,829,1201001,120,4359,noon,4504
Mon,829,1201001,120,4452,noon,7028
Mon,829,1201001,120,4767,noon,30


In [0]:
# Pair each bus's latest stop with the estimates that start from that stop on
# the same journey pattern.
_same_stop = real_time_bus_locations["busStop"] == bus_stops_estimated_time["from_busStop"]
_same_pattern = real_time_bus_locations["journeyPatternId"] == bus_stops_estimated_time["journeyPatternId"]

real_time_to_origin_stop = (
    real_time_bus_locations
    .join(bus_stops_estimated_time, _same_stop & _same_pattern)
    .select(
        real_time_bus_locations["journeyPatternId"],
        bus_stops_estimated_time["median_time_sec"],
        real_time_bus_locations["datetime_diff_sec"],
        bus_stops_estimated_time["to_busStop"],
        real_time_bus_locations["bus_current_longitude"],
        real_time_bus_locations["bus_current_latitude"],
    )
)

In [0]:
# Remaining time to the target stop: median leg time minus the seconds already
# elapsed since the bus was last observed.
_remaining_seconds = F.col("median_time_sec") - F.col("datetime_diff_sec")
real_time_to_origin_stop = real_time_to_origin_stop.withColumn('real_time_to_origin_stop', _remaining_seconds)

In [0]:
# Drop rows with a negative remaining time, discard the helper columns, and
# rename the target stop to `from_busStop` so it can serve as a join key later.
real_time_to_origin_stop = (
    real_time_to_origin_stop
    .where(F.col("real_time_to_origin_stop") >= 0)
    .drop("datetime_diff_sec", "median_time_sec")
    .withColumnRenamed('to_busStop', 'from_busStop')
)

In [0]:
def _widget_as_float(widget_name):
    """Read a notebook widget and convert its value to float."""
    return float(dbutils.widgets.get(widget_name))


# Coordinates of the trip's origin and destination, taken from the widgets.
origin_longitude = _widget_as_float("origin longitude")
origin_latitude = _widget_as_float("origin latitude")
destination_longitude = _widget_as_float("destination longitude")
# NOTE(review): the widget name is spelled "destaniation" here; it presumably
# matches how the widget was created, so it is kept as is.
destination_latitude = _widget_as_float("destaniation latitude")

In [0]:
# One-row DataFrame carrying the origin and destination coordinates.
_point_columns = ("origin_latitude", "origin_longitude", "destination_latitude", "destination_longitude")
_point_values = [origin_latitude, origin_longitude, destination_latitude, destination_longitude]
origin_destination_point = sc.parallelize([_point_values]).toDF(_point_columns)

In [0]:
# Bus stop reference data stored in Elasticsearch.
df_bus_stops = read_from_elastic(index='bus_stops_data')

In [0]:
# Register the bus stop data as a temp view for the SQL projection below.
temp_table_name = "dublin_bus_stops_table"
df_bus_stops.createOrReplaceTempView(name=temp_table_name)

In [0]:
# Project the stop data to: numeric stop id (the part after 'DB' in the unique
# id), a display name, and the stop coordinates.
_bus_stops_query = '''
SELECT 
   INT(split(Unique_British_Isles_Id, 'DB')[1]) AS busStop, 
   CONCAT(Name_without_locality, ", ", Name) AS busStopName, 
   POINT_X AS longitude, 
   POINT_Y AS latitude
FROM dublin_bus_stops_table
'''
dublin_bus_stops_table = spark.sql(_bus_stops_query)

In [0]:
# Attach the single origin/destination row to every bus stop.
distance_tables_to_function = dublin_bus_stops_table.crossJoin(other=origin_destination_point)

In [0]:
# Haversine distance from every bus stop to the origin point.
_origin_half_dlat = F.radians(F.col("latitude") - F.col("origin_latitude")) / 2
_origin_half_dlon = F.radians(F.col("longitude") - F.col("origin_longitude")) / 2
_origin_haversine = (
    F.pow(F.sin(_origin_half_dlat), 2) +
    F.cos(F.radians(F.col("origin_latitude"))) * F.cos(F.radians(F.col("latitude"))) *
    F.pow(F.sin(_origin_half_dlon), 2)
)

distance_tables_origin = (
    distance_tables_to_function
    .withColumn("a", _origin_haversine)
    # 12742000 = 2 * 6371000, i.e. twice the mean Earth radius in metres.
    .withColumn("origin_distance_meters", F.atan2(F.sqrt(F.col("a")), F.sqrt(-F.col("a") + 1)) * 12742000)
)

In [0]:
# Haversine distance from every bus stop to the destination point.
_dest_half_dlat = F.radians(F.col("latitude") - F.col("destination_latitude")) / 2
_dest_half_dlon = F.radians(F.col("longitude") - F.col("destination_longitude")) / 2
_dest_haversine = (
    F.pow(F.sin(_dest_half_dlat), 2) +
    F.cos(F.radians(F.col("destination_latitude"))) * F.cos(F.radians(F.col("latitude"))) *
    F.pow(F.sin(_dest_half_dlon), 2)
)

distance_tables_origin_destination = (
    distance_tables_origin
    .withColumn("b", _dest_haversine)
    # 12742000 = 2 * 6371000, i.e. twice the mean Earth radius in metres.
    .withColumn("destination_distance_meters", F.atan2(F.sqrt(F.col("b")), F.sqrt(-F.col("b") + 1)) * 12742000)
)

In [0]:
# Walking time in seconds is set equal to the distance in metres (so one metre
# of distance counts as one second of walking).
distance_tables_with_time = (
    distance_tables_origin_destination
    .withColumn('origin_walking_time_seconds', F.col("origin_distance_meters"))
    .withColumn('destination_walking_time_seconds', F.col("destination_distance_meters"))
)

# The haversine intermediates are no longer needed.
columns_to_drop = ["a", "b"]
distance_time_walking_bus_stops = distance_tables_with_time.drop(*columns_to_drop)


In [0]:
# Origin-side view of the stops: name/coordinates prefixed with "origin" plus
# the walking time from the origin point.
origin_join = (
    distance_time_walking_bus_stops
    .select("busStop", "busStopName", "longitude", "latitude", "origin_walking_time_seconds")
    .withColumnRenamed("busStopName", "originBusStopName")
    .withColumnRenamed("longitude", "originLongitude")
    .withColumnRenamed("latitude", "originLatitude")
)

In [0]:
# Destination-side view of the stops: name/coordinates prefixed with
# "destination" plus the walking time to the destination point.
destination = distance_time_walking_bus_stops.select(
    "busStop", "busStopName", "longitude", "latitude", "destination_walking_time_seconds"
)
destination_join = (
    destination
    .withColumnRenamed("busStopName", "destinationBusStopName")
    .withColumnRenamed("longitude", "destinationLongitude")
    .withColumnRenamed("latitude", "destinationLatitude")
)

In [0]:
# Enrich each estimated leg with its boarding-stop details, then with its
# alighting-stop details; the redundant `busStop` key is dropped after each join.
_boards_at = bus_stops_estimated_time["from_busStop"] == origin_join["busStop"]
_with_origin = bus_stops_estimated_time.join(origin_join, _boards_at).drop("busStop")

_alights_at = bus_stops_estimated_time["to_busStop"] == destination_join["busStop"]
joined_table = _with_origin.join(destination_join, _alights_at).drop("busStop")

In [0]:
# Attach the live time-to-stop for the same journey pattern and boarding stop.
joined_table = joined_table.join(
    real_time_to_origin_stop,
    on=['journeyPatternId', 'from_busStop'],
)

In [0]:
# Keep only options where the walk to the stop is shorter than the bus's time to reach it.
_reachable_in_time = F.col("origin_walking_time_seconds") < F.col("real_time_to_origin_stop")
joined_table = joined_table.where(_reachable_in_time)

In [0]:
# Waiting time at the stop: the bus's time to reach it minus the walking time.
_wait_seconds = F.col("real_time_to_origin_stop") - F.col("origin_walking_time_seconds")
joined_table = joined_table.withColumn("time_to_wait", _wait_seconds)

In [0]:
# Total estimate in seconds: bus leg + walk to the stop + walk from the stop +
# wait at the stop.
_total_seconds = (
    joined_table["median_time_sec"]
    + joined_table["origin_walking_time_seconds"]
    + joined_table["destination_walking_time_seconds"]
    + joined_table["time_to_wait"]
)

# Number the options 1..N by ascending total time. monotonically_increasing_id()
# only guarantees increasing, unique ids (the partition id is encoded in the
# upper bits), so on a multi-partition result it does not yield consecutive
# ranks; row_number() over an ordered window does. The un-partitioned window
# pulls all rows into one partition, which is acceptable for this small
# result table.
_rank_by_total = Window.orderBy('final_time_estimation')
final_table_to_user = (
    joined_table
    .withColumn('final_time_estimation', _total_seconds)
    # Cast keeps the column type (long) that the previous id-based column had.
    .withColumn("travel_option", F.row_number().over(_rank_by_total).cast("long"))
    .orderBy("travel_option")
)

In [0]:
def _as_minutes(seconds_column, label):
    """Column of final_table_to_user converted from seconds to minutes (3 decimals)."""
    return F.round(final_table_to_user[seconds_column] / 60, 3).alias(label)


# User-facing result: one row per travel option with all durations in minutes.
final_table_to_user2 = final_table_to_user.select(
    final_table_to_user["travel_option"],
    _as_minutes("final_time_estimation", "estimated_total_travel_time(min)"),
    final_table_to_user["line"],
    final_table_to_user["originBusStopName"].alias("from_bus_stop"),
    _as_minutes("origin_walking_time_seconds", "walking_time_to_bus_stop"),
    _as_minutes("time_to_wait", "time_to_wait_in_bus_stop"),
    _as_minutes("median_time_sec", "bus_travel_time"),
    final_table_to_user["destinationBusStopName"].alias("to_bus_stop"),
    _as_minutes("destination_walking_time_seconds", "walking_time_to_final_destination"),
)

In [0]:
# Show at most five rows of the user-facing travel options.
display(final_table_to_user2.limit(5))

travel_option,estimated_total_travel_time(min),line,from_bus_stop,walking_time_to_bus_stop,time_to_wait_in_bus_stop,bus_travel_time,to_bus_stop,walking_time_to_final_destination
1,25.503,083A,"Marguerite Road, Botanic Rd",7.864,0.286,13.3,"Capel Street, Uppr Ormond Quay",4.053
2,28.202,083A,"Marguerite Road, Botanic Rd",7.864,0.286,9.117,"Mary's Lane, Church Street",10.935
3,28.356,083A,"Marguerite Road, Botanic Rd",7.864,0.286,12.317,"Chancery Place, Inns Quay",7.89
4,29.571,083A,"Marguerite Road, Botanic Rd",7.864,0.286,15.967,"Lower Liffey Street, Bachelors Walk",5.454
5,30.526,083A,"Marguerite Road, Botanic Rd",7.864,0.286,7.967,"Capuchin Church, Constitution Hl",14.409


In [0]:
# One hand-written sample user row (the frame is re-read from Elasticsearch further down).
_sample_user_columns = ("Name", "Age", "Instagram", "estimated_arrival_time", "destination_longitude", "destination_latitude")
_sample_user = ["Bob", 31, "https://www.instagram.com/discoverdublin/", "2017-07-03 17:20:00", -6.9658641091910585, 53.94606360456374]
tinder_users = sc.parallelize([_sample_user]).toDF(_sample_user_columns)

In [0]:
# Elasticsearch node address and client used by write_table_to_elastic.
IP = '10.0.0.21'
es = Elasticsearch(hosts=[{'host': IP}])

def write_table_to_elastic(df, schema, table_name: str):
    """Create an Elasticsearch index and append a Spark DataFrame to it.

    Args:
        df: DataFrame to write.
        schema: Request body passed to the index-creation call.
        table_name: Index name, used both for creation and as ``es.resource``.

    HTTP 400 responses from index creation are ignored (``ignore=400``), so the
    write proceeds even when creation is rejected with that status.
    """
    es.indices.create(index=table_name, ignore=400, body=schema)

    writer_options = {
        "es.resource": table_name,
        "es.nodes.wan.only": "true",
        "es.port": "9200",
        "es.nodes": IP,
        "es.nodes.client.only": "false",
        "es.batch.write.retry.count": "6",
    }
    writer = df.write.format("org.elasticsearch.spark.sql").options(**writer_options)
    writer.mode("append").save()

In [0]:
# Index settings and mapping for the tinder_users index. The write call at the
# bottom of this cell is intentionally left commented out.
SCEHMA = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
          # "url.original" is a field name, not an Elasticsearch field type, so
          # index creation with it is rejected (and the 400 is ignored by
          # write_table_to_elastic). URLs are stored as exact values: keyword.
          "Instagram" : { "type": "keyword" }
        }
    }
}
#write_table_to_elastic(tinder_users, SCEHMA, "tinder_users")

In [0]:
# Load the stored users from Elasticsearch (replaces the sample frame above).
tinder_users = read_from_elastic(index='tinder_users')

In [0]:
# Attach the current user's destination to every stored user, renamed so it
# does not clash with each stored user's own destination columns.
_current_user_point = (
    origin_destination_point
    .withColumnRenamed("destination_latitude", 'current_user_destination_latitude')
    .withColumnRenamed("destination_longitude", 'current_user_destination_longitude')
)
tinder_users = tinder_users.crossJoin(_current_user_point).drop("origin_latitude", "origin_longitude")

In [0]:
# Haversine distance between the current user's destination and each stored
# user's destination.
_users_half_dlat = F.radians(F.col("current_user_destination_latitude") - F.col("destination_latitude")) / 2
_users_half_dlon = F.radians(F.col("current_user_destination_longitude") - F.col("destination_longitude")) / 2
_users_haversine = (
    F.pow(F.sin(_users_half_dlat), 2) +
    F.cos(F.radians(F.col("destination_latitude"))) * F.cos(F.radians(F.col("current_user_destination_latitude"))) *
    F.pow(F.sin(_users_half_dlon), 2)
)

tinder_users = (
    tinder_users
    .withColumn("a", _users_haversine)
    # 12742000 = 2 * 6371000, i.e. twice the mean Earth radius in metres.
    .withColumn("destination_distance_meters", F.atan2(F.sqrt(F.col("a")), F.sqrt(-F.col("a") + 1)) * 12742000)
)

In [0]:
# Keep users whose destination is within 1000 m of the current user's and who
# arrive on the same calendar date as the requested time.
_is_nearby = F.col("destination_distance_meters") <= 1000
_same_date = F.to_date(F.col("estimated_arrival_time")) == F.to_date(F.col("current_time"))

tinder_users = (
    tinder_users.crossJoin(current_time_df)
    .filter(_is_nearby & _same_date)
    .select("Name", "Age", "Instagram", "estimated_arrival_time")
)

In [0]:
# Matching users are shown only when the "Tinder Mode" widget is set to ON.
Tinder_mode = dbutils.widgets.get("Tinder Mode")
_shown_user_columns = ("Name", "Age", "Instagram")
if Tinder_mode == 'ON':
  display(tinder_users.select(*_shown_user_columns))

Name,Age,Instagram
Alice,31,https://www.instagram.com/discoverdublin/
