<a href="https://colab.research.google.com/github/PedroTechy/CarrisInsight/blob/streaming_development/spark_jobs/extract_carris_vehicles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
import utils as utils
from pyspark.sql.functions import  col, max
from datetime import  timedelta

In [2]:
spark = SparkSession.builder.appName("StreamingApp").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/24 12:39:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/24 12:39:06 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Step 1: Read and verify results

In [3]:
# Get stop location data as dataframe
stops = utils.get_stops(spark)

In [27]:
# Read current parquet files
df = spark.read.format("parquet").load("content/lake/processing/vehicles/data")
df.count()

                                                                                

8708

In [23]:
max_timestamp = df.select(max("last_timestamp")).collect()[0][0]
minutes_window = max_timestamp - timedelta(seconds=120)
minutes_window

                                                                                

datetime.datetime(2025, 1, 22, 10, 48, 59)

In [24]:
df.show(2)


+-------+--------------------+--------------------+--------------+--------+-------+-------------------+-------------------+---------+---------+--------+---------+------------------+----------+-----------------+
|     id|             trip_id|              window|current_status|route_id|stop_id|    first_timestamp|     last_timestamp|first_lat|first_lon|last_lat| last_lon|          distance|time_delta|    average_speed|
+-------+--------------------+--------------------+--------------+--------+-------+-------------------+-------------------+---------+---------+--------+---------+------------------+----------+-----------------+
|41|1163|1512_0_2_1030_105...|{2025-01-22 10:48...|    STOPPED_AT|  1512_0| 171118|2025-01-22 10:49:18|2025-01-22 10:50:21| 38.83423| -9.30891|38.83009|-9.303944| 7304.955234146504|        63|417426.0133798002|
|41|1244|1209_1_2_1030_105...|{2025-01-22 10:48...| IN_TRANSIT_TO|  1209_1| 170457|2025-01-22 10:48:43|2025-01-22 10:48:43|  38.7678|-9.298641| 38.7678|-9.2

## Step 2: get only last 2 minutes and calculate eta till next stop


In [26]:
import pyspark.sql.functions as F

view = (df
 .filter(col("last_timestamp") > minutes_window)
 .join(stops, how='left', on='stop_id')
 .withColumn("radian",  F.radians(col('last_lat')))
# .transform(haversine_without_udf, 'last_lat', 'last_lon', 'stop_lat', 'stop_lon')
  .withColumn("distance_till_stop", utils.haversine_udf(col("last_lat"), col('last_lon'), col('stop_lat'), col('stop_lon')))
.withColumn(
                   "time_delta", col("last_timestamp").cast("long") -
                   col("first_timestamp").cast("long")
               ).withColumn("average_speed", col("distance") / (col("time_delta") / 3600))
 .withColumn("stop_eta", col("distance_till_stop") / col("average_speed") *60)
 )
view.show()





+-------+--------+--------------------+--------------------+--------------+--------+-------------------+-------------------+---------+---------+---------+---------+------------------+----------+------------------+---------+---------+------------------+------------------+--------------------+
|stop_id|      id|             trip_id|              window|current_status|route_id|    first_timestamp|     last_timestamp|first_lat|first_lon| last_lat| last_lon|          distance|time_delta|     average_speed| stop_lat| stop_lon|            radian|distance_till_stop|            stop_eta|
+-------+--------+--------------------+--------------------+--------------+--------+-------------------+-------------------+---------+---------+---------+---------+------------------+----------+------------------+---------+---------+------------------+------------------+--------------------+
| 070502| 42|2347|2733_0_1|1|1|1015...|{2025-01-22 10:49...|    STOPPED_AT|  2733_0|2025-01-22 10:49:01|2025-01-22 10:50:

                                                                                

In [73]:
view.filter(col('id') =='41|1205').show()




+-------+-------+--------------------+--------------------+--------------+--------+-------------------+-------------------+---------+---------+---------+---------+----------+----------+------------------+---------+---------+------------------+-------------------+
|stop_id|     id|             trip_id|              window|current_status|route_id|    first_timestamp|     last_timestamp|first_lat|first_lon| last_lat| last_lon|  distance|time_delta|     average_speed| stop_lat| stop_lon|distance_till_stop|           stop_eta|
+-------+-------+--------------------+--------------------+--------------+--------+-------------------+-------------------+---------+---------+---------+---------+----------+----------+------------------+---------+---------+------------------+-------------------+
| 170655|41|1205|1622_0_1_1600_162...|{2025-01-19 16:18...| IN_TRANSIT_TO|  1622_0|2025-01-19 16:18:47|2025-01-19 16:20:20| 38.76906| -9.30184| 38.77006|-9.306187|0.39301717|        93| 15.21356797987415|38.7

                                                                                