<a href="https://colab.research.google.com/github/PedroTechy/CarrisInsight/blob/streaming_development/spark_jobs/extract_carris_vehicles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Authenticate with Google Cloud


In [1]:
# Interactive OAuth login that saves Application Default Credentials (ADC) to a file on
# this runtime; the Spark setup cell later points the GCS connector at that file.
!gcloud auth application-default login

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fapplicationdefaultauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=fgQqhZkl3IXCOxXHYO1XZv60nOVHBA&prompt=consent&token_usage=remote&access_type=offline&code_challenge=KUmO7jgujuJebhujuAgYhwNI9e7vKYTAUkfqajZY4BM&code_challenge_method=S256

Once finished, enter the verification code provided in your browser: [REDACTED]

Credentials saved to file: [/content/.config/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).
Ca

# Step 2: Install Spark

In [2]:
!pip install pyspark



# Step 3: Setup Spark Env and Imports

In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (
    col, count, first, from_unixtime, last, max, max_by, min, min_by,
    row_number, to_timestamp, udf, when, window,
)
from pyspark.sql.types import FloatType, IntegerType, LongType, StringType, StructField, StructType


In [4]:
# Build (or reuse) the Spark session. The GCS connector jar is loaded so that gs:// paths
# can be read; eager evaluation is switched on for DataFrame display in the notebook.
spark = (
    SparkSession.builder
    .appName('pyspark-run-with-gcp-bucket')
    .config("spark.jars", "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)

# Bucket prefix that the streaming read uses as its input location.
gs_input_path = "gs://edit-de-project-streaming-data/carris-vehicles"

# Point the Hadoop GCS configuration at the credentials file written by the gcloud login step.
spark._jsc.hadoopConfiguration().set(
    "google.cloud.auth.service.account.json.keyfile",
    "/content/.config/application_default_credentials.json",
)

In [14]:
# Remove data and checkpoints left under content/lake/ by earlier runs so the streaming
# query starts from a clean directory.
# NOTE: the path is relative to the notebook's working directory, like the output and
# checkpoint paths used by the streaming cell.
!rm -rf content/lake/

# Step 4: Define Schema and User Defined Functions

In [6]:
# Explicit schema for the vehicle JSON records (streaming reads need one up front).
# Every field is nullable; the list order below is the column order.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("bearing", FloatType()),
        ("block_id", StringType()),
        ("current_status", StringType()),
        ("id", StringType()),
        ("lat", FloatType()),
        ("line_id", StringType()),
        ("lon", FloatType()),
        ("pattern_id", StringType()),
        ("route_id", StringType()),
        ("schedule_relationship", StringType()),
        ("shift_id", StringType()),
        ("speed", FloatType()),
        ("stop_id", StringType()),
        ("timestamp", LongType()),
        ("trip_id", StringType()),
    ]
])

In [7]:
import math
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in kilometres.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.

    Returns:
        float: Distance in kilometres. Falls back to ``0.0`` when any
        coordinate is missing or not numeric. The fallback is a float (not the
        int ``0``) so it matches the ``FloatType`` the UDF wrapper declares.
    """
    # Earth radius in kilometers
    R = 6371.0

    try:
        # Convert latitude and longitude from degrees to radians
        lat1_rad, lon1_rad = math.radians(lat1), math.radians(lon1)
        lat2_rad, lon2_rad = math.radians(lat2), math.radians(lon2)
    except (TypeError, ValueError):
        # Only bad/missing inputs are treated as "no distance"; genuine bugs still surface.
        return 0.0

    # Differences
    delta_lat = lat2_rad - lat1_rad
    delta_lon = lon2_rad - lon1_rad

    # Haversine formula
    a = math.sin(delta_lat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon / 2)**2

    # Floating-point rounding can push `a` marginally outside [0, 1], which would make
    # the square roots below raise. Plain comparisons are used deliberately: this
    # notebook imports Spark's `min`/`max`, which shadow the builtins.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Distance
    return R * c

# Spark UDF wrapper around haversine(); the resulting column is declared as FloatType.
haversine_udf = udf(f=haversine, returnType=FloatType())



In [8]:
import requests
import json
def get_stop_location(stop_id, timeout=10):
    """Look up a stop's coordinates from the Carris Metropolitana stops API.

    Args:
        stop_id: Stop identifier; interpolated into the request URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        tuple[float, float] | None: ``(lat, lon)`` on success. ``None`` when
        ``stop_id`` is missing, the request fails or returns an error status,
        the body is not valid JSON, or it lacks usable ``lat``/``lon`` values.
        ``None`` (rather than ``0``) is what a struct-typed UDF can turn into
        a null value.
    """
    if stop_id is None:
        # Nothing to look up; avoid a pointless request.
        return None

    try:
        response = requests.get(
            f"https://api.carrismetropolitana.pt/stops/{stop_id}",
            timeout=timeout,
        )
        response.raise_for_status()
        # Parse the body once and reuse it.
        payload = response.json()
        return (float(payload['lat']), float(payload['lon']))
    except (requests.RequestException, KeyError, TypeError, ValueError):
        # Best-effort lookup: one bad stop must not fail the whole Spark job.
        return None

# Spark UDF wrapper around get_stop_location(); yields a struct with `lat` and `lon`.
# The lookup performs an HTTP request, so the UDF is marked non-deterministic: Spark
# treats UDFs as deterministic by default and may then invoke them more often than they
# appear in the query (for example once per struct field that is read downstream).
calculate_stop_location = udf(
    get_stop_location,
    StructType([
        StructField("lat", FloatType(), False),
        StructField("lon", FloatType(), False)
    ]),
).asNondeterministic()



# Step 5: Start streaming and transformation process

In [22]:
# Stream the JSON files under the bucket prefix, at most one new file per micro-batch.
df = (spark.readStream.option("maxFilesPerTrigger", 1)
    .format("json")
    .schema(schema)
    .load(gs_input_path))
print("will start streaming")


# Turn the numeric `timestamp` (read by from_unixtime as epoch seconds) into a real
# timestamp column so it can drive the window and the watermark.
transformed_df = df.withColumn("timestamp", to_timestamp(from_unixtime("timestamp")))
print("Transformed data")


# Sliding window: 2 minutes long, advancing every 10 seconds.
window_spec = window("timestamp", "2 minutes", "10 seconds")

# Per vehicle, trip and window: earliest/latest timestamp and the values observed at
# those instants. min_by/max_by tie each value to the event timestamp; first()/last()
# and max() on strings do not follow event time inside a grouped aggregate, so they
# could return the position/status/stop of an arbitrary record.
result_df = (
    transformed_df
    .withWatermark("timestamp", "3 minutes")
    .groupBy("id", "trip_id", window_spec)
    .agg(
        max_by("current_status", "timestamp").alias("current_status"),
        max("route_id").alias("route_id"),
        max_by("stop_id", "timestamp").alias("stop_id"),
        min("timestamp").alias("first_timestamp"),
        max("timestamp").alias("last_timestamp"),
        min_by("lat", "timestamp").alias("first_lat"),
        min_by("lon", "timestamp").alias("first_lon"),
        max_by("lat", "timestamp").alias("last_lat"),
        max_by("lon", "timestamp").alias("last_lon"),
    )
    # Straight-line distance (km) between the first and last position in the window.
    .withColumn(
        "distance",
        haversine_udf(col("first_lat"), col("first_lon"), col("last_lat"), col("last_lon")),
    )
    # Seconds elapsed between the first and last observation.
    .withColumn(
        "time_delta", col("last_timestamp").cast("long") - col("first_timestamp").cast("long")
    )
    # km / hours. A window with a single observation has time_delta == 0; leave the speed
    # NULL explicitly instead of dividing by zero (which raises when ANSI mode is on).
    .withColumn(
        "average_speed",
        when(col("time_delta") > 0, col("distance") / (col("time_delta") / 3600)),
    )
)

query = (result_df.writeStream
    .outputMode('append')
    .option('checkpointLocation', 'content/lake/processing/vehicles_checkpoint')
    .trigger(processingTime='10 seconds')
    .start('content/lake/processing/vehicles/data')
)

# Let the stream run for up to 300 seconds, then always stop it -- also when the wait is
# interrupted -- so no query keeps running in the background.
try:
    query.awaitTermination(300)
finally:
    query.stop()


will start streaming
Transformed data


# Step 6: Read and verify results

In [25]:
# Batch-read the files that the streaming query wrote.
df = spark.read.parquet("content/lake/processing/vehicles/data")


In [26]:
# Print the first rows of the aggregated output as a quick sanity check.
df.show()

+-------+--------------------+--------------------+--------------+--------+-------+-------------------+-------------------+---------+---------+---------+---------+-----------+----------+------------------+
|     id|             trip_id|              window|current_status|route_id|stop_id|    first_timestamp|     last_timestamp|first_lat|first_lon| last_lat| last_lon|   distance|time_delta|     average_speed|
+-------+--------------------+--------------------+--------------+--------+-------+-------------------+-------------------+---------+---------+---------+---------+-----------+----------+------------------+
|43|2255|3013_0_2_2300_232...|{2025-01-17 23:34...| IN_TRANSIT_TO|  3013_0| 020081|2025-01-17 23:35:48|2025-01-17 23:35:52| 38.67038|-9.159744| 38.67054|-9.159929|0.023986982|         4|21.588283963501453|
|42|2723|2764_0_1|1|1|2310...|{2025-01-17 23:34...| IN_TRANSIT_TO|  2764_0| 071137|2025-01-17 23:35:58|2025-01-17 23:35:58|38.804203|-9.157618|38.804203|-9.157618|        0.0| 

In [27]:
# One row per vehicle: the window with the most recent observation. Among windows that
# share the same last_timestamp, prefer the one with the longest time span.
# A ranked window is used because orderBy() followed by dropDuplicates() does not
# guarantee which of the duplicate rows is kept.
final = (
    df.withColumn(
        "_recency_rank",
        row_number().over(
            Window.partitionBy("id").orderBy(
                col("last_timestamp").desc(), col("time_delta").desc()
            )
        ),
    )
    .filter(col("_recency_rank") == 1)
    .drop("_recency_rank")
    # Coordinates of the stop referenced by the row (struct with `lat` and `lon`).
    .withColumn("stop", calculate_stop_location(col("stop_id")))
    # Straight-line distance (km) from the last known position to that stop.
    .withColumn(
        "distance_till_stop",
        haversine_udf(col("last_lat"), col('last_lon'), col('stop.lat'), col('stop.lon')),
    )
    # km / (km/h) = hours. Left NULL when the speed is unknown or zero, instead of
    # dividing by zero (which raises when ANSI mode is on).
    .withColumn(
        "stop_eta",
        when(col("average_speed") > 0, col("distance_till_stop") / col("average_speed")),
    )
)

In [28]:
# Print the per-vehicle rows together with the stop location, distance and ETA columns.
final.show()

+-------+--------------------+--------------------+--------------+--------+-------+-------------------+-------------------+---------+---------+---------+---------+--------+----------+-------------+--------------------+------------------+--------+
|     id|             trip_id|              window|current_status|route_id|stop_id|    first_timestamp|     last_timestamp|first_lat|first_lon| last_lat| last_lon|distance|time_delta|average_speed|                stop|distance_till_stop|stop_eta|
+-------+--------------------+--------------------+--------------+--------+-------+-------------------+-------------------+---------+---------+---------+---------+--------+----------+-------------+--------------------+------------------+--------+
|41|1177|1722_0_1_2300_232...|{2025-01-17 23:34...|   INCOMING_AT|  1722_0| 120069|2025-01-17 23:35:55|2025-01-17 23:35:55|38.705444|-9.240254|38.705444|-9.240254|     0.0|         0|         NULL|{38.70538, -9.240...|       0.007317034|    NULL|
|41|1191|160