<a href="https://colab.research.google.com/github/PedroTechy/CarrisInsight/blob/streaming_development/spark_jobs/extract_carris_vehicles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Authenticate with Google Cloud


In [14]:
# Interactive OAuth login that stores Application Default Credentials on the
# Colab VM; a later cell points the Hadoop GCS connector at the saved file.
# NOTE(review): the captured output of this cell echoes the sign-in URL and the
# pasted verification code -- clear the output before committing the notebook.
!gcloud auth application-default login

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fapplicationdefaultauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=pL6XOyO1TgeSp7596YtHa4AcZbx5fI&prompt=consent&token_usage=remote&access_type=offline&code_challenge=ePPILVVfpRstIFVNfbMlXMj9kJ8SHDu5PpI92Mt0SOc&code_challenge_method=S256

Once finished, enter the verification code provided in your browser: 4/0AanRRrvkoqgFZqB230wZTvH46K0uKjiuUxJtymcDVx-2NL5l_lWiNxCDPQ2u0d4OOQIQ5w

Credentials saved to file: [/content/.config/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).
Ca

# Step 2: Install PySpark and configure the GCS connector

In [6]:
!pip install pyspark



In [5]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, LongType
from pyspark.sql.functions import min, max, first, last, col, window, from_unixtime, to_timestamp


In [3]:
# Create (or reuse) the Spark session with the GCS connector jar on the
# classpath and eager evaluation enabled for notebook-friendly display.
spark = (
    SparkSession.builder
    .appName('pyspark-run-with-gcp-bucket')
    .config("spark.jars", "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)

# Tell the Hadoop GCS connector which credentials file to use.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set(
    "google.cloud.auth.service.account.json.keyfile",
    "/content/.config/application_default_credentials.json",
)

# Step 3: Set environment variables for Spark and Java

# Step 4: Initialize Spark session

# Step 5: Define GCS input path and output BigQuery table

In [1]:
# Destination identifier for the raw layer.
# NOTE(review): the value looks like "<project>.<dataset>" with no table name,
# and no visible cell reads it -- confirm before wiring up the BigQuery sink.
output_table = "data-eng-dev-437916.data_eng_project_group3_raw"

# GCS prefix that the streaming reader loads the vehicle JSON files from.
input_path = "gs://edit-de-project-streaming-data/carris-vehicles"

In [6]:
# Destructive reset: removes everything under content/lake/processing/, which
# is where this notebook keeps its parquet outputs and the streaming
# checkpoint, so the next streaming run starts from scratch.
!rm -rf content/lake/processing/

# Step 6: Read streaming data from GCS

In [8]:
# Explicit schema for the vehicle JSON files (streaming sources need one up
# front). Field order is preserved exactly; every field is nullable.
_VEHICLE_FIELDS = [
    ("bearing", FloatType()),
    ("block_id", StringType()),
    ("current_status", StringType()),
    ("id", StringType()),
    ("lat", FloatType()),
    ("line_id", StringType()),
    ("lon", FloatType()),
    ("pattern_id", StringType()),
    ("route_id", StringType()),
    ("schedule_relationship", StringType()),
    ("shift_id", StringType()),
    ("speed", FloatType()),
    ("stop_id", StringType()),
    ("timestamp", LongType()),
    ("trip_id", StringType()),
]

schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _VEHICLE_FIELDS]
)

In [25]:

def transform_data(df, batch_id):
  """Process one micro-batch of vehicle positions (``foreachBatch`` callback).

  Two parquet datasets are appended under ``content/lake/processing/vehicles``:

  * ``tests`` -- every incoming row, with ``timestamp`` converted from epoch
    seconds to a Spark timestamp.
  * ``data``  -- one row per vehicle ``id`` with the earliest and latest
    timestamp and position inside a sliding 2-minute window.

  Args:
    df: Micro-batch DataFrame with the vehicle columns (``id``, ``lat``,
      ``lon``, ``timestamp``, ...).
    batch_id: Micro-batch identifier passed by Structured Streaming (unused).

  Returns:
    The per-vehicle summary DataFrame that was appended to ``data``.
  """
  # Function-scope imports keep this cell self-contained.
  from pyspark.sql import Window
  from pyspark.sql import functions as F

  # The DataFrame handed to foreachBatch is a plain batch DataFrame, where a
  # watermark has no effect, so none is declared here. The batch is persisted
  # because it feeds two separate writes.
  transformed_df = df.withColumn("timestamp", to_timestamp(from_unixtime("timestamp"))).persist()
  result_df = None
  try:
    # Write a df with the datetype transform only
    transformed_df.write.mode("append").format("parquet").save("content/lake/processing/vehicles/tests")

    window_spec = window("timestamp", "2 minutes", "5 seconds")
    # first()/last() depend on row order, which is arbitrary after the
    # groupBy shuffle; min_by/max_by pick the position at the earliest and
    # latest timestamp explicitly.
    windowed_df = transformed_df.groupBy("id", window_spec).agg(
        min("timestamp").alias("first_timestamp"),
        max("timestamp").alias("last_timestamp"),
        F.min_by("lat", "timestamp").alias("first_lat"),
        F.min_by("lon", "timestamp").alias("first_lon"),
        F.max_by("lat", "timestamp").alias("last_lat"),
        F.max_by("lon", "timestamp").alias("last_lon"),
    )

    # orderBy().dropDuplicates() does not guarantee which row per id survives.
    # Rank the windows explicitly instead: newest observation first, then the
    # window reaching furthest back, then window start as a final tie-break
    # (overlapping sliding windows often share the same last_timestamp).
    newest_first = Window.partitionBy("id").orderBy(
        col("last_timestamp").desc(),
        col("first_timestamp").asc(),
        col("window.start").asc(),
    )
    result_df = (
        windowed_df
        .withColumn("_rank", F.row_number().over(newest_first))
        .filter(col("_rank") == 1)
        .drop("_rank")
        .persist()  # reused by the write and by the count below
    )
    result_df.write.mode("append").format("parquet").save("content/lake/processing/vehicles/data")
    print(result_df.count())
  finally:
    # Release cached blocks even when a write fails or the query is stopped.
    if result_df is not None:
      result_df.unpersist()
    transformed_df.unpersist()
  return result_df



In [11]:
import math
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in decimal degrees.
        lat2, lon2: Latitude and longitude of the second point, in decimal degrees.

    Returns:
        The haversine distance in kilometres as a float. Inputs that cannot be
        used as numbers (for example ``None``) keep the original best-effort
        behaviour and yield ``0.0`` instead of raising.
    """
    # Earth radius in kilometers
    R = 6371.0
    try:
        # Convert latitude and longitude from degrees to radians
        lat1_rad, lon1_rad = math.radians(lat1), math.radians(lon1)
        lat2_rad, lon2_rad = math.radians(lat2), math.radians(lon2)

        # Differences
        delta_lat = lat2_rad - lat1_rad
        delta_lon = lon2_rad - lon1_rad

        # Haversine formula
        a = math.sin(delta_lat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(delta_lon / 2)**2

        # Rounding can push `a` marginally outside [0, 1] (e.g. near-antipodal
        # points), which used to make sqrt() raise and the function silently
        # return 0. Clamp with comparisons rather than min()/max(): this
        # notebook rebinds those names to the PySpark column functions.
        if a > 1.0:
            a = 1.0
        elif a < 0.0:
            a = 0.0

        c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    except (TypeError, ValueError, OverflowError):
        # Only input-related failures are absorbed; anything else (including
        # KeyboardInterrupt) now propagates instead of being hidden.
        return 0.0

    # Distance
    return R * c

# Example usage: distance between Warsaw and Rome, kept in `distance`
lat1 = 52.2296756  # Warsaw
lon1 = 21.0122287
lat2 = 41.8919300  # Rome
lon2 = 12.5113300
distance = haversine(lat1=lat1, lon1=lon1, lat2=lat2, lon2=lon2)

In [42]:
# Stream the JSON files from GCS, one file per micro-batch.
df = (spark.readStream.option("maxFilesPerTrigger", 1)
    .format("json")
    .schema(schema)
    .load(input_path))

# Each micro-batch is handed to transform_data; progress is tracked in the
# checkpoint directory below.
# NOTE(review): the earlier comment described this checkpoint as shared by two
# message streams, but only this one query is visible here; a checkpoint
# location should belong to a single query -- confirm nothing else reuses it.
query = (df.writeStream
    .outputMode('append')
    .option('checkpointLocation', 'content/lake/processing/vehicles_checkpoint')
    .trigger(processingTime='5 seconds')
    .foreachBatch(transform_data)
    .start()
)

try:
    # Let the stream run for at most 120 seconds.
    query.awaitTermination(120)
finally:
    # Always stop the query, including when the cell is interrupted or the wait
    # raises, so no stream keeps running in the background of the kernel.
    # NOTE(review): stop() interrupts a micro-batch that is still in flight; the
    # py4j traceback captured below this cell was raised inside the parquet
    # save of transform_data and is presumably that interruption -- confirm.
    query.stop()


1133
1127
1120


ERROR:py4j.clientserver:There was an exception while executing the Python Proxy on the Python Side.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 617, in _call_proxy
    return_value = getattr(self.pool[obj_id], method)(*params)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pyspark/sql/utils.py", line 120, in call
    raise e
  File "/usr/local/lib/python3.11/dist-packages/pyspark/sql/utils.py", line 117, in call
    self.func(DataFrame(jdf, wrapped_session_jdf), batch_id)
  File "<ipython-input-25-f94e2fdfd712>", line 20, in transform_data
    result_df.write.mode("append").format("parquet").save("content/lake/processing/vehicles/data")
  File "/usr/local/lib/python3.11/dist-packages/pyspark/sql/readwriter.py", line 1463, in save
    self._jwrite.save(path)
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1322, in __call__
    ret

In [43]:
# Load the per-vehicle window summaries appended by the streaming job.
df = spark.read.parquet("content/lake/processing/vehicles/data")


In [44]:
# Load the timestamp-converted rows appended by the streaming job.
tests = spark.read.parquet("content/lake/processing/vehicles/tests")


In [45]:
# Number of rows loaded from content/lake/processing/vehicles/data.
df.count()

4521

In [46]:
# Number of rows loaded from content/lake/processing/vehicles/tests.
tests.count()

12476

In [47]:
# prompt: create a new column that applies haversine to each row to the values of columns first_lat, first_lon, last_lat and last_lon

from pyspark.sql.functions import udf
import pyspark.sql.functions as F
from pyspark.sql.types import FloatType,   StringType, IntegerType


# Wrap the pure-Python haversine function as a Spark UDF returning a float.
haversine_udf = F.udf(haversine, FloatType())

# Apply the UDF to create the new column
df = df.withColumn(
    "distance",
    haversine_udf(
        F.col("first_lat"),
        F.col("first_lon"),
        F.col("last_lat"),
        F.col("last_lon"),
    ),
)


In [48]:
# Imported here because this cell is the only one that needs it.
from pyspark.sql import Window

window_spec = window("timestamp", "2 minutes")

# Group by vehicle ID and window, then get the first and last timestamps and
# lat/lon values. first()/last() depend on row order, which is arbitrary after
# the groupBy shuffle, so the positions are taken with min_by/max_by on the
# timestamp instead.
windowed_df = tests.groupBy("id", window_spec).agg(
    min("timestamp").alias("first_timestamp"),
    max("timestamp").alias("last_timestamp"),
    F.min_by("lat", "timestamp").alias("first_lat"),
    F.min_by("lon", "timestamp").alias("first_lon"),
    F.max_by("lat", "timestamp").alias("last_lat"),
    F.max_by("lon", "timestamp").alias("last_lon"),
)

# Keep only the most recent window per vehicle. orderBy().dropDuplicates() does
# not guarantee which row survives, so rank the windows explicitly.
latest_first = Window.partitionBy("id").orderBy(
    F.col("last_timestamp").desc(),
    F.col("window.start").desc(),
)
result_df = (
    windowed_df
    .withColumn("_rank", F.row_number().over(latest_first))
    .filter(F.col("_rank") == 1)
    .drop("_rank")
    .withColumn(
        "distance",
        haversine_udf(F.col("first_lat"), F.col("first_lon"), F.col("last_lat"), F.col("last_lon")),
    )
)

In [49]:
# Show only the vehicles whose latitude changed inside their window.
df.where(df.first_lat != df.last_lat).show()

+---+------+---------------+--------------+---------+---------+--------+--------+--------+
| id|window|first_timestamp|last_timestamp|first_lat|first_lon|last_lat|last_lon|distance|
+---+------+---------------+--------------+---------+---------+--------+--------+--------+
+---+------+---------------+--------------+---------+---------+--------+--------+--------+



In [50]:
# Same check on the batch-computed summary: rows whose latitude changed.
result_df.where(result_df.first_lat != result_df.last_lat).show()

+-------+--------------------+-------------------+-------------------+---------+---------+---------+---------+------------+
|     id|              window|    first_timestamp|     last_timestamp|first_lat|first_lon| last_lat| last_lon|    distance|
+-------+--------------------+-------------------+-------------------+---------+---------+---------+---------+------------+
|41|1100|{2025-01-17 08:48...|2025-01-17 08:48:02|2025-01-17 08:48:25|38.721485|-9.202832|38.722572|-9.199898|    0.281819|
|41|1102|{2025-01-17 08:48...|2025-01-17 08:48:10|2025-01-17 08:48:52| 38.71482|-9.241849|38.716415|-9.239246|  0.28709525|
|41|1103|{2025-01-17 08:48...|2025-01-17 08:48:14|2025-01-17 08:48:51|38.726063|-9.238774|38.726112|-9.236004|  0.24039264|
|41|1105|{2025-01-17 08:48...|2025-01-17 08:48:00|2025-01-17 08:48:57|38.725433|-9.310746|38.725048|-9.309947|  0.08149729|
|41|1107|{2025-01-17 08:48...|2025-01-17 08:48:18|2025-01-17 08:48:41|38.700886|-9.227971|38.700726|-9.227794|  0.02354433|
|41|1108

In [None]:
# prompt: convert the column timestamp to timestamp

from pyspark.sql.functions import from_unixtime, to_timestamp


In [None]:
# prompt: group by vehicle in 2 minutes buckets

from pyspark.sql.functions import window, from_unixtime, col, to_timestamp

# Assuming 'vehicles_table' is your table with the vehicle data

# Read the data
vehicle_data = spark.read.parquet("vehicles_table")

# Convert the epoch value to a timestamp type. The other cells of this notebook
# pass `timestamp` to from_unixtime() unscaled and get 2025 dates, so the
# column holds seconds; dividing by 1000 would collapse every row into
# January 1970. No timezone adjustment is applied here.
vehicle_data = vehicle_data.withColumn("timestamp_readable", to_timestamp(from_unixtime(col("timestamp"))))

# Group by vehicle and 2-minute intervals
grouped_vehicle_data = vehicle_data.groupBy(
    "id", window("timestamp_readable", "2 minutes")
).count()


# Show the results
grouped_vehicle_data.show(truncate=False)

# Optional: Write the grouped data to a new location
# grouped_vehicle_data.write.mode("overwrite").parquet("grouped_vehicles_data")

In [None]:
  # Preview the table with `timestamp` rendered as a Spark timestamp.
  (spark.read.table("vehicles_table")
       .withColumn("timestamp", to_timestamp(from_unixtime("timestamp")))
       .show())

+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|bearing|            block_id|current_status|      id|      lat|line_id|      lon|pattern_id|route_id|schedule_relationship|    shift_id|    speed|stop_id|          timestamp|             trip_id|
+-------+--------------------+--------------+--------+---------+-------+---------+----------+--------+---------------------+------------+---------+-------+-------------------+--------------------+
|  183.0|20250116-64010165...| IN_TRANSIT_TO|44|12745| 38.76761|   4720|-9.100393|  4720_0_1|  4720_0|            SCHEDULED|113260234560|      0.0| 060011|2025-01-16 11:16:53|4720_0_1|2700|111...|
|   58.0|20250116-64010273...| IN_TRANSIT_TO|44|12557| 38.57138|   4641| -9.03899|  4641_0_2|  4641_0|            SCHEDULED|111500234560|      0.0| 150013|2025-01-16 11:16:43|4641_0_2|2700|102...|
|   12.0|202501

# Step 7: Write streaming data to BigQuery with auto-table creation

In [None]:
# `streaming_query` is never defined in this notebook, so the original call
# raised NameError on a fresh kernel. Stop whatever streaming queries are still
# active on this session instead; this does nothing when none are running.
for active_query in spark.streams.active:
    active_query.stop()