<a href="https://colab.research.google.com/github/PedroTechy/DataProcessingEdit/blob/main/spark/challenges_answers/etl_program_challange.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
#As first mentioned i need to create the lake

!mkdir -p /content/lake/bronze

!mkdir -p /content/lake/silver

!mkdir -p /content/lake/gold

In [None]:
import json

import requests

# Snapshot the municipalities endpoint into a local JSON file for inspection.
api_url = "https://api.carrismetropolitana.pt/municipalities"
# The timeout keeps the cell from hanging forever if the API never answers.
response = requests.get(api_url, timeout=30)

if response.status_code == 200:
    data = response.json()
    output_file = "api_response.json"
    with open(output_file, 'w') as file:
        json.dump(data, file, indent=4)
    print(f"API response saved to {output_file}")
else:
    print(f"Failed to fetch data from API. Status code: {response.status_code}")


API response saved to api_response.json


In [19]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import requests


class ETLFlow:
    """Generic extract/load helpers bound to one Spark session."""

    def __init__(self, spark: SparkSession) -> None:
        """Keep the session that every extract and load call goes through."""
        self.spark = spark

    def extract_from_file(self, format: str, path: str, **kwargs) -> DataFrame:
        """Read ``path`` using the data source ``format``.

        Args:
            format: Spark data source name (e.g. ``"parquet"``).
            path: Location to read from.
            **kwargs: Forwarded to the reader as data source options.

        Returns:
            The loaded DataFrame.
        """
        return self.spark.read.format(format).options(**kwargs).load(path)

    @staticmethod
    def _to_json_lines(payload) -> list:
        """Serialize a decoded JSON payload into one JSON string per record.

        A top-level list yields one string per element; any other payload is
        treated as a single record.
        """
        import json  # local import: this cell does not import json itself

        records = payload if isinstance(payload, list) else [payload]
        return [json.dumps(record) for record in records]

    def extract_from_api(self, url: str, schema: StructType = None, timeout: float = 30):
        """Fetch a JSON endpoint and load its records into a DataFrame.

        Args:
            url: Endpoint returning a JSON document (list of records or object).
            schema: Optional schema to apply instead of inferring one.
            timeout: Seconds to wait for the HTTP response.

        Returns:
            A DataFrame with one row per record.

        Raises:
            requests.HTTPError: If the endpoint answers with an error status.
        """
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of trying to parse an error page as data.
        response.raise_for_status()

        # Spark's JSON reader expects JSON text. Handing it Python dicts makes it
        # parse their repr (None/True/False), which is not valid JSON.
        json_lines = self._to_json_lines(response.json())
        rdd = self.spark.sparkContext.parallelize(json_lines)

        reader = self.spark.read
        if schema is not None:
            reader = reader.schema(schema)
        return reader.json(rdd)

    def load(self, df: DataFrame, format: str, path: str, **kwargs) -> None:
        """Write ``df`` to ``path`` in ``format``, replacing existing data.

        Args:
            df: DataFrame to persist.
            format: Spark data source name (e.g. ``"parquet"``).
            path: Destination location.
            **kwargs: Forwarded to the writer as data source options.
        """
        df.write.mode("overwrite").format(format).options(**kwargs).save(path)

class ETLTask(ETLFlow):
    """Bronze -> silver -> gold pipeline over the Carris Metropolitana API.

    Ingestion methods land the raw API feeds in the bronze layer, cleansing
    methods write filtered copies to silver, and ``enrich`` writes the joined
    vehicle dataset to gold. All layers live under ``/content/lake``.
    """

    def __init__(self, spark: SparkSession) -> None:
        """Store the Spark session and declare the schema of each API feed."""
        super().__init__(spark)

        self.vehicle_schema = StructType([
            StructField('bearing', IntegerType(), True),
            StructField('block_id', StringType(), True),
            StructField('current_status', StringType(), True),
            StructField('id', StringType(), True),
            StructField('lat', FloatType(), True),
            StructField('line_id', StringType(), True),
            StructField('lon', FloatType(), True),
            StructField('pattern_id', StringType(), True),
            StructField('route_id', StringType(), True),
            StructField('schedule_relationship', StringType(), True),
            StructField('shift_id', StringType(), True),
            StructField('speed', FloatType(), True),
            StructField('stop_id', StringType(), True),
            StructField('timestamp', TimestampType(), True),
            StructField('trip_id', StringType(), True),
        ])

        self.lines_schema = StructType([
            StructField("color", StringType(), True),
            StructField("facilities", ArrayType(StringType()), True),
            StructField("id", StringType(), True),
            StructField("localities", ArrayType(StringType()), True),
            StructField("long_name", StringType(), True),
            StructField("municipalities", ArrayType(StringType()), True),
            StructField("patterns", ArrayType(StringType()), True),
            StructField("routes", ArrayType(StringType()), True),
            StructField("short_name", StringType(), True),
            StructField("text_color", StringType(), True),
        ])

        self.municipalities_schema = StructType([
            StructField("district_id", StringType(), True),
            StructField("district_name", StringType(), True),
            StructField("id", StringType(), True),
            StructField("name", StringType(), True),
            StructField("prefix", StringType(), True),
            StructField("region_id", StringType(), True),
            StructField("region_name", StringType(), True),
        ])

    @staticmethod
    def _drop_invalid_rows(df: DataFrame, required_columns) -> DataFrame:
        """Drop duplicate rows, all-null rows and rows missing a required column."""
        return (
            df.drop_duplicates()
            # "Corrupted" is taken to mean a row whose every column is null.
            .dropna(how="all")
            .dropna(subset=list(required_columns))
        )

    def ingestion_vehicles(self):
        """Fetch the vehicles feed and write it to bronze, partitioned by ``date``."""
        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/vehicles", schema=self.vehicle_schema)

        # NOTE(review): despite its name, "date" holds the time of day (pattern
        # "HHmmss"), so each distinct second becomes its own partition. Confirm
        # this is the intended partition key before changing it.
        df = df.withColumn("date", date_format(col("timestamp"), "HHmmss"))

        partitioned_path = "/content/lake/bronze/vehicles"

        # repartition(1) funnels the write through a single task.
        df.repartition(1).write.partitionBy("date").format("parquet").mode("overwrite").save(partitioned_path)

    def ingestion_lines(self):
        """Fetch the lines feed and write it to bronze as Parquet."""
        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/lines", schema=self.lines_schema)
        self.load(df=df, format="parquet", path="/content/lake/bronze/lines")

    def ingestion_municipalities(self):
        """Fetch the municipalities feed and write it to bronze as Parquet."""
        df = self.extract_from_api(url="https://api.carrismetropolitana.pt/municipalities", schema=self.municipalities_schema)
        self.load(df=df, format="parquet", path="/content/lake/bronze/municipalities")

    def cleansing_vehicles(self):
        """Clean bronze vehicles and write them to silver.

        Removes duplicates, all-null rows and rows without ``current_status``,
        and renames ``lat``/``lon`` to ``latitude``/``longitude``.
        """
        df = self.extract_from_file(format="parquet", path="/content/lake/bronze/vehicles")
        df = self._drop_invalid_rows(df, ["current_status"])
        df = df.withColumnRenamed("lat", "latitude").withColumnRenamed("lon", "longitude")
        self.load(df=df, format="parquet", path="/content/lake/silver/vehicles")

    def cleansing_lines(self):
        """Clean bronze lines and write them to silver.

        Removes duplicates, all-null rows and rows without ``long_name``.
        """
        df = self.extract_from_file(format="parquet", path="/content/lake/bronze/lines")
        df = self._drop_invalid_rows(df, ["long_name"])
        self.load(df=df, format="parquet", path="/content/lake/silver/lines")

    def cleansing_municipalities(self):
        """Clean bronze municipalities and write them to silver.

        Removes duplicates, all-null rows and rows missing ``name`` or
        ``district_name``.
        """
        df = self.extract_from_file(format="parquet", path="/content/lake/bronze/municipalities")
        df = self._drop_invalid_rows(df, ["name", "district_name"])
        self.load(df=df, format="parquet", path="/content/lake/silver/municipalities")

    # Backward-compatible alias: existing callers use the original misspelling.
    cleansing_municipalaties = cleansing_municipalities

    def enrich(self):
        """Join vehicles with line and municipality names and write to gold.

        The result keeps every vehicle column plus ``line_name`` and
        ``municipality_name``. A vehicle whose line spans several municipalities
        yields one row per municipality. Output is partitioned by ``date``.
        """
        # NOTE(review): inputs come from the bronze layer, so the silver
        # cleansing output is not used here. Confirm whether that is intended.
        reader = self.spark.read
        vehicles = reader.schema(self.vehicle_schema).parquet("/content/lake/bronze/vehicles")
        lines = reader.schema(self.lines_schema).parquet("/content/lake/bronze/lines")
        municipalities = reader.schema(self.municipalities_schema).parquet("/content/lake/bronze/municipalities")

        # One row per (line, municipality) so the municipality join has a scalar
        # key. explode_outer keeps lines whose array is empty or null, so their
        # vehicles still get a line_name.
        lines_exploded = lines.withColumn("municipality_id", explode_outer(lines["municipalities"]))

        vehicles_lines = vehicles.join(lines_exploded, vehicles["line_id"] == lines_exploded["id"], "left")
        vehicles_lines_municipalities = vehicles_lines.join(
            municipalities,
            vehicles_lines["municipality_id"] == municipalities["id"],
            "left",
        )

        result_df = vehicles_lines_municipalities.select(
            vehicles["*"],  # every column from vehicles
            lines_exploded["long_name"].alias("line_name"),
            municipalities["name"].alias("municipality_name"),
        )

        result_df.repartition(1).write.partitionBy("date").format("parquet").mode("overwrite").save("/content/lake/gold/vehicles_enriched")


if __name__ == '__main__':

    from pyspark.sql import SparkSession

    # Local Spark session shared by every task below.
    spark = (
        SparkSession.builder
        .master('local')
        .appName('ETL Program')
        .getOrCreate()
    )
    # Dynamic mode: an overwrite replaces only the partitions present in the
    # DataFrame being written and leaves the others in place.
    spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

    print("Starting ETL program")
    etl = ETLTask(spark)

    # Tasks in execution order: ingestion, then cleansing, then enrichment.
    pipeline = (
        ("Running Task - Ingestion Vehicles", etl.ingestion_vehicles),
        ("Running Task - Ingestion Lines", etl.ingestion_lines),
        ("Running Task - Ingestion Municipalities", etl.ingestion_municipalities),
        ("Running Task - Cleansing Vehicles", etl.cleansing_vehicles),
        ("Running Task - Cleansing Lines", etl.cleansing_lines),
        ("Running Task - Cleansing Municipalaties", etl.cleansing_municipalaties),
        ("Running Task - Enriching phase", etl.enrich),
    )
    for banner, run_task in pipeline:
        print(banner)
        run_task()

    print("ETL program completed")

Starting ETL program
Running Task - Ingestion Vehicles
Running Task - Ingestion Lines
Running Task - Ingestion Municipalities
Running Task - Cleansing Vehicles
Running Task - Cleansing Lines
+-------+----------+----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------+
|  color|facilities|  id|          localities|           long_name|      municipalities|            patterns|              routes|short_name|text_color|
+-------+----------+----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------+
|#C61D23|        []|2115|             [Mafra]|Codeçal (Tapada N...|              [1109]|[2115_0_1, 2115_0_2]|            [2115_0]|      2115|   #FFFFFF|
|#C61D23|        []|2532|[Alverca, Vila Fr...|Alverca(Est) - Lo...|        [1114, 1107]|[2532_0_1, 2532_0_2]|            [2532_0]|      2532|   #FFFFFF|
|#C61D23|        []|3119|[Pinhal de Cima, ..

In [None]:
# Show the lines dataset before (bronze) and after (silver) cleansing.
for lines_path in ("/content/lake/bronze/lines", "/content/lake/silver/lines"):
    spark.read.parquet(lines_path).show()