### Ingest circuits.csv file

##### Read the CSV file using the Spark DataFrame reader

In [0]:
# Declare a text widget named "p_data_source" (default: empty string) and read
# its current value; the value is later written into the data_source column.
dbutils.widgets.text("p_data_source","")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Check the mount point: show the storage mounts reported by dbutils.fs.mounts()
display(dbutils.fs.mounts())

In [0]:
%fs
ls /mnt/formula1dlstudy/raw

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql.functions import col, current_timestamp, lit

In [0]:
# Explicit schema for circuits.csv, written as (column name, Spark type, nullable)
# triples. Only circuitId is declared non-nullable.
circuit_columns = [
    ("circuitId", IntegerType(), False),
    ("circuitRef", StringType(), True),
    ("name", StringType(), True),
    ("location", StringType(), True),
    ("country", StringType(), True),
    ("lat", DoubleType(), True),
    ("lng", DoubleType(), True),
    ("alt", IntegerType(), True),
    ("url", StringType(), True),
]

circuits_schema = StructType(
    fields=[
        StructField(column, spark_type, nullable)
        for column, spark_type, nullable in circuit_columns
    ]
)

In [0]:
# Load circuits.csv with the header option enabled and the explicit
# circuits_schema applied.
# NOTE(review): raw_folder_path is not defined in this notebook — presumably it
# comes from the included configuration notebook; confirm.
circuits_csv_path = f"{raw_folder_path}/circuits.csv"

circuits_df = (
    spark.read
    .option("header", "true")
    .schema(circuits_schema)
    .csv(circuits_csv_path)
)

In [0]:
# Print a plain-text preview of the loaded rows.
circuits_df.show()

In [0]:
# Render the same DataFrame through the notebook's display() helper.
display(circuits_df)

In [0]:
# Print the DataFrame's schema to confirm the explicit schema was applied.
circuits_df.printSchema()

In [0]:
# Show summary statistics for the columns as a quick sanity check of the values.
circuits_df.describe().show()

##### Select the columns

In [0]:
# Keep every column declared in the schema except "url". Plain column names are
# passed here; wrapping each name in col(...) selects the same columns.
circuits_selected_df = circuits_df.select(
    "circuitId",
    "circuitRef",
    "name",
    "location",
    "country",
    "lat",
    "lng",
    "alt",
)

##### Rename the columns
1. The cell below uses `withColumnRenamed`; alternatively, `.alias()` can be applied to the `col()` expressions when selecting.

In [0]:
# Rename the camelCase / abbreviated source columns to snake_case names, then
# add a data_source column holding the value of the p_data_source widget.
circuits_renamed_df = (
    circuits_selected_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
    .withColumn("data_source", lit(v_data_source))
)

In [0]:
# Pass the renamed DataFrame through the add_ingestion_date helper.
# NOTE(review): add_ingestion_date is not defined in this notebook — presumably
# it is provided by one of the %run includes and adds an ingestion timestamp
# column, as the inline alternative below suggests; confirm.
circuits_final_df = add_ingestion_date(circuits_renamed_df)
# Inline alternative, kept for reference (lit() wraps a literal value as a column):
#circuits_final_df = circuits_renamed_df.withColumn("ingestion_date", current_timestamp())\
#            .withColumn("env", lit("Prod"))

In [0]:
# Inspect the final DataFrame before it is written out.
display(circuits_final_df)

##### Write the result as a parquet file

In [0]:
#circuits_final_df.write.mode("overwrite").parquet(f"{processed_folder_path}/circuits")

In [0]:
# Persist the final DataFrame as the table f1_processed.circuits in parquet
# format so it can be queried with SQL; "overwrite" mode is used on every run.
circuits_writer = circuits_final_df.write.mode("overwrite").format("parquet")
circuits_writer.saveAsTable("f1_processed.circuits")

- > In the result we see part-00000 below, because we didn't partition the data.

In [0]:
#%fs
#ls /mnt/formula1dlstudy/processed/circuits 

In [0]:
# Read the processed circuits data back by path and display it as a check.
# NOTE(review): the write in this notebook goes through
# saveAsTable("f1_processed.circuits"), not to this path directly. This read
# assumes the f1_processed database stores its tables under
# processed_folder_path — confirm, otherwise this may fail or show stale files.
processed_circuits_path = f"{processed_folder_path}/circuits"
processed_circuits_df = spark.read.parquet(processed_circuits_path)
display(processed_circuits_df)

In [0]:
%sql
SELECT * FROM f1_processed.circuits

In [0]:
# End the notebook run, passing "Success" as its exit value.
dbutils.notebook.exit("Success")