#### Ingest circuits.csv file

In [0]:
# Declare a text widget named "p_data_source" (empty default) and read its current value.
# The value is later stamped onto every row as the data_source column.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Declare a text widget named "p_file_date" (default "2021-03-21") and read its current value.
# The value selects the dated sub-folder of the raw path to read from and is
# also stamped onto every row as the file_date column.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

## Databricks Workflow
##### Invoking one notebook from another

In [0]:
%run "../includes/common_functions"

#### Step 1 - Read the CSV file using the Spark DataFrame reader

In [0]:
from pyspark.sql.types import StringType, StructField, StructType, IntegerType, DoubleType

In [0]:
# Explicit schema handed to the CSV reader so that column types are declared
# up front rather than inferred. Only circuitId is declared non-nullable.
circuit_schema = StructType(
    fields=[
        StructField("circuitId", IntegerType(), False),
        StructField("circuitRef", StringType(), True),
        StructField("name", StringType(), True),
        StructField("location", StringType(), True),
        StructField("country", StringType(), True),
        StructField("lat", DoubleType(), True),
        StructField("lng", DoubleType(), True),
        StructField("alt", DoubleType(), True),
        StructField("url", StringType(), True),
    ]
)

In [0]:
# Load circuits.csv from the raw folder for the requested file date.
# The first row is treated as the header, and the explicit circuit_schema is
# applied, so the "inferSchema" reader option is not used.
circuit_df = (
    spark.read
    .option("header", True)
    .schema(circuit_schema)
    .csv(f"{raw_folder_path}/{v_file_date}/circuits.csv")
)


###### Select only the required columns

In [0]:

# circuit_selected_df = circuit_df.select("circuitid","circuitRef","name","location","country","lat","lng","alt")

# circuit_selected_df = circuit_df.select(circuit_df.circuitid,circuit_df.circuitRef,circuit_df.name,circuit_df.location,circuit_df.country,circuit_df.lat,circuit_df.lng,circuit_df.alt)

# circuit_selected_df = circuit_df.select(circuit_df["circuitid"],circuit_df["circuitRef"],circuit_df["name"],circuit_df["location"],circuit_df["country"],circuit_df["lat"],circuit_df["lng"],circuit_df["alt"])

In [0]:
from pyspark.sql.functions import col

In [0]:
# Keep the eight columns needed downstream; the schema's "url" column is left out.
circuit_selected_df = circuit_df.select(
    col("circuitId"),
    col("circuitRef"),
    col("name"),
    col("location"),
    col("country"),
    col("lat"),
    col("lng"),
    col("alt"),
)

In [0]:
# Render the selected columns to check the projection.
display(circuit_selected_df)

circuitId,circuitRef,name,location,country,lat,lng,alt
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10.0
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18.0
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7.0
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109.0
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130.0
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7.0
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13.0
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228.0
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153.0
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103.0


In [0]:
# circuit_selected_df = circuit_df.select(col("circuitid").alias("circuit_id"),col("circuitRef").alias("circuit_ref"),
   #                                      col("name"),col("location"),col("country"),col("lat").alias("latitude"),col("lng").alias("longitude"),col("alt").alias("altitude"))

In [0]:
from pyspark.sql.functions import lit

##### Rename columns and add the data_source and file_date columns

In [0]:
# Rename the source columns to snake_case / descriptive names, then tag every
# row with the widget-supplied data source and file date as literal columns.
circuit_renamed_df = (
    circuit_selected_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# Render the renamed DataFrame, including the new data_source and file_date columns.
display(circuit_renamed_df)

circuit_id,circuit_ref,name,location,country,latitude,longitude,altitude,data_source,file_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10.0,testing,2021-03-21
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18.0,testing,2021-03-21
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7.0,testing,2021-03-21
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109.0,testing,2021-03-21
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130.0,testing,2021-03-21
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7.0,testing,2021-03-21
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13.0,testing,2021-03-21
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228.0,testing,2021-03-21
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153.0,testing,2021-03-21
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103.0,testing,2021-03-21


##### Add ingestion date to the dataframe

In [0]:
# add_injection_date is not defined in this notebook; presumably it is brought
# in by the %run of ../includes/common_functions (TODO confirm). The rendered
# result shows it appending an "injestion_date" timestamp column.
circuit_final_df = add_injection_date(circuit_renamed_df)

In [0]:
# Render the final DataFrame that will be written out.
display(circuit_final_df)

circuit_id,circuit_ref,name,location,country,latitude,longitude,altitude,data_source,file_date,injestion_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10.0,testing,2021-03-21,2024-04-21T14:38:02.994Z
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18.0,testing,2021-03-21,2024-04-21T14:38:02.994Z
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7.0,testing,2021-03-21,2024-04-21T14:38:02.994Z
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109.0,testing,2021-03-21,2024-04-21T14:38:02.994Z
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130.0,testing,2021-03-21,2024-04-21T14:38:02.994Z
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7.0,testing,2021-03-21,2024-04-21T14:38:02.994Z
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13.0,testing,2021-03-21,2024-04-21T14:38:02.994Z
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228.0,testing,2021-03-21,2024-04-21T14:38:02.994Z
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153.0,testing,2021-03-21,2024-04-21T14:38:02.994Z
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103.0,testing,2021-03-21,2024-04-21T14:38:02.994Z


#### The initial processed data was saved as Parquet files

circuit_final_df.write.mode("overwrite").parquet(f"{processed_folder_path}/circuits")

In [0]:
# circuit_final_df.write.mode("overwrite").format("parquet").saveAsTable("f1_processed.circuits")

Converting to a Delta table

In [0]:
# Save the final DataFrame as the Delta table f1_processed.circuits.
# "overwrite" mode replaces whatever the table currently holds.
(
    circuit_final_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("f1_processed.circuits")
)

In [0]:
%sql
-- Query the f1_processed.circuits table to check what was written.

select * from f1_processed.circuits

circuit_id,circuit_ref,name,location,country,latitude,longitude,altitude,data_source,file_date,injestion_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103.0,testing,2021-03-21,2024-04-21T14:38:03.572Z


In [0]:
# Read the Delta data back directly from the processed folder path.
# NOTE(review): this assumes the f1_processed.circuits table is stored at
# {processed_folder_path}/circuits — confirm against the database location.
df = spark.read.format("delta").load(f"{processed_folder_path}/circuits")

In [0]:
# Render the data read back from the Delta location.
display(df)

circuit_id,circuit_ref,name,location,country,latitude,longitude,altitude,data_source,file_date,injestion_date
1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
9,silverstone,Silverstone Circuit,Silverstone,UK,52.0786,-1.01694,153.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
10,hockenheimring,Hockenheimring,Hockenheim,Germany,49.3278,8.56583,103.0,testing,2021-03-21,2024-04-21T14:38:03.572Z
