In [0]:
# Declare a text widget named "p_data_source" with an empty default value and
# read its current value into v_data_source.
# NOTE(review): v_data_source is not referenced by any later cell visible in
# this notebook -- confirm whether it should be written out as a column.
dbutils.widgets.text("p_data_source","")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
%run "/Formula1/includes/configuration"

In [0]:
# List the contents of the raw folder to confirm the source files are present.
# raw_folder_path is not defined in this notebook; presumably it is provided by
# the configuration notebook included above -- verify.
dbutils.fs.ls(raw_folder_path)

Out[5]: [FileInfo(path='dbfs:/mnt/formularacedata/raw/circuits.csv', name='circuits.csv', size=10044, modificationTime=1686570285000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/constructors/', name='constructors/', size=0, modificationTime=1686899480000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/constructors.json', name='constructors.json', size=30415, modificationTime=1686570285000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/drivers.json', name='drivers.json', size=180812, modificationTime=1686570285000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/lap_times/', name='lap_times/', size=0, modificationTime=1686570362000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/pit_stops.json', name='pit_stops.json', size=1369387, modificationTime=1686925342000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/qualifying/', name='qualifying/', size=0, modificationTime=1686570383000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/races.csv', name='races.csv', size=116847, modificationT

Out[2]: [FileInfo(path='dbfs:/mnt/formularacedata/presentation/', name='presentation/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/formularacedata/processed/', name='processed/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/', name='raw/', size=0, modificationTime=0)]

In [0]:
# Location of the constructors file inside the raw folder.
origin_file_path = f'{raw_folder_path}/constructors.json'

# Read the JSON file without an explicit schema and preview the first rows.
df = spark.read.json(origin_file_path)
df.show(4)

+-------------+--------------+----------+-----------+--------------------+
|constructorId|constructorRef|      name|nationality|                 url|
+-------------+--------------+----------+-----------+--------------------+
|            1|       mclaren|   McLaren|    British|http://en.wikiped...|
|            2|    bmw_sauber|BMW Sauber|     German|http://en.wikiped...|
|            3|      williams|  Williams|    British|http://en.wikiped...|
|            4|       renault|   Renault|     French|http://en.wikiped...|
+-------------+--------------+----------+-----------+--------------------+
only showing top 4 rows



In [0]:
from pyspark.sql.functions import current_timestamp

# Keep the four descriptive columns (the url column is left out) and append an
# ingestion_date column holding the current timestamp.
new_df = (
    df.select("constructorId", "constructorRef", "name", "nationality")
      .withColumn("ingestion_date", current_timestamp())
)
new_df.show(4)

+-------------+--------------+----------+-----------+--------------------+
|constructorId|constructorRef|      name|nationality|      ingestion_date|
+-------------+--------------+----------+-----------+--------------------+
|            1|       mclaren|   McLaren|    British|2023-06-29 19:15:...|
|            2|    bmw_sauber|BMW Sauber|     German|2023-06-29 19:15:...|
|            3|      williams|  Williams|    British|2023-06-29 19:15:...|
|            4|       renault|   Renault|     French|2023-06-29 19:15:...|
+-------------+--------------+----------+-----------+--------------------+
only showing top 4 rows



In [0]:
# No schema was passed when the JSON file was read, so the column types printed
# here are the ones Spark inferred from the data, plus the ingestion_date column
# added afterwards.
new_df.printSchema()

root
 |-- constructorId: long (nullable = true)
 |-- constructorRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- ingestion_date: timestamp (nullable = false)



In [0]:
# Note: when reading JSON with an explicit schema, the field names declared in
# the schema must be the same as the keys used in the JSON file.
# NOTE(review): schema_df is only previewed here; the later cells visible in
# this notebook build final_df from new_df, not from schema_df.
schema = "constructorId Int, constructorRef String, name String, nationality String"

schema_df = spark.read.schema(schema).json(origin_file_path)
schema_df.show(3)

+-------------+--------------+----------+-----------+
|constructorId|constructorRef|      name|nationality|
+-------------+--------------+----------+-----------+
|            1|       mclaren|   McLaren|    British|
|            2|    bmw_sauber|BMW Sauber|     German|
|            3|      williams|  Williams|    British|
+-------------+--------------+----------+-----------+
only showing top 3 rows



In [0]:
# Rename the camelCase identifier columns to snake_case; every other column is
# carried over unchanged.
final_df = (
    new_df
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("constructorRef", "constructor_ref")
)
final_df.show(3)

+--------------+---------------+----------+-----------+--------------------+
|constructor_id|constructor_ref|      name|nationality|      ingestion_date|
+--------------+---------------+----------+-----------+--------------------+
|             1|        mclaren|   McLaren|    British|2023-06-29 19:15:...|
|             2|     bmw_sauber|BMW Sauber|     German|2023-06-29 19:15:...|
|             3|       williams|  Williams|    British|2023-06-29 19:15:...|
+--------------+---------------+----------+-----------+--------------------+
only showing top 3 rows



In [0]:
# Save the result as the Delta table f1_processed.constructors, replacing any
# data already stored in it.
(
    final_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("f1_processed.constructors")
)


In [0]:
%sql 
-- Preview a few rows of the table written above as a quick sanity check.
-- There is no ORDER BY, so which three rows are returned is not guaranteed.
SELECT * FROM f1_processed.constructors LIMIT 3

constructor_id,constructor_ref,name,nationality,ingestion_date
1,mclaren,McLaren,British,2023-06-29T19:15:40.984+0000
2,bmw_sauber,BMW Sauber,German,2023-06-29T19:15:40.984+0000
3,williams,Williams,British,2023-06-29T19:15:40.984+0000


In [0]:
# End the notebook run and hand the string "success" back as its exit value.
dbutils.notebook.exit("success")

success