In [None]:
# Declare a text widget (empty default) and read its current value so the
# notebook can be parameterised from outside.
# NOTE(review): v_data_source is not referenced by any other cell visible in
# this notebook — confirm whether it should be used downstream.
DATA_SOURCE_WIDGET = "p_data_source"
dbutils.widgets.text(DATA_SOURCE_WIDGET, "")
v_data_source = dbutils.widgets.get(DATA_SOURCE_WIDGET)

In [None]:
%run "/Formula1/includes/configuration"

In [None]:
# List the entries under the raw folder; the bare expression is left last so
# the resulting FileInfo list is shown as the cell output.
# raw_folder_path is not defined in this notebook — presumably it comes from
# the configuration notebook pulled in via %run above; confirm.
dbutils.fs.ls(raw_folder_path)

Out[1]: [FileInfo(path='dbfs:/mnt/formularacedata/raw/circuits.csv', name='circuits.csv', size=10044, modificationTime=1686570285000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/constructors.json', name='constructors.json', size=30415, modificationTime=1686570285000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/drivers.json', name='drivers.json', size=180812, modificationTime=1686570285000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/lap_times/', name='lap_times/', size=0, modificationTime=1686570362000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/pit_stops.json', name='pit_stops.json', size=1369387, modificationTime=1686570285000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/qualifying/', name='qualifying/', size=0, modificationTime=1686570383000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/races.csv', name='races.csv', size=116847, modificationTime=1686570285000),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/results.json', name='results.json', size=7165641, modificat

In [None]:
# Location of the raw constructors file inside the raw folder.
origin_file_path = f'{raw_folder_path}/constructors.json'

# Read the JSON file without supplying a schema, then preview four rows.
df = spark.read.json(origin_file_path)
df.show(4)

+-------------+--------------+----------+-----------+--------------------+
|constructorId|constructorRef|      name|nationality|                 url|
+-------------+--------------+----------+-----------+--------------------+
|            1|       mclaren|   McLaren|    British|http://en.wikiped...|
|            2|    bmw_sauber|BMW Sauber|     German|http://en.wikiped...|
|            3|      williams|  Williams|    British|http://en.wikiped...|
|            4|       renault|   Renault|     French|http://en.wikiped...|
+-------------+--------------+----------+-----------+--------------------+
only showing top 4 rows



In [None]:
from pyspark.sql.functions import current_timestamp

# Keep the four constructor attributes (the url column of df is not carried
# over) and append an ingestion_date column holding the current timestamp.
kept_columns = ["constructorId", "constructorRef", "name", "nationality"]
ingestion_date = current_timestamp().alias("ingestion_date")
new_df = df.select(*kept_columns, ingestion_date)
new_df.show(4)

+-------------+--------------+----------+-----------+--------------------+
|constructorId|constructorRef|      name|nationality|      ingestion_date|
+-------------+--------------+----------+-----------+--------------------+
|            1|       mclaren|   McLaren|    British|2023-06-16 07:08:...|
|            2|    bmw_sauber|BMW Sauber|     German|2023-06-16 07:08:...|
|            3|      williams|  Williams|    British|2023-06-16 07:08:...|
|            4|       renault|   Renault|     French|2023-06-16 07:08:...|
+-------------+--------------+----------+-----------+--------------------+
only showing top 4 rows



In [None]:
# No schema was supplied when the JSON file was read, so the column types
# printed here are the ones Spark derived from the file itself (plus the
# ingestion_date column added afterwards).
new_df.printSchema()

root
 |-- constructorId: long (nullable = true)
 |-- constructorRef: string (nullable = true)
 |-- name: string (nullable = true)
 |-- nationality: string (nullable = true)
 |-- ingestion_date: timestamp (nullable = false)



In [None]:
# Note: when a schema is supplied for a JSON source, each field name in the
# schema has to be the same as the corresponding name in the JSON data.
# NOTE(review): schema_df is not used by the later cells visible here —
# final_df is built from new_df, which comes from the schema-less read.
schema = "constructorId Int, constructorRef String, name String, nationality String"

# Same file as before, this time read with the explicit DDL schema above.
schema_df = spark.read.schema(schema).json(origin_file_path)
schema_df.show(3)

+-------------+--------------+----------+-----------+
|constructorId|constructorRef|      name|nationality|
+-------------+--------------+----------+-----------+
|            1|       mclaren|   McLaren|    British|
|            2|    bmw_sauber|BMW Sauber|     German|
|            3|      williams|  Williams|    British|
+-------------+--------------+----------+-----------+
only showing top 3 rows



In [None]:
# Switch the two camelCase identifier columns to snake_case, one rename per
# step; every other column of new_df is carried over unchanged.
renamed_id_df = new_df.withColumnRenamed("constructorId", "constructor_id")
final_df = renamed_id_df.withColumnRenamed("constructorRef", "constructor_ref")
final_df.show(3)

+--------------+---------------+----------+-----------+--------------------+
|constructor_id|constructor_ref|      name|nationality|      ingestion_date|
+--------------+---------------+----------+-----------+--------------------+
|             1|        mclaren|   McLaren|    British|2023-06-16 07:08:...|
|             2|     bmw_sauber|BMW Sauber|     German|2023-06-16 07:08:...|
|             3|       williams|  Williams|    British|2023-06-16 07:08:...|
+--------------+---------------+----------+-----------+--------------------+
only showing top 3 rows



In [None]:
# Target directory for the constructors data under the processed folder.
dest_path = f'{processed_folder_path}/constructors'

# Write final_df as parquet, replacing whatever already exists at dest_path.
final_df.write.format('parquet').mode('overwrite').save(dest_path)

In [None]:
# End the notebook run here, handing back the string "success" as its exit
# value — presumably so a calling notebook can check the outcome; confirm.
dbutils.notebook.exit("success")