In [0]:
dbutils.widgets.text("p_data_source","")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# importing the configuration notebook
%run "/Formula1/includes/configuration"

In [0]:
file_path = f'{raw_folder_path}/results.json'
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DecimalType, FloatType

schema = StructType( [
    StructField("resultId", IntegerType(),False),
    StructField("raceId", IntegerType(), False),
    StructField("driverId", IntegerType(), False),
    StructField ("constructorId", IntegerType(), False),
    StructField("number", IntegerType(),False),
    StructField ("grid",IntegerType(), False),
    StructField ("position", IntegerType(), False),
    StructField("positionText", StringType(),True),
    StructField ("positionOrder",IntegerType(),True),
    StructField("points", FloatType()),
    StructField("laps", IntegerType()),
    StructField ("time",StringType()),
    StructField ("milliseconds", IntegerType()),
    StructField("fastestLap", IntegerType()),
    StructField ("rank",IntegerType()),
    StructField("fastestLapTime", StringType()),
    StructField("fastestLapSpeed", DecimalType() ),
    StructField ("statusId", IntegerType())

])

In [0]:
df = spark.read.format("json").schema(schema).load(file_path)
df.show(2)

+--------+------+--------+-------------+------+----+--------+------------+-------------+------+----+-----------+------------+----------+----+--------------+---------------+--------+
|resultId|raceId|driverId|constructorId|number|grid|position|positionText|positionOrder|points|laps|       time|milliseconds|fastestLap|rank|fastestLapTime|fastestLapSpeed|statusId|
+--------+------+--------+-------------+------+----+--------+------------+-------------+------+----+-----------+------------+----------+----+--------------+---------------+--------+
|       1|    18|       1|            1|    22|   1|       1|           1|            1|  10.0|  58|1:34:50.616|     5690616|        39|   2|      1:27.452|            218|       1|
|       2|    18|       2|            2|     3|   5|       2|           2|            2|   8.0|  58|     +5.478|     5696094|        41|   3|      1:27.739|            218|       1|
+--------+------+--------+-------------+------+----+--------+------------+-------------+--

In [0]:
from pyspark.sql.functions import col, current_timestamp
new_df = df.select(   col("resultId").alias("result_id"),
                      col("raceId").alias("race_id"),
                      col("driverId").alias("driver_id"),
                      col("constructorId").alias("constructor_id"),
                      "number",
                      "grid",
                      "position",
                      col("positionText").alias("position_text"),
                      col("positionOrder").alias("position_order"),
                      "points",
                      "laps",
                      "time",
                      "milliseconds",
                      col("fastestLap").alias("fastest_lap"),
                      "rank",
                      col("fastestLapTime").alias("fastest_lap_time"),
                      col("fastestLapSpeed").alias("fastest_lap_speed"),
                      current_timestamp().alias("ingestion_date")


                   )

new_df.show(2)

+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+
|result_id|race_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|       time|milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|      ingestion_date|
+---------+-------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+
|        1|     18|        1|             1|    22|   1|       1|            1|             1|  10.0|  58|1:34:50.616|     5690616|         39|   2|        1:27.452|              218|2023-06-20 04:16:...|
|        2|     18|        2|             2|     3|   5|       2|            2|             2|   8.0|  58|     +5.478|     5696094|         41|   3|        1:27.739|              2

In [0]:
destination_file_path = f"{processed_folder_path}/results"

new_df.write.mode("overwrite").format('parquet').partitionBy('race_id').option('path',destination_file_path).save()

In [0]:
spark.read.parquet(destination_file_path).where(col("result_id").isin([1,2,3,4,5])).show()

+---------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+-------+
|result_id|driver_id|constructor_id|number|grid|position|position_text|position_order|points|laps|       time|milliseconds|fastest_lap|rank|fastest_lap_time|fastest_lap_speed|      ingestion_date|race_id|
+---------+---------+--------------+------+----+--------+-------------+--------------+------+----+-----------+------------+-----------+----+----------------+-----------------+--------------------+-------+
|        1|        1|             1|    22|   1|       1|            1|             1|  10.0|  58|1:34:50.616|     5690616|         39|   2|        1:27.452|              218|2023-06-20 04:16:...|     18|
|        2|        2|             2|     3|   5|       2|            2|             2|   8.0|  58|     +5.478|     5696094|         41|   3|        1:27.739|              218|2023-

In [0]:
dbutils.notebook.exit("success")

success