#### Ingest the results JSON file

In [0]:

# Declare a text widget named "data_source" (second argument "testing" is its
# default) and read back whatever value it currently holds. The value is
# stamped onto every row further down as the data_source column.
_data_source_widget = "data_source"
dbutils.widgets.text(_data_source_widget, "testing")
value_data_source = dbutils.widgets.get(_data_source_widget)

In [0]:
%run "../../constants/configuration"

In [0]:
%run "../../utils/common_functions"

In [0]:
from pyspark.sql.functions import current_timestamp, col, lit
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [0]:
%python
# Explicit schema for results.json so Spark does not have to infer types.
# Only resultId is declared non-nullable; every other field may be missing.
results_schema = StructType([
    StructField("resultId", IntegerType(), False),
    StructField("raceId", IntegerType(), True),
    StructField("driverId", IntegerType(), True),
    StructField("constructorId", IntegerType(), True),
    StructField("number", IntegerType(), True),
    StructField("grid", IntegerType(), True),
    StructField("position", IntegerType(), True),
    StructField("positionText", StringType(), True),
    StructField("positionOrder", IntegerType(), True),
    # Points are not always whole numbers (e.g. half-points races award
    # values such as 0.5). Declared as IntegerType, a fractional JSON number
    # cannot be parsed and is nulled by the reader's default permissive mode,
    # silently losing data — so this must be a floating-point column.
    # NOTE(review): if the silver table already exists with an integer
    # points column, confirm how the merge handles the type change.
    StructField("points", FloatType(), True),
    StructField("laps", IntegerType(), True),
    StructField("time", StringType(), True),
    StructField("milliseconds", IntegerType(), True),
    StructField("fastestLap", IntegerType(), True),
    StructField("rank", IntegerType(), True),
    StructField("fastestLapTime", StringType(), True),
    StructField("fastestLapSpeed", FloatType(), True),
    StructField("statusId", IntegerType(), True),
])

In [0]:
# Read results.json with the explicit results_schema rather than inferring
# column types.
# NOTE(review): bronze_container_path is not defined in this notebook —
# presumably it is provided by the %run'd configuration notebook; confirm.
results_json_path = f"{bronze_container_path}/results.json"
results_df = (
    spark.read
    .schema(results_schema)
    .json(results_json_path)
)

##### Rename columns, add the data source and the ingestion date

In [0]:
# Convert the camelCase source columns to snake_case and tag every row with
# the value of the data_source widget.
#
# Fix: the original chain renamed a non-existent "fatestLap" column to
# "fastestLap". withColumnRenamed is a silent no-op when the source column is
# absent, so fastestLap was never renamed and stayed camelCase next to
# fastest_lap_time / fastest_lap_speed. It is now renamed to fastest_lap.
# NOTE(review): this changes the output column name from fastestLap to
# fastest_lap — confirm the existing silver results table and any downstream
# readers expect the snake_case name.
results_with_columns_df = (
    results_df
    .withColumnRenamed("resultId", "result_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("positionText", "position_text")
    .withColumnRenamed("positionOrder", "position_order")
    .withColumnRenamed("fastestLap", "fastest_lap")
    .withColumnRenamed("fastestLapTime", "fastest_lap_time")
    .withColumnRenamed("fastestLapSpeed", "fastest_lap_speed")
    .withColumn("data_source", lit(value_data_source))
)

In [0]:
# Pass the renamed DataFrame through the add_ingestion_date helper.
# NOTE(review): add_ingestion_date is not defined in this notebook —
# presumably it comes from the %run'd common_functions notebook and appends
# an ingestion timestamp column; confirm the column name it adds.
results_with_ingestion_date_df = add_ingestion_date(results_with_columns_df)

##### drop column url

In [0]:
# Drop the "url" column from the frame.
# NOTE(review): results_schema as declared in this notebook has no "url"
# field and no earlier step here adds one, so this drop looks like a no-op.
# Confirm whether a different column (e.g. statusId) was meant to be removed.
results_final_df = results_with_ingestion_date_df.drop(col("url"))

##### drop duplicates

In [0]:
# Keep one row per (race_id, driver_id) pair. Which of the duplicate rows
# survives is left to Spark, since no ordering is applied before this step.
dedup_key_columns = ['race_id', 'driver_id']
results_deduped_df = results_final_df.dropDuplicates(dedup_key_columns)

In [0]:
%python
# Hand the de-duplicated results to merge_delta_data, matching target (tgt)
# and source (src) rows on result_id and race_id.
# NOTE(review): merge_delta_data is not defined in this notebook — presumably
# it comes from the %run'd common_functions notebook; 'silver' / 'results'
# look like the database and table names and 'race_id' like the partition
# column. Confirm against the helper's signature.
merge_condition = "tgt.result_id = src.result_id AND tgt.race_id = src.race_id"
merge_delta_data(
    results_deduped_df,
    'silver',
    'results',
    merge_condition,
    'race_id',
)