In [0]:
# ---------- Imports ----------

from pyspark.sql.functions import col, isnull, when, to_date

In [0]:
# ---------- Set variables ----------

# Both containers sit behind the same storage endpoint; only the container
# name in front of the "@" differs.
_ADLS_HOST = "datapr0ject.dfs.core.windows.net"

# Root URIs (trailing slash included) of the input and output containers.
bronze_adls = f"abfss://bronze@{_ADLS_HOST}/"
silver_adls = f"abfss://silver@{_ADLS_HOST}/"

# Sport name -> id (stored as a string) that is compared against `idLeague`
# when the events are filtered.
SPORT_ID_MAP = {
    "Fighting": "4443",
}

In [0]:
# ---------- Load the data from bronze container ----------

# Full URI of the season export inside the bronze container.
season_file = f"{bronze_adls}2025-07-26_4443_season_data.json"

# Multiline mode lets a single JSON record span several lines of the file.
json_reader = spark.read.option("multiline", "true")
df = json_reader.json(season_file)

In [0]:
# ---------- Clean and filter the data ----------

# Id that `idLeague` must equal for a row to be kept.
correct_league_id = SPORT_ID_MAP["Fighting"]

# Columns carried into the output, in output order. `strResult` is appended
# last: it is null-filled below, and the earlier version of this cell left it
# out of the select, so that normalisation was silently thrown away.
output_columns = [
    "idEvent", "strEvent", "strPostponed", "dateEvent", "strVenue",
    "strCity", "strCountry", "idLeague", "strSport", "strDescriptionEN",
    "strResult",
]

cleaned_df = (df
    # Keep only rows that match both the sport name and the expected league id.
    .filter((col("strSport") == "Fighting") & (col("idLeague") == correct_league_id))
    # At most one row per event id.
    .dropDuplicates(["idEvent"])
    # Turn the yyyy-MM-dd string into a proper date column.
    .withColumn("dateEvent", to_date(col("dateEvent"), "yyyy-MM-dd"))
    # Replace a missing result with the literal "Unknown".
    .withColumn("strResult", when(col("strResult").isNull(), "Unknown").otherwise(col("strResult")))
    .select(*output_columns)
    # Boolean view of strPostponed: True only for the exact value "yes";
    # every other value, including null, maps to False.
    .withColumn("postponed_flag", when(col("strPostponed") == "yes", True).otherwise(False))
)

In [0]:
# ---------- Saving the cleaned data ----------

# Parquet output goes to the root of the silver container.
# NOTE(review): append mode adds these rows on every execution, so re-running
# the notebook against the same input file writes the same events again —
# confirm whether downstream readers de-duplicate.
cleaned_df.write.parquet(silver_adls, mode="append")