In [0]:
%run "../set-up/global_variables"

In [0]:
import json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType, BooleanType, DecimalType
from pyspark.sql.functions import col, current_timestamp, when, lit

In [0]:
# Sample a single raw file (the 2022 season) to discover the key names of the
# nested JSON objects. Every season's league_info.json is trusted to be
# formatted consistently, so the schema is derived once here rather than per file.
# NOTE(review): dbutils.fs.head only returns the leading bytes of a file
# (64 KB by default, per the Databricks docs) — confirm the file stays below that.
league_info_raw_as_string = dbutils.fs.head('/mnt/sleeperprojectdl/raw/2022/league_info.json')
league_info_raw_as_dict = json.loads(league_info_raw_as_string)

# Nested structs: one nullable field per key present in the sample file,
# with every field of a given struct sharing the same type.
scoring_settings_schema = StructType([
    StructField(setting_name, FloatType(), True)
    for setting_name in league_info_raw_as_dict['scoring_settings']
])

league_settings_schema = StructType([
    StructField(setting_name, IntegerType(), True)
    for setting_name in league_info_raw_as_dict['settings']
])

metadata_schema = StructType([
    StructField(metadata_key, StringType(), True)
    for metadata_key in league_info_raw_as_dict['metadata']
])

# Top-level schema, written as (field name, type) pairs; every field is nullable.
# The 64-bit style identifiers are read as DecimalType(38, 0).
league_info_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ('league_id', DecimalType(38, 0)),
        ('name', StringType()),
        ('season', StringType()),
        ('status', StringType()),
        ('previous_league_id', DecimalType(38, 0)),
        ('draft_id', DecimalType(38, 0)),
        ('total_rosters', IntegerType()),
        ('bracket_id', DecimalType(38, 0)),
        ('loser_bracket_id', DecimalType(38, 0)),
        ('roster_positions', ArrayType(StringType())),
        ('group_id', StringType()),
        ('last_read_id', StringType()),
        ('last_pinned_message_id', StringType()),
        ('last_message_time', StringType()),
        ('last_message_text_map', StringType()),
        ('last_message_attachment', StringType()),
        ('last_author_is_bot', BooleanType()),
        ('last_author_id', StringType()),
        ('last_author_display_name', StringType()),
        ('last_author_avatar', StringType()),
        ('last_message_id', StringType()),
        ('shard', IntegerType()),
        ('sport', StringType()),
        ('season_type', StringType()),
        ('scoring_settings', scoring_settings_schema),
        ('company_id', StringType()),
        ('avatar', StringType()),
        ('settings', league_settings_schema),
        ('metadata', metadata_schema),
    ]
])

In [0]:
# Collect one cleaned DataFrame per season; they are stacked in a later cell.
dfs_to_union = []

# Iterate through all the league_info files (one per season in ALL_SEASONS)
for season in ALL_SEASONS.keys():
    file_path = f"/mnt/sleeperprojectdl/raw/{season}/league_info.json"

    # dbutils.fs.ls raises for a path that does not exist instead of returning
    # an empty/falsy value, so a plain truthiness check can never reach the
    # "not found" branch. Treat only the file-not-found error as "missing" and
    # let any other failure (permissions, mount problems, ...) surface as-is.
    try:
        file_exists = bool(dbutils.fs.ls(file_path))
    except Exception as ls_error:
        if 'java.io.FileNotFoundException' not in str(ls_error):
            raise
        file_exists = False

    if not file_exists:
        print(f"File not found: {file_path}")
        continue

    league_info_df = spark.read.json(file_path, multiLine=True, schema=league_info_schema)

    # Keep only the columns we need, flattening the nested settings/metadata
    # fields and renaming bracket_id -> playoff_bracket_id and
    # latest_league_winner_roster_id -> champion as they are selected.
    league_info_final_df = (
        league_info_df
        .select(
            col('league_id'),
            col('name'),
            col('season'),
            col('settings.trade_deadline'),
            col('settings.playoff_week_start'),
            col('settings.playoff_teams'),
            col('status'),
            col('previous_league_id'),
            col('draft_id'),
            col('settings.num_teams'),
            col('bracket_id').alias('playoff_bracket_id'),
            col('loser_bracket_id'),
            col('metadata.latest_league_winner_roster_id').alias('champion')
        )
        # A season that is still in progress has no champion yet, so blank the
        # value out for 'in_season' leagues.
        # NOTE(review): only 'in_season' is blanked — confirm whether other
        # non-complete statuses should be treated the same way.
        .withColumn('champion', when(col('status') == 'in_season', lit(None)).otherwise(col('champion')))
        # Stamp each row with the time of ingestion
        .withColumn('ingestion_date', current_timestamp())
    )

    # Add the final DataFrame to dfs_to_union list
    dfs_to_union.append(league_info_final_df)

In [0]:
# Stack the per-season DataFrames into a single DataFrame.
# Fail with a clear message when no file was loaded, rather than with a bare
# IndexError from indexing an empty list.
if not dfs_to_union:
    raise ValueError("No league_info files were loaded; there is nothing to union.")

# Start from the first DataFrame and append the remaining ones. unionByName
# matches columns by name rather than by position, so the stack stays correct
# even if a column order ever differs between the per-season frames.
all_league_info_df = dfs_to_union[0]
for season_league_info_df in dfs_to_union[1:]:
    all_league_info_df = all_league_info_df.unionByName(season_league_info_df)

display(all_league_info_df)

league_id,name,season,trade_deadline,playoff_week_start,playoff_teams,status,previous_league_id,draft_id,num_teams,playoff_bracket_id,loser_bracket_id,champion,ingestion_date
1048353521130741760,Tyler’s Golden Age,2024,13,15,6,in_season,9.17263521006592e+17,1048353521130741761,10,,,,2024-07-09T01:19:35.64Z
917263521006592000,Tyler’s Golden Age,2023,13,15,6,complete,8.335859499958149e+17,917263521006592001,10,1.0458562477879008e+18,1.0458562477962895e+18,8.0,2024-07-09T01:19:35.64Z
833585949995814912,Tyler’s Golden Age,2022,13,15,6,complete,,833585950700474368,10,9.169180108484936e+17,9.169180108526879e+17,1.0,2024-07-09T01:19:35.64Z


In [0]:
# Persist the combined league info DataFrame to the data lake in parquet
# format; "overwrite" mode replaces whatever is already stored at this path.
(
    all_league_info_df.write
    .mode("overwrite")
    .parquet("/mnt/sleeperprojectdl/processed/league_info")
)

In [0]:
%fs
ls /mnt/sleeperprojectdl/processed/

path,name,size,modificationTime
dbfs:/mnt/sleeperprojectdl/processed/2022/,2022/,0,1719970284000
dbfs:/mnt/sleeperprojectdl/processed/2023/,2023/,0,1719970283000
dbfs:/mnt/sleeperprojectdl/processed/2024/,2024/,0,1719970278000
dbfs:/mnt/sleeperprojectdl/processed/league_info/,league_info/,0,1720487977000


In [0]:
# Read the parquet output back from the data lake and render it for inspection.
display(
    spark.read.parquet("/mnt/sleeperprojectdl/processed/league_info")
)

league_id,name,season,trade_deadline,playoff_week_start,playoff_teams,status,previous_league_id,draft_id,num_teams,playoff_bracket_id,loser_bracket_id,champion,ingestion_date
917263521006592000,Tyler’s Golden Age,2023,13,15,6,complete,8.335859499958149e+17,917263521006592001,10,1.0458562477879008e+18,1.0458562477962895e+18,8.0,2024-07-09T01:19:36.752Z
833585949995814912,Tyler’s Golden Age,2022,13,15,6,complete,,833585950700474368,10,9.169180108484936e+17,9.169180108526879e+17,1.0,2024-07-09T01:19:36.752Z
1048353521130741760,Tyler’s Golden Age,2024,13,15,6,in_season,9.17263521006592e+17,1048353521130741761,10,,,,2024-07-09T01:19:36.752Z
