In [0]:
%run "../set-up/global_variables"

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DecimalType, ArrayType, MapType, DoubleType
from pyspark.sql.functions import col, explode, current_timestamp, lit, concat
import json

In [0]:
# Build the explicit read schema for the Sleeper `rosters.json` files.
#
# The keys of the nested `settings` object are discovered from a sample file
# (the 2022 season) instead of being hard-coded.
#
# The sample is read in full with spark.read.text(..., wholetext=True), which
# returns the entire file as a single row. dbutils.fs.head() stops after its
# default 65536-byte limit, which would hand json.loads() a truncated document
# as soon as the file grows beyond that size.
rosters_raw_as_string = spark.read.text(
    '/mnt/sleeperprojectdl/raw/2022/rosters.json', wholetext=True
).first()[0]
rosters_raw_as_list = json.loads(rosters_raw_as_string)

# Collect the `settings` keys from every roster (first-seen order, no
# duplicates) so that a key missing from the first roster is not dropped from
# the schema. dict.fromkeys keeps insertion order while de-duplicating.
settings_keys = list(dict.fromkeys(
    setting
    for roster in rosters_raw_as_list
    for setting in (roster.get('settings') or {})
))

# Every discovered setting is read as a nullable integer.
settings_schema = StructType(
    [StructField(setting, IntegerType(), True) for setting in settings_keys]
)

rosters_schema = StructType([
    StructField('taxi', ArrayType(StringType()), True),
    StructField('starters', ArrayType(StringType()), True),
    StructField('settings', settings_schema, True),
    StructField('roster_id', IntegerType(), True),
    StructField('reserve', ArrayType(StringType()), True),
    StructField('players', ArrayType(StringType()), True),
    StructField('player_map', StringType(), True),
    # 18-digit identifiers: read as DecimalType(38,0) rather than IntegerType.
    StructField('owner_id', DecimalType(38,0), True),
    StructField('metadata', 
        StructType([
            StructField('streak', StringType(), True),
            StructField('record', StringType(), True)]),
        True),
    StructField('league_id', DecimalType(38,0), True),
    StructField('keepers', StringType(), True),
    StructField('co_owners', StringType(), True),
])

In [0]:
def _dbfs_path_exists(path):
    """Return True if `path` exists, False if dbutils reports it as missing.

    dbutils.fs.ls() raises for a non-existent path instead of returning an
    empty list, so the result cannot be used directly as a truth test.
    Any error other than "file not found" is re-raised.
    """
    try:
        dbutils.fs.ls(path)
        return True
    except Exception as err:
        if 'java.io.FileNotFoundException' in str(err):
            return False
        raise


def _points_as_double(whole_field, fraction_field):
    """Combine an integer part and an integer fractional part into a double column.

    Assumes the fractional field holds hundredths (fpts=1790, fpts_decimal=18
    -> 1790.18), which matches the two-decimal values this notebook displays.
    TODO confirm against the Sleeper API.

    Done arithmetically because both fields are read as integers: joining them
    as text around a '.' would turn a fractional part of 5 into ".5", not ".05".
    Integer-by-integer `/` in Spark yields a double.
    """
    return (col(whole_field) * 100 + col(fraction_field)) / 100


# Iterate through the rosters file of every season and append its standings
# to the processed standings dataset.
for season in ALL_SEASONS.keys():
    file_path = f"/mnt/sleeperprojectdl/raw/{season}/rosters.json"

    # Skip seasons whose raw file has not been landed yet
    if not _dbfs_path_exists(file_path):
        print(f"File not found: {file_path}")
        continue

    rosters_df = spark.read.json(file_path, schema=rosters_schema, multiLine=True)

    # Disabled: per-season roster/player export, kept for reference.
    # rosters_final_df = rosters_df.select(
    #         col('owner_id'),
    #         col('roster_id'),
    #         explode(col('players')).alias('player_id')
    #     ) \
    #     .withColumn('player_id', col('player_id').cast(IntegerType())) \
    #     .withColumn('ingestion_date', current_timestamp())

    # rosters_final_df.write.mode('overwrite').parquet(f"/mnt/sleeperprojectdl/processed/{season}/rosters")

    standings_df = rosters_df.select(
            col('owner_id'),
            col('roster_id'),
            col('settings.wins'),
            col('settings.losses'),
            _points_as_double('settings.fpts', 'settings.fpts_decimal').alias('points_for'),
            _points_as_double('settings.fpts_against', 'settings.fpts_against_decimal').alias('points_against')
        ) \
        .withColumn('season', lit(season)) \
        .withColumn('ingestion_date', current_timestamp())

    # Explicit cast keeps the output column type pinned to double.
    standings_final_df = standings_df.withColumn('points_for', col('points_for').cast('double')) \
        .withColumn('points_against', col('points_against').cast('double'))

    # NOTE(review): 'append' means re-running this cell adds the same seasons
    # again; clear the target or deduplicate downstream before a re-run.
    standings_final_df.write.mode('append').parquet("/mnt/sleeperprojectdl/processed/standings")

In [0]:
# Read the processed standings dataset back and render it for a visual check.
standings_path = "/mnt/sleeperprojectdl/processed/standings"
standings_check_df = spark.read.parquet(standings_path)
display(standings_check_df)

owner_id,roster_id,wins,losses,points_for,points_against,season,ingestion_date
808516037942288384,1,7,7,1790.18,1707.88,2022,2024-07-17T02:29:10.405Z
450908656402165760,2,9,5,1806.42,1720.25,2022,2024-07-17T02:29:10.405Z
833592897646501888,3,4,10,1586.15,1738.57,2022,2024-07-17T02:29:10.405Z
833593227880824832,4,7,7,1608.18,1659.26,2022,2024-07-17T02:29:10.405Z
833594957678886912,5,7,7,1546.27,1706.72,2022,2024-07-17T02:29:10.405Z
833595239125094400,6,5,9,1667.37,1734.19,2022,2024-07-17T02:29:10.405Z
833595795143012352,7,6,8,1801.26,1721.71,2022,2024-07-17T02:29:10.405Z
833614474954006528,8,12,2,2105.84,1688.91,2022,2024-07-17T02:29:10.405Z
839024284771942400,9,5,9,1449.48,1693.78,2022,2024-07-17T02:29:10.405Z
515068627679768576,10,8,6,1683.91,1673.79,2022,2024-07-17T02:29:10.405Z
