In [0]:
%run "../set-up/global_variables"

In [0]:
from pyspark.sql.functions import col, concat, lit, current_timestamp
from pyspark.sql.types import StructType, StructField, DecimalType, IntegerType, StringType, BooleanType
import json

In [0]:
# Load the 2022 users file in full so its metadata keys can be inspected.
# dbutils.fs.head() only returns the first maxBytes (65536 by default) of a
# file, so a larger users.json would be truncated and json.loads() would fail
# on the partial document. Reading with wholetext=True yields the entire file
# as a single string.
users_as_string = spark.read.text(
    "/mnt/sleeperprojectdl/raw/2022/users.json", wholetext=True
).first()[0]
users_as_json = json.loads(users_as_string)

# Collect metadata keys across *all* users (first-seen order), not just the
# first one: a key missing from the first user's metadata would otherwise be
# missing from the schema. "team_name" is always included because the load
# cell selects metadata.team_name; with the key in the schema, users without
# a team name yield null instead of an unresolved-column error.
metadata_keys = {"team_name": None}
for user in users_as_json:
    # `or {}` tolerates users whose metadata is absent or null.
    metadata_keys.update(dict.fromkeys(user.get("metadata") or {}))

# Every metadata value is read as a nullable string.
metadata_schema = StructType([StructField(key, StringType(), True) for key in metadata_keys])

# Explicit schema applied when reading each season's users.json.
users_schema = StructType([
    StructField("user_id", DecimalType(38,0), True),
    StructField("settings", StringType(), True),
    StructField("metadata", metadata_schema, True),
    StructField("league_id", DecimalType(38,0), True),
    StructField("is_owner", BooleanType(), True),
    StructField("is_bot", BooleanType(), True),
    StructField("display_name", StringType(), True),
    StructField("avatar", StringType(), True)
])

In [0]:
def _raw_file_exists(path):
    """Return True if `path` lists to a non-empty result, False if it is missing.

    dbutils.fs.ls() raises (wrapping java.io.FileNotFoundException) for a path
    that does not exist rather than returning an empty list, so a plain
    truthiness check on its result can never reach a "not found" branch.
    Any other failure (permissions, mount problems, ...) is re-raised.
    """
    try:
        return bool(dbutils.fs.ls(path))
    except Exception as exc:
        if "java.io.FileNotFoundException" in str(exc):
            return False
        raise


# For every season key in ALL_SEASONS, read that season's raw users.json,
# keep the user id / team name / display name, tag the rows with the season
# and an ingestion timestamp, and append them to the processed users dataset.
for season in ALL_SEASONS.keys():
    file_path = f"/mnt/sleeperprojectdl/raw/{season}/users.json"

    # Skip (and report) seasons with no raw file instead of aborting the loop.
    if not _raw_file_exists(file_path):
        print(f"File not found: {file_path}")
        continue

    users_df = spark.read.json(file_path, schema=users_schema, multiLine=True)
    users_final_df = users_df.select(
        col('user_id'),
        col('metadata.team_name'),
        col('display_name')
        ) \
        .withColumn('season', lit(season)) \
        .withColumn('ingestion_date', current_timestamp())

    # NOTE(review): "append" means re-running this cell adds the same rows
    # again; confirm whether the target is expected to be cleared between runs.
    users_final_df.write.mode("append").parquet("/mnt/sleeperprojectdl/processed/users")

In [0]:
# Read the processed users parquet dataset back and render it for a visual check.
display(spark.read.parquet("/mnt/sleeperprojectdl/processed/users"))

user_id,team_name,display_name,season,ingestion_date
450908656402165760,Kermit and Friends,ShawnDeWin,2022,2024-07-16T23:55:25.612Z
515068627679768576,Chicken Coop,merrickwong,2022,2024-07-16T23:55:25.612Z
808516037942288384,mrkoolguy,the_kool_guy14,2022,2024-07-16T23:55:25.612Z
833592897646501888,Trevors Toilets,mawo,2022,2024-07-16T23:55:25.612Z
833593227880824832,Anger Management Clinic,rth15,2022,2024-07-16T23:55:25.612Z
833594957678886912,RADISNEY,tjlopez,2022,2024-07-16T23:55:25.612Z
833595239125094400,Brown Fever,hwinsane,2022,2024-07-16T23:55:25.612Z
833595795143012352,2010 Chargers,VishnuArul,2022,2024-07-16T23:55:25.612Z
833614474954006528,RADISNEY,kongfutranda,2022,2024-07-16T23:55:25.612Z
839024284771942400,The Deon Experience,dli88129,2022,2024-07-16T23:55:25.612Z
