In [0]:
%run "../set-up/global_variables"

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, DecimalType, FloatType, IntegerType
from pyspark.sql.functions import col, current_timestamp, lit, concat, when, explode
import json

In [0]:
# Derive the schema of the nested `metadata` object from a sample file instead of hard-coding it.
# dbutils.fs.head only returns the leading bytes of a file (up to its maxBytes limit), so the
# sample must be small enough to come back complete; a truncated file would fail json.loads.
# We use 2023's draft_picks.json since the start-up draft file from 2022 would've been too large.
draft_picks_as_string = dbutils.fs.head("/mnt/sleeperprojectdl/raw/2023/draft_picks.json")
draft_picks_as_json = json.loads(draft_picks_as_string)

# Every key of the first pick's `metadata` object becomes a nullable string field.
# 'amount' is selected from `metadata` later in this notebook, so it must be part of the schema
# even when the sample pick does not carry it. It is added only when missing: appending it
# unconditionally would create two fields named 'amount' if the sample already has that key.
metadata_field_names = list(draft_picks_as_json[0]['metadata'].keys())
if 'amount' not in metadata_field_names:
    metadata_field_names.append('amount')

metadata_schema = StructType([
    StructField(field_name, StringType(), True) for field_name in metadata_field_names
])

# Explicit schema for one draft pick record; all fields are nullable.
draft_picks_schema = StructType([
    StructField('round', IntegerType(), True),
    StructField('roster_id', IntegerType(), True),
    StructField('player_id', StringType(), True),
    StructField('picked_by', DecimalType(38,0), True),
    StructField('pick_no', IntegerType(), True),
    StructField('metadata', metadata_schema, True),
    StructField('is_keeper', BooleanType(), True),
    StructField('draft_slot', IntegerType(), True),
    StructField('draft_id', DecimalType(38,0), True)
])

In [0]:
# All seasons are written into a single parquet dataset at this path.
processed_path = "/mnt/sleeperprojectdl/processed/draft_picks"

# ALL_SEASONS is not defined in this notebook — presumably it comes from the
# global_variables notebook run at the top; only its keys are used here.
for season_index, season in enumerate(ALL_SEASONS.keys()):
    # Set the mount point as our file path; use formatted string to dynamically set the season
    file_path = f"/mnt/sleeperprojectdl/raw/{season}/draft_picks.json"

    # Read the json file and apply the schema
    draft_picks_df = spark.read.json(file_path, schema=draft_picks_schema, multiLine=True)

    # Keep only the columns needed downstream, cast the string ids/amounts to integers,
    # and tag every row with its season and the ingestion time.
    # NOTE(review): a non-numeric player_id would not survive the integer cast — confirm
    # that every id in the raw files is numeric.
    draft_picks_final_df = draft_picks_df.select(
            col('pick_no'),
            col('player_id').cast('integer').alias('player_id'),
            col('picked_by'),
            col('metadata.amount').cast('integer').alias('amount')
        ) \
        .withColumn('season', lit(season)) \
        .withColumn('ingestion_date', current_timestamp())

    # The first season replaces whatever a previous run left behind and the remaining
    # seasons are appended to it. Appending every season made the notebook non-idempotent:
    # each re-run duplicated all rows and kept stale files from earlier runs.
    write_mode = 'overwrite' if season_index == 0 else 'append'
    draft_picks_final_df.write.mode(write_mode).parquet(processed_path)

In [0]:
# Sanity check: read the processed draft picks back from storage and render them.
processed_draft_picks_df = spark.read.parquet("/mnt/sleeperprojectdl/processed/draft_picks")
display(processed_draft_picks_df)

pick_no,player_id,picked_by,amount,season,ingestion_date
1,6803,808516037942288384,9.0,2022,2024-07-16T23:27:54.099Z
2,7610,833595239125094400,54.0,2022,2024-07-16T23:27:54.099Z
3,4984,808516037942288384,88.0,2022,2024-07-16T23:27:54.099Z
4,6770,833592897646501888,69.0,2022,2024-07-16T23:27:54.099Z
5,4046,450908656402165760,86.0,2022,2024-07-16T23:27:54.099Z
6,6797,515068627679768576,101.0,2022,2024-07-16T23:27:54.099Z
7,4663,808516037942288384,74.0,2022,2024-07-16T23:27:54.099Z
8,7528,833595795143012352,79.0,2022,2024-07-16T23:27:54.099Z
9,6813,833593227880824832,82.0,2022,2024-07-16T23:27:54.099Z
10,4017,833594957678886912,31.0,2022,2024-07-16T23:27:54.099Z
