### **Import Libraries**

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType,FloatType,DoubleType, TimestampType, DateType
from pyspark.sql.functions import *

### Execute notebook with common/reusable functions 

In [0]:
%run "../01-General/02-CommonFunctions"

### Connect to the storage

In [0]:
# Obtain the base storage path; it is used below as the prefix for every path in this notebook.
# NOTE(review): connect() is not defined in this notebook — presumably it comes from the
# shared "02-CommonFunctions" notebook run above and also configures storage access; confirm there.
wasbs_path = connect()

Remote blob path: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/


In [0]:
# Sanity check: list the raw NFL 2022 folder to confirm the storage is reachable
display(dbutils.fs.ls(f"{wasbs_path}/raw/nfl-2022/"))

path,name,size,modificationTime
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/PFFScoutingData.csv,PFFScoutingData.csv,2056451,1728705562000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/games.csv,games.csv,40596,1728705558000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/players.csv,players.csv,175869,1728705559000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/plays.csv,plays.csv,4040236,1728705564000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2018.csv,tracking2018.csv,1736922582,1728706731000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2019.csv,tracking2019.csv,1653130011,1728706711000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2020.csv,tracking2020.csv,1607558364,1728706700000


### Create Paths

In [0]:
# Resolve the source (raw) and destination (bronze) directories for the players dataset
srcDataDirRoot, destDataDirRoot = route(wasbs_path, "players", "raw", "bronze")

# Echo both locations so the reader can verify them before any data is moved
print(
    f"Source data dir: {srcDataDirRoot}",
    f"Destination data dir: {destDataDirRoot}",
    sep="\n",
)

Source data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/
Destination data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/bronze/nfl-2022/players/


### Create Schema for Players files

## Players Dataset

This dataset contains player-specific information, including identification number, physical attributes, college details, and position.

| Column Name     | Description                                                   |
|-----------------|---------------------------------------------------------------|
| `nflId`         | Unique identification number for each player (numeric).       |
| `height`        | Height of the player (text).                                  |
| `weight`        | Weight of the player in pounds (numeric).                     |
| `birthDate`     | Date of birth in format YYYY-MM-DD (date).                    |
| `collegeName`   | College that the player attended (text).                      |
| `Position`      | Player's position on the field (text).                        |
| `displayName`   | Full name of the player (text).                               |

In [0]:
# Canonical, ordered column list for the players dataset
# (used further down to select the columns in a fixed order before writing)
canonicalTripSchemaColList = [
    "nflId",
    "height",
    "weight",
    "birthDate",
    "collegeName",
    "Position",
    "displayName",
]

In [0]:
# Explicit schema for the players files: every field is declared nullable.
# NOTE(review): the data dictionary above describes birthDate as a date (YYYY-MM-DD),
# while it is declared as a timestamp here — confirm which type is intended.
nflPlayersSchema = StructType([
    StructField(fieldName, fieldType, True)
    for fieldName, fieldType in (
        ("nflId", IntegerType()),
        ("height", StringType()),
        ("weight", IntegerType()),
        ("birthDate", TimestampType()),
        ("collegeName", StringType()),
        ("Position", StringType()),
        ("displayName", StringType()),
    )
])


In [0]:
# Show the schema object so its field names, types and nullability can be checked before reading
display(nflPlayersSchema)

StructType([StructField('nflId', IntegerType(), True), StructField('height', StringType(), True), StructField('weight', IntegerType(), True), StructField('birthDate', TimestampType(), True), StructField('collegeName', StringType(), True), StructField('Position', StringType(), True), StructField('displayName', StringType(), True)])

### Storage in Bronze layer

In [0]:
# Clear whatever a previous execution left in the bronze destination,
# so that re-running the notebook stays idempotent
dbutils.fs.rm(destDataDirRoot, recurse=True)

False

In [0]:
# Read the players CSV from the raw zone, applying the explicit schema defined above.
# - srcDataDirRoot already ends with "/", so it is stripped before joining to avoid "//" in the path.
# - The frame is cached because it is evaluated twice below (display and write).
# - The literal "NA" placeholder is replaced with null.
nflPlayersDf = (
    spark.read.format("csv")
    .option("header", True)
    .option("delimiter", ",")
    .schema(nflPlayersSchema)
    .load(f"{srcDataDirRoot.rstrip('/')}/players.csv")
    .cache()
    .na.replace("NA", None)
)

# Order all columns to align with the canonical schema
nflPlayersDfCanocical = nflPlayersDf.select(*canonicalTripSchemaColList)

display(nflPlayersDfCanocical)

# Make the Parquet files written by Spark compatible with the Hive Parquet format.
# spark.conf.set replaces the legacy sqlContext.setConf entry point.
spark.conf.set("spark.sql.parquet.writeLegacyFormat", "true")

# Save the data in the bronze directory.
# "overwrite" (instead of "append") keeps this cell idempotent on its own: re-running it
# without first executing the delete cell no longer duplicates every row.
nflPlayersDfCanocical.write.format("delta").mode("overwrite").save(destDataDirRoot)