### **Import Libraries**


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType,FloatType,DoubleType, TimestampType, DateType
from pyspark.sql.functions import *

### Execute notebook with common/reusable functions 

In [0]:
%run "../01-General/02-CommonFunctions"

### Connect to the storage

In [0]:
# connect() is not defined in this notebook; presumably it comes from the
# 02-CommonFunctions notebook executed via %run above (TODO confirm).
# Its return value is used below as the root of every storage path.
wasbs_path = connect()

Remote blob path: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/


In [0]:
# Smoke-test the storage connection by listing the raw NFL 2022 folder
rawFolderListing = dbutils.fs.ls(f"{wasbs_path}/raw/nfl-2022/")
display(rawFolderListing)

path,name,size,modificationTime
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/PFFScoutingData.csv,PFFScoutingData.csv,2056451,1728705562000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/games.csv,games.csv,40596,1728705558000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/players.csv,players.csv,175869,1728705559000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/plays.csv,plays.csv,4040236,1728705564000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2018.csv,tracking2018.csv,1736922582,1728706731000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2019.csv,tracking2019.csv,1653130011,1728706711000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2020.csv,tracking2020.csv,1607558364,1728706700000


### Create Paths

In [0]:
# Resolve the source and destination directories for the games dataset
# through the shared route() helper, then echo both for a visual check.
srcDataDirRoot, destDataDirRoot = route(wasbs_path, "games", "raw", "bronze")
print(
    f"Source data dir: {srcDataDirRoot}",
    f"Destination data dir: {destDataDirRoot}",
    sep="\n",
)

Source data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/
Destination data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/bronze/nfl-2022/games/


### Create Schema for Games files

## Games Dataset

This dataset contains information about NFL games, including game identifiers, season details, and team abbreviations.

| Column Name         | Description                                                                 |
|---------------------|-----------------------------------------------------------------------------|
| `gameId`            | Unique identifier for each game (numeric).                                   |
| `season`            | The season in which the game was played (numeric).                           |
| `week`              | The week of the season in which the game took place (numeric).               |
| `gameDate`          | The date of the game in `MM/DD/YYYY` format (date).                     |
| `gameTimeEastern`    | The start time of the game in Eastern Standard Time (EST), `HH:MM:SS` format (time).|
| `homeTeamAbbr`      | The two- or three-letter abbreviation for the home team (text).              |
| `visitorTeamAbbr`   | The two- or three-letter abbreviation for the visiting team (text).          |

In [0]:
# Column order every games DataFrame is projected to before it is written.
# One name per line so additions/removals show up as single-line diffs.
canonicalTripSchemaColList = [
    "gameId",
    "season",
    "week",
    "gameDate",
    "gameTimeEastern",
    "homeTeamAbbr",
    "visitorTeamAbbr",
]

In [0]:
# Explicit read schema for games.csv.
# gameDate and gameTimeEastern are declared as strings here; gameDate is
# converted to a date after the read, in the bronze-write cell.
# Every field is nullable.
_gamesFieldTypes = [
    ("gameId", IntegerType()),
    ("season", IntegerType()),
    ("week", IntegerType()),
    ("gameDate", StringType()),
    ("gameTimeEastern", StringType()),
    ("homeTeamAbbr", StringType()),
    ("visitorTeamAbbr", StringType()),
]
nlfGamesSchema = StructType(
    [StructField(fieldName, fieldType, True) for fieldName, fieldType in _gamesFieldTypes]
)


In [0]:
# Render the schema object so field names, types and nullability can be
# verified visually before it is used to read the CSV.
display(nlfGamesSchema)

StructType([StructField('gameId', IntegerType(), True), StructField('season', IntegerType(), True), StructField('week', IntegerType(), True), StructField('gameDate', StringType(), True), StructField('gameTimeEastern', StringType(), True), StructField('homeTeamAbbr', StringType(), True), StructField('visitorTeamAbbr', StringType(), True)])

### Storage in Bronze layer

In [0]:
# Delete any residual data from prior executions for an idempotent run.
# recurse=True removes the destination directory together with everything
# under it, so the write below does not stack on top of earlier output.
dbutils.fs.rm(destDataDirRoot,recurse=True)

True

In [0]:

# Read the games CSV from the raw zone using the schema and paths defined above.
# srcDataDirRoot is printed above with a trailing "/", so strip it before
# joining to avoid a doubled slash ("nfl-2022//games.csv") in the blob path.
gamesCsvPath = f"{srcDataDirRoot.rstrip('/')}/games.csv"

nflGamesDf = (
    spark.read.format("csv")
    .option("header", True)
    .option("delimiter", ",")
    # Map the literal "NA" to null while parsing. Doing it in the reader covers
    # every column type; a post-read na.replace("NA", None) only reaches
    # string columns, after typed columns have already been parsed.
    .option("nullValue", "NA")
    .schema(nlfGamesSchema)
    .load(gamesCsvPath)
    .cache()  # the frame is consumed twice below: display() and write
)

# Project to the canonical column order, then parse gameDate
# (MM/DD/YYYY text in the source file) into a proper date column.
nflGamesDfCanocical = (
    nflGamesDf
    .select(*canonicalTripSchemaColList)
    .withColumn("gameDate", to_date(col("gameDate"), "MM/dd/yyyy"))
)

display(nflGamesDfCanocical)

# To make Hive Parquet format compatible with Spark Parquet format.
# spark.conf.set is the current API for what sqlContext.setConf used to do.
spark.conf.set("spark.sql.parquet.writeLegacyFormat", "true")

# Save to the bronze directory as Delta, partitioned by season.
# "overwrite" keeps this cell idempotent on its own: re-running it without the
# cleanup cell replaces the data instead of appending duplicate rows.
(
    nflGamesDfCanocical.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("season")
    .save(destDataDirRoot)
)