### **Import Libraries**

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType,FloatType,DoubleType, TimestampType, DateType
from pyspark.sql.functions import *

### Execute notebook with common/reusable functions 

In [0]:
%run "../01-General/02-CommonFunctions"

### Connect to the storage

In [0]:
wasbs_path = connect()

Remote blob path: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/


In [0]:
# Check the connection
display(
    dbutils.fs.ls(
        f"{wasbs_path}/raw/nfl-2022/"
    )
)

path,name,size,modificationTime
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/PFFScoutingData.csv,PFFScoutingData.csv,2056451,1728705562000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/games.csv,games.csv,40596,1728705558000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/players.csv,players.csv,175869,1728705559000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/plays.csv,plays.csv,4040236,1728705564000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2018.csv,tracking2018.csv,1736922582,1728706731000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2019.csv,tracking2019.csv,1653130011,1728706711000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2020.csv,tracking2020.csv,1607558364,1728706700000


### Create Paths

In [0]:
# Define source and destination directories
srcDataDirRoot,destDataDirRoot = route(wasbs_path,"plays","raw","bronze")
print(f"Source data dir: {srcDataDirRoot}")
print(f"Destination data dir: {destDataDirRoot}")

Source data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/
Destination data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/bronze/nfl-2022/plays/


### Create Schema for Games files

## Plays Dataset

This dataset provides detailed information about NFL plays, including identifiers, descriptions, and various statistics related to the plays of the games.

| Column Name              | Description                                                                                               |
|--------------------------|-----------------------------------------------------------------------------------------------------------|
| `gameId`                 | Unique identifier for each game (numeric).                                                                |
| `playId`                 | Unique identifier for each play, not unique across games (numeric).                                        |
| `playDescription`        | A textual description of the play (text).                                                                 |
| `quarter`                | The quarter in which the play took place (numeric).                                                       |
| `down`                   | The down (1st, 2nd, 3rd, or 4th) when the play was executed (numeric).                                    |
| `yardsToGo`              | The number of yards needed for a first down (numeric).                                                    |
| `possessionTeam`         | The team in possession of the ball during the play (text).                                                |
| `specialTeamsPlayType`    | The type of special teams formation: Extra Point, Field Goal, Kickoff, Punt (text).                      |
| `specialTeamsResult`     | The result of the special teams play (Blocked Punt, Return, Touchback, etc.) (text).                      |
| `kickerId`               | NFL ID of the placekicker, punter, or kickoff specialist on the play (numeric).                           |
| `returnerId`             | NFL ID(s) of returner(s) if there was a special teams return, separated by `;` (text).                    |
| `kickBlockerId`          | NFL ID of the player who blocked a field goal or punt (numeric).                                          |
| `yardlineSide`           | 3-letter team code representing the line-of-scrimmage (text).                                             |
| `yardlineNumber`         | Yard line at the line-of-scrimmage (numeric).                                                             |
| `gameClock`              | Time on the clock during the play in `MM:SS` format (time).                                               |
| `penaltyCodes`           | Codes indicating the penalties that occurred during the play, separated by `;` (text).                    |
| `penaltyJerseyNumber`    | Jersey number of the player committing the penalty, with team code (text).                                |
| `penaltyYards`           | The number of yards gained by the possession team due to the penalty (numeric).                           |
| `preSnapHomeScore`       | Home team score before the play occurred (numeric).                                                       |
| `preSnapVisitorScore`    | Visiting team score before the play occurred (numeric).                                                   |
| `passResult`             | Outcome of the scrimmage play, if applicable: Complete, Incomplete, Sack, Interception, etc. (text).       |
| `kickLength`             | Length of the kick in air during a kickoff, field goal, or punt (numeric).                                |
| `kickReturnYardage`      | Yards gained by the return team if there was a return on a kickoff or punt (numeric).                     |
| `playResult`             | Net yards gained by the kicking team, including penalty yardage (numeric).                                |
| `absoluteYardlineNumber` | Location of the ball downfield in tracking data coordinates (numeric).                                     |

In [0]:
#Canonical ordered column list 
canonicalTripSchemaColList = ["gameId","playId","playDescription","quarter","down","yardsToGo","possessionTeam","specialTeamsPlayType","specialTeamsResult","kickerId","returnerId","kickBlockerId","yardlineSide","yardlineNumber","gameClock","penaltyCodes","penaltyJerseyNumbers","penaltyYards","preSnapHomeScore","preSnapVisitorScore","passResult","kickLength","kickReturnYardage","playResult","absoluteYardlineNumber"]

In [0]:
#Schema for plays files
nflPlaysschema = StructType([
    StructField('gameId', IntegerType(), True),
    StructField('playId', IntegerType(), True), 
    StructField('playDescription', StringType(), True), 
    StructField('quarter', IntegerType(), True), 
    StructField('down', IntegerType(), True), 
    StructField('yardsToGo', IntegerType(), True), 
    StructField('possessionTeam', StringType(), True), 
    StructField('specialTeamsPlayType', StringType(), True), 
    StructField('specialTeamsResult', StringType(), True), 
    StructField('kickerId', IntegerType(), True), 
    StructField('returnerId', IntegerType(), True), 
    StructField('kickBlockerId', IntegerType(), True), 
    StructField('yardlineSide', StringType(), True), 
    StructField('yardlineNumber', IntegerType(), True), 
    StructField('gameClock', StringType(), True), 
    StructField('penaltyCodes', StringType(), True), 
    StructField('penaltyJerseyNumbers', StringType(), True), 
    StructField('penaltyYards', IntegerType(), True), 
    StructField('preSnapHomeScore', IntegerType(), True), 
    StructField('preSnapVisitorScore', IntegerType(), True), 
    StructField('passResult', StringType(), True), 
    StructField('kickLength', IntegerType(), True), 
    StructField('kickReturnYardage', IntegerType(), True), 
    StructField('playResult', IntegerType(), True), 
    StructField('absoluteYardlineNumber', IntegerType(), True)])


In [0]:
#Check Schema
display(nflPlaysschema)

StructType([StructField('gameId', IntegerType(), True), StructField('playId', IntegerType(), True), StructField('playDescription', StringType(), True), StructField('quarter', IntegerType(), True), StructField('down', IntegerType(), True), StructField('yardsToGo', IntegerType(), True), StructField('possessionTeam', StringType(), True), StructField('specialTeamsPlayType', StringType(), True), StructField('specialTeamsResult', StringType(), True), StructField('kickerId', IntegerType(), True), StructField('returnerId', IntegerType(), True), StructField('kickBlockerId', IntegerType(), True), StructField('yardlineSide', StringType(), True), StructField('yardlineNumber', IntegerType(), True), StructField('gameClock', StringType(), True), StructField('penaltyCodes', StringType(), True), StructField('penaltyJerseyNumbers', StringType(), True), StructField('penaltyYards', IntegerType(), True), StructField('preSnapHomeScore', IntegerType(), True), StructField('preSnapVisitorScore', IntegerType(),

### Storage in Bronze layer

In [0]:
#Delete any residual data from prior executions for an idempotent run
dbutils.fs.rm(destDataDirRoot,recurse=True)

False

In [0]:

#Read de csv from the raw data source using the schema and the path defined above 
nflPlaysDf = (spark.read.format("csv")
                .option("header", True)
                .schema(nflPlaysschema)
                .option("delimiter",",")
                .load(f"{srcDataDirRoot}/plays.csv").cache())

# Change the "NA" Values for Null
nflPlaysDf = nflPlaysDf.na.replace("NA", None)

#Order all columns to align with the canonical schema 
nflPlaysDfCanocical = nflPlaysDf.select(*canonicalTripSchemaColList)

display(nflPlaysDfCanocical)

#To make Hive Parquet format compatible with Spark Parquet format
sqlContext.setConf("spark.sql.parquet.writeLegacyFormat", "true")

# Save the files in the bronze directory
nflPlaysDfCanocical.write.format("delta").mode("append").save(destDataDirRoot) 