### **Import Libraries**

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType,FloatType,DoubleType, TimestampType, DateType
from pyspark.sql.functions import *

### Execute notebook with common/reusable functions 

In [0]:
%run "../01-General/02-CommonFunctions"

### Connect to the storage

In [0]:
# Obtain the root storage path that the rest of the notebook builds on.
# NOTE(review): connect() is not defined in this notebook — presumably it comes
# from the CommonFunctions notebook executed above; confirm what it mounts/configures.
wasbs_path = connect()

Remote blob path: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/


### Create Paths

In [0]:
# Ask the shared route() helper for the two directory roots used by this
# notebook: where the data is read from and where the result is written to.
srcDataDirRoot, destDataDirRoot = route(wasbs_path, "plays", "bronze", "silver")

# Echo both locations so the run output records the paths in use.
print(
    f"Source data dir: {srcDataDirRoot}",
    f"Destination data dir: {destDataDirRoot}",
    sep="\n",
)

Source data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/bronze/nfl-2022/
Destination data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/silver/nfl-2022/plays/


### Read the files from the bronze layer

In [0]:
# Read the bronze-layer "plays" data in Delta format.
# The location is built from srcDataDirRoot (resolved by route() earlier in this
# notebook) instead of a hard-coded storage URL, so a change of storage account
# or container only has to be made in one place.
# rstrip("/") makes the join correct whether or not the root ends with a slash.
playsBronzePath = srcDataDirRoot.rstrip("/") + "/plays"

# cache() keeps the DataFrame around because several cells below query it.
nflPlaysBronze = spark.read.format("delta").load(playsBronzePath).cache()

# Check the data
display(nflPlaysBronze)

### Exploration of the Data

In [0]:
# Register the bronze DataFrame as a temporary view named "nflPlaysBronze_View"
# so it can be queried from %sql cells; an existing temp view with the same
# name is replaced.
nflPlaysBronze.createOrReplaceTempView("nflPlaysBronze_View")

In [0]:
%sql
-- Average kick length for kickoff plays
SELECT
  specialTeamsPlayType,
  AVG(kickLength) AS KickLength_Average
FROM
  nflPlaysBronze_View
WHERE
  specialTeamsPlayType = 'Kickoff'
GROUP BY
  specialTeamsPlayType



specialTeamsPlayType,KickLength_Average
Kickoff,67.06145607548132


In [0]:

# Average kick length for field-goal plays, computed with the DataFrame API.
display(
    nflPlaysBronze
    .where(col("specialTeamsPlayType") == "Field Goal")
    .groupBy(col("specialTeamsPlayType"))
    .agg(avg(col("kickLength")).alias("KickLength_Average"))
)

specialTeamsPlayType,KickLength_Average
Field Goal,38.14773599386032


In [0]:
# Average kick length per possession team. No play-type filter is applied,
# so every row with a kickLength contributes to its team's average.
display(
    nflPlaysBronze
    .groupBy(col("possessionTeam"))
    .agg(avg(col("kickLength")).alias("KickLength_Average"))
)

possessionTeam,KickLength_Average
NYJ,52.970093457943925
CAR,56.612631578947365
LA,56.662921348314605
TB,57.44022770398482
OAK,52.524096385542165
DET,53.0
TEN,55.31809145129225
BUF,54.20970873786408
BAL,56.64990689013035
LAC,54.696280991735534


### Create new Columns for Plays DataFrame

### 1. `is_penalty_flag`
- **Description**: A binary column indicating whether a penalty occurred during the play.
- **Logic**: A value of `1` is assigned if the `penaltyCodes` field is not null, meaning a penalty was called. Otherwise, the value is `0`.
- **Usefulness**: Penalties can drastically change the momentum of a game, so this flag allows for focused analysis on how penalties influence game results, player behavior, and overall team performance.

### 2. `is_SuccessfulPlay_flag`
- **Description**: A binary column indicating whether the play was successful based on gaining the required yards for a first down or more.
- **Logic**: A value of `1` is assigned if the `playResult` (net yards gained) is greater than or equal to `yardsToGo` (the distance required for a first down). Otherwise, the value is `0`.
- **Usefulness**: This flag helps to quickly identify successful plays, making it easier to analyze the effectiveness of offensive strategies and the ability of teams to convert downs. It's particularly useful for analyzing efficiency in key moments, such as third downs.

### 3. `is_ReturnPlay_flag`
- **Description**: A binary column indicating whether a return play occurred.
- **Logic**: A value of `1` is assigned if `kickReturnYardage` is greater than 0, meaning the ball was returned during a punt or kickoff. Otherwise, the value is `0`.
- **Usefulness**: This flag is helpful for isolating plays where there was a return, allowing for specific analysis of return team performance, coverage team effectiveness, and overall special teams play. Analyzing return plays can reveal insights into field position advantage and game-changing moments such as long returns.

### 4. `is_Touchdown_flag`
- **Description**: A binary column indicating whether a touchdown play occurred.
- **Logic**: A value of `1` is assigned if `playDescription` contains the word TOUCHDOWN. Otherwise, the value is `0`.
- **Usefulness**: This flag helps to quickly identify touchdown plays, allowing for specific analysis of scoring trends or team efficiency.

---

In [0]:
def _binary_flag(condition):
    """Return an integer column that is 1 where `condition` holds and 0 otherwise."""
    return when(condition, 1).otherwise(0)


# Derive the silver-layer flag columns from the bronze data. Columns are added
# in this order: penalty, successful play, return play, touchdown.
nflPlaysSilver = (
    nflPlaysBronze
    # 1 when any penalty code was recorded for the play
    .withColumn("is_penalty_flag", _binary_flag(col("penaltyCodes").isNotNull()))
    # 1 when the net yards gained reach the distance needed for a first down
    .withColumn("is_SuccessfulPlay_flag", _binary_flag(col("playResult") >= col("yardsToGo")))
    # 1 when positive return yardage was recorded
    .withColumn("is_ReturnPlay_flag", _binary_flag(col("kickReturnYardage") > 0))
    # 1 when the play description mentions TOUCHDOWN
    .withColumn("is_Touchdown_flag", _binary_flag(col("playDescription").contains("TOUCHDOWN")))
)

# Check the data
display(nflPlaysSilver)


In [0]:
# Show the schema (column names, types, nullability) of the silver DataFrame.
display(nflPlaysSilver.schema)

StructType([StructField('gameId', IntegerType(), True), StructField('playId', IntegerType(), True), StructField('playDescription', StringType(), True), StructField('quarter', IntegerType(), True), StructField('down', IntegerType(), True), StructField('yardsToGo', IntegerType(), True), StructField('possessionTeam', StringType(), True), StructField('specialTeamsPlayType', StringType(), True), StructField('specialTeamsResult', StringType(), True), StructField('kickerId', IntegerType(), True), StructField('returnerId', IntegerType(), True), StructField('kickBlockerId', IntegerType(), True), StructField('yardlineSide', StringType(), True), StructField('yardlineNumber', IntegerType(), True), StructField('gameClock', StringType(), True), StructField('penaltyCodes', StringType(), True), StructField('penaltyJerseyNumbers', StringType(), True), StructField('penaltyYards', IntegerType(), True), StructField('preSnapHomeScore', IntegerType(), True), StructField('preSnapVisitorScore', IntegerType(),

### Save the data in the **Silver** Layer

In [0]:
# Delete any residual data from prior executions for an idempotent run.
# recurse=True removes the destination directory together with its contents.
dbutils.fs.rm(destDataDirRoot,recurse=True)

True

In [0]:
# Make the Parquet files Spark writes compatible with the Hive Parquet format.
# The option is set through the session's runtime config (spark.conf) rather
# than the deprecated sqlContext entry point; the key and value are unchanged.
spark.conf.set("spark.sql.parquet.writeLegacyFormat", "true")

# Save the dataset with the new flag columns as a Delta table at the silver
# destination. Mode "append" adds to whatever already exists there, so the
# cleanup of destDataDirRoot earlier in the notebook is what keeps re-runs
# from duplicating rows.
nflPlaysSilver.write.format("delta").mode("append").save(destDataDirRoot)