### **Import Libraries**

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType,FloatType,DoubleType, TimestampType, DateType
from pyspark.sql.functions import *

### Execute notebook with common/reusable functions 

In [0]:
%run "../01-General/02-CommonFunctions"

### Connect to the storage

In [0]:
# Obtain the root path of the remote blob storage used by the rest of the notebook.
# connect() is not defined in this notebook — presumably it is provided by the
# CommonFunctions notebook executed above (TODO confirm).
wasbs_path = connect()

Remote blob path: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/


In [0]:
# Ask the shared route() helper for the source and destination folders of the
# PFFScoutingData dataset (arguments: storage root, dataset name, source layer,
# destination layer — presumably; route() is defined outside this notebook).
srcDataDirRoot, destDataDirRoot = route(wasbs_path, "PFFScoutingData", "bronze", "silver")

# Echo both locations so the run log records where data is read from and written to.
print(
    f"Source data dir: {srcDataDirRoot}",
    f"Destination data dir: {destDataDirRoot}",
    sep="\n",
)

Source data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/bronze/nfl-2022/
Destination data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/silver/nfl-2022/PFFScoutingData/


### Read the files from the bronze layer

In [0]:
# Read the bronze-layer PFFScoutingData table (Delta format) and cache it, since it
# is reused by several cells below.
# The location is built from srcDataDirRoot (resolved by route() earlier) rather than
# a hard-coded storage-account URL, so the notebook follows whatever storage
# connect()/route() point at. rstrip("/") makes the join safe whether or not the
# root ends with a slash.
bronze_pff_path = srcDataDirRoot.rstrip("/") + "/PFFScoutingData"
nflPFFScoutingBronze = spark.read.format("delta").load(bronze_pff_path).cache()

# Check the data
display(nflPFFScoutingBronze)

### Exploration of the Data

In [0]:
# Register the bronze DataFrame as a temporary view so it can be queried by name
# (nflPFFScoutingBronze_View) from SQL cells; an existing view with the same name
# is replaced.
nflPFFScoutingBronze.createOrReplaceTempView("nflPFFScoutingBronze_View")

In [0]:
# Mean hang time for every game, listed from the highest average to the lowest.
avg_hang_time_per_game = (
    nflPFFScoutingBronze
    .groupBy("gameId")
    .agg(mean("hangTime").alias("Average Hang Time"))
    .orderBy(col("Average Hang Time").desc())
)

display(avg_hang_time_per_game)

In [0]:
%sql
-- How many tackles per game?
-- COUNT(tackler) ignores rows where tackler is NULL, so only plays with a
-- recorded tackler are counted.
SELECT
  gameId,
  COUNT(tackler) AS TacklerCount
FROM nflPFFScoutingBronze_View
GROUP BY gameId
ORDER BY TacklerCount DESC

### Create new Columns for PFFScouting DataFrame

### 1. `is_kickDirection_correct_flag`
- **Description**: A binary column indicating whether the intended kick direction matched the actual kick direction.
- **Logic**: A value of `1` is assigned if `kickDirectionIntended` equals `kickDirectionActual`, meaning the kick went as planned. Otherwise, the value is `0`.
- **Usefulness**: This flag helps evaluate the accuracy and execution of special teams plays, providing insights into whether the kicking team executed their strategy correctly. It can also highlight instances where a mismatch might have led to breakdowns in coverage or field position.

### 2. `is_ReturnDirection_correct_flag`
- **Description**: A binary column indicating whether the intended return direction matched the actual return direction.
- **Logic**: A value of `1` is assigned if `returnDirectionIntended` equals `returnDirectionActual`, meaning the return unit followed the planned direction. Otherwise, the value is `0`.
- **Usefulness**: This flag helps in analyzing whether the return team successfully executed their set-up for returns, potentially impacting field position or leading to key return opportunities.

### 3. `TotalPlayTime`
- **Description**: A column representing the total time for the play, summing the snap time, operation time, and hang time.
- **Logic**: The sum of `snapTime`, `operationTime`, and `hangTime` gives the total play time for a punt or kickoff. If any of the three values is null, the sum (and therefore `TotalPlayTime`) is null as well.
- **Usefulness**: This metric helps quantify the duration of special teams plays, which can be critical in evaluating the effectiveness of punts, kickoffs, and coverage. Longer play times might indicate deeper kicks or more time for the coverage team to get downfield.

### 4. `is_successfulTackle_flag`
- **Description**: A binary column indicating whether a tackle was made during the play.
- **Logic**: A value of `1` is assigned if `tackler` is not null, meaning a tackle occurred. Otherwise, the value is `0`.
- **Usefulness**: This flag helps identify whether a play ended in a successful tackle, which is critical for analyzing defensive effectiveness, especially on punt and kickoff returns. It can also aid in studying missed tackles and defensive breakdowns.

### 5. `MissedTacklerCount`
- **Description**: Counts the number of players who missed a tackle attempt.
- **Logic**: If `missedTackler` is null, assigns `0`; otherwise, counts the `;`-separated values in `missedTackler`.
- **Usefulness**: Helps analyze plays with missed tackles, providing insights into coverage breakdowns and defensive gaps.

---


In [0]:
# Derived columns for the silver layer, declared as (column name, expression) pairs
# and applied to the bronze DataFrame in the order listed.
# The flag columns hold 1 or 0. A comparison that involves a null evaluates to null,
# which does not satisfy when() and therefore falls through to otherwise().
silver_derived_columns = [
    # 1 when the intended and actual kick directions are equal, else 0.
    (
        "is_kickDirection_correct_flag",
        when(col("kickDirectionIntended") == col("kickDirectionActual"), 1).otherwise(0),
    ),
    # 1 when the intended and actual return directions are equal, else 0.
    (
        "is_ReturnDirection_correct_flag",
        when(col("returnDirectionIntended") == col("returnDirectionActual"), 1).otherwise(0),
    ),
    # Snap + operation + hang time. NOTE: Spark propagates nulls through "+", so
    # the total is null whenever any of the three inputs is null.
    (
        "TotalPlayTime",
        col("snapTime") + col("operationTime") + col("hangTime"),
    ),
    # 1 when a tackler is recorded for the play, else 0.
    (
        "is_successfulTackle_flag",
        when(col("Tackler").isNotNull(), 1).otherwise(0),
    ),
    # Number of ";"-separated entries in MissedTackler; 0 when the column is null.
    (
        "MissedTacklerCount",
        when(
            col("MissedTackler").isNotNull(),
            size(split(col("MissedTackler"), ";")),
        ).otherwise(0),
    ),
]

nflPFFScoutingSilver = nflPFFScoutingBronze
for derived_name, derived_expr in silver_derived_columns:
    nflPFFScoutingSilver = nflPFFScoutingSilver.withColumn(derived_name, derived_expr)

#check the data
display(nflPFFScoutingSilver)


In [0]:
# Show the schema of the silver DataFrame (field names, types, nullability) to
# check the result of adding the derived columns.
display(nflPFFScoutingSilver.schema)

StructType([StructField('gameId', IntegerType(), True), StructField('playId', IntegerType(), True), StructField('snapDetail', StringType(), True), StructField('snapTime', FloatType(), True), StructField('operationTime', FloatType(), True), StructField('hangTime', FloatType(), True), StructField('kickType', StringType(), True), StructField('kickDirectionIntended', StringType(), True), StructField('kickDirectionActual', StringType(), True), StructField('returnDirectionIntended', StringType(), True), StructField('returnDirectionActual', StringType(), True), StructField('missedTackler', StringType(), True), StructField('assistTackler', StringType(), True), StructField('tackler', StringType(), True), StructField('kickoffReturnFormation', StringType(), True), StructField('gunners', StringType(), True), StructField('puntRushers', StringType(), True), StructField('specialTeamsSafeties', StringType(), True), StructField('vises', StringType(), True), StructField('kickContactType', StringType(), 

### Save the data in the **Silver** Layer

In [0]:
# Delete any residual data from prior executions for an idempotent run.
# recurse=True removes the destination directory together with everything under it.
dbutils.fs.rm(destDataDirRoot,recurse=True)

True

In [0]:
# To make Hive Parquet format compatible with Spark Parquet format.
# spark.conf.set is used instead of the deprecated sqlContext.setConf entry point;
# it sets the same session-level option.
spark.conf.set("spark.sql.parquet.writeLegacyFormat", "true")

# Save the dataset with the new columns to the silver layer in Delta format.
# mode("overwrite") keeps this cell idempotent on its own: re-running it without
# first re-running the delete cell replaces the data instead of appending
# duplicate rows. After a fresh delete, the result is the same as an append.
nflPFFScoutingSilver.write.format("delta").mode("overwrite").save(destDataDirRoot)