### **Import Libraries**


In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType,FloatType,DoubleType, TimestampType, DateType
from pyspark.sql.functions import *

### Execute notebook with common/reusable functions 

In [0]:
%run "../01-General/02-CommonFunctions"

### Connect to the storage

In [0]:
# Obtain the base storage path used to build every wasbs:// location below.
# NOTE(review): connect() is not defined in this notebook; presumably it comes
# from the CommonFunctions notebook executed above — confirm.
wasbs_path = connect()

Remote blob path: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/


In [0]:
# Sanity-check the connection: list the raw NFL 2022 landing folder and show it
rawLandingListing = dbutils.fs.ls(f"{wasbs_path}/raw/nfl-2022/")
display(rawLandingListing)

path,name,size,modificationTime
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/PFFScoutingData.csv,PFFScoutingData.csv,2056451,1728705562000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/games.csv,games.csv,40596,1728705558000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/players.csv,players.csv,175869,1728705559000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/plays.csv,plays.csv,4040236,1728705564000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2018.csv,tracking2018.csv,1736922582,1728706731000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2019.csv,tracking2019.csv,1653130011,1728706711000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2020.csv,tracking2020.csv,1607558364,1728706700000


### Create Paths

In [0]:
# Resolve the source and destination folders for the PFFScoutingData dataset.
# NOTE(review): route() is not defined in this notebook; presumably it maps the
# "raw" and "bronze" layer names onto folders under wasbs_path — confirm.
srcDataDirRoot, destDataDirRoot = route(
    wasbs_path,
    "PFFScoutingData",
    "raw",
    "bronze",
)
print(
    f"Source data dir: {srcDataDirRoot}",
    f"Destination data dir: {destDataDirRoot}",
    sep="\n",
)

Source data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/
Destination data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/bronze/nfl-2022/PFFScoutingData/


### Create Schema for PFF Scouting Data files

## PFF Scouting Data Dataset

This dataset contains detailed scouting data regarding special teams plays, including snap details, kick types, directions of kicks, and player roles.

| Column Name               | Description                                                                                                                                             |
|---------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|
| `gameId`                  | Unique identifier for each game (numeric).                                                                                                              |
| `playId`                  | Unique identifier for each play, not unique across games (numeric).                                                                                      |
| `snapDetail`              | Information about whether the snap was on target during punts, with details like High, Low, Left, Right, or OK for an accurate snap (text).               |
| `snapTime`                | Time from the start of the event to the snap, in seconds (numeric).                                                                                      |
| `operationTime`           | Time from snap to kick on punt plays, in seconds (numeric).                                                                                              |
| `hangTime`                | Time the ball is in the air from the punt or kickoff, measured in seconds (numeric).                                                                     |
| `kickType`                | Type of kick or punt, with values like Deep (D), Flat (F), Free Kick (K), Pooch Kick (P), Rugby style punt (R), etc. (text).                             |
| `kickDirectionIntended`   | Intended direction of the kick from the kicking team's perspective (Left, Right, Center) (text).                                                         |
| `kickDirectionActual`     | Actual direction of the kick from the kicking team's perspective (Left, Right, Center) (text).                                                           |
| `returnDirectionIntended` | Intended return direction from the return team's perspective (Left, Right, Center) (text).                                                               |
| `returnDirectionActual`   | Actual return direction from the return team's perspective (Left, Right, Center) (text).                                                                 |
| `missedTackler`           | List of players (jersey number and team code) who missed tackles, separated by `;` if multiple (text).                                                   |
| `assistTackler`           | List of players (jersey number and team code) who assisted in tackles, separated by `;` if multiple (text).                                              |
| `tackler`                 | List of players (jersey number and team code) who made the tackle (text).                                                                                |
| `kickoffReturnFormation`  | 3-digit code indicating the number of players in the Front Wall, Mid Wall, and Back Wall (text).                                                         |
| `gunners`                 | List of players (jersey number and team code) acting as gunners on the punt unit, separated by `;` if multiple (text).                                   |
| `puntRushers`             | List of players (jersey number and team code) on the punt return unit with the "Punt Rush" role (text).                                                  |
| `specialTeamsSafeties`    | List of players (jersey number and team code) acting as safeties on special teams, separated by `;` if multiple (text).                                  |
| `vises`                   | List of players (jersey number and team code) acting as vises on the punt return unit, separated by `;` if multiple (text).                              |
| `kickContactType`         | Details on how a punt was fielded, with values such as Clean Catch (CC), Muffed by Returner (MBDR), Bounced Forwards (BF), and Directly Out Of Bounds (OOB) (text). |

In [0]:
# Canonical column order for the PFF Scouting data; the names match the fields
# declared in the schema, one per line so additions/removals diff cleanly.
canonicalTripSchemaColList = [
    "gameId",
    "playId",
    "snapDetail",
    "snapTime",
    "operationTime",
    "hangTime",
    "kickType",
    "kickDirectionIntended",
    "kickDirectionActual",
    "returnDirectionIntended",
    "returnDirectionActual",
    "missedTackler",
    "assistTackler",
    "tackler",
    "kickoffReturnFormation",
    "gunners",
    "puntRushers",
    "specialTeamsSafeties",
    "vises",
    "kickContactType",
]

In [0]:
# Explicit schema for the PFF Scouting data files.
# Identifiers are integers, the three timing measures are floats, and every
# other column is read as text. All fields are nullable.
nflPFFScoutingDataschema = (
    StructType()
    .add("gameId", IntegerType(), True)
    .add("playId", IntegerType(), True)
    .add("snapDetail", StringType(), True)
    .add("snapTime", FloatType(), True)
    .add("operationTime", FloatType(), True)
    .add("hangTime", FloatType(), True)
    .add("kickType", StringType(), True)
    .add("kickDirectionIntended", StringType(), True)
    .add("kickDirectionActual", StringType(), True)
    .add("returnDirectionIntended", StringType(), True)
    .add("returnDirectionActual", StringType(), True)
    .add("missedTackler", StringType(), True)
    .add("assistTackler", StringType(), True)
    .add("tackler", StringType(), True)
    .add("kickoffReturnFormation", StringType(), True)
    .add("gunners", StringType(), True)
    .add("puntRushers", StringType(), True)
    .add("specialTeamsSafeties", StringType(), True)
    .add("vises", StringType(), True)
    .add("kickContactType", StringType(), True)
)


In [0]:
# Show the schema object so the declared field names and types can be reviewed
display(nflPFFScoutingDataschema)

StructType([StructField('gameId', IntegerType(), True), StructField('playId', IntegerType(), True), StructField('snapDetail', StringType(), True), StructField('snapTime', FloatType(), True), StructField('operationTime', FloatType(), True), StructField('hangTime', FloatType(), True), StructField('kickType', StringType(), True), StructField('kickDirectionIntended', StringType(), True), StructField('kickDirectionActual', StringType(), True), StructField('returnDirectionIntended', StringType(), True), StructField('returnDirectionActual', StringType(), True), StructField('missedTackler', StringType(), True), StructField('assistTackler', StringType(), True), StructField('tackler', StringType(), True), StructField('kickoffReturnFormation', StringType(), True), StructField('gunners', StringType(), True), StructField('puntRushers', StringType(), True), StructField('specialTeamsSafeties', StringType(), True), StructField('vises', StringType(), True), StructField('kickContactType', StringType(), 

In [0]:
# Start from a clean destination: recursively remove whatever a previous
# execution left in the bronze folder so the run stays idempotent.
dbutils.fs.rm(destDataDirRoot, recurse=True)

True

### Storage in Bronze layer

In [0]:

# Read the CSV from the raw data source using the schema and the path defined above.
# "NA" is declared as the null marker at parse time, so it becomes a real null in
# every column (including the float ones) instead of being handled as a malformed
# value and patched afterwards with a string replace.
nflPFFScoutingDataDf = (
    spark.read.format("csv")
    .option("header", True)
    .option("delimiter", ",")
    .option("nullValue", "NA")
    .schema(nflPFFScoutingDataschema)
    .load(f"{srcDataDirRoot}/PFFScoutingData.csv")
    .cache()
)

# Order all columns to align with the canonical schema
nflPFFScoutingDataDfCanocical = nflPFFScoutingDataDf.select(*canonicalTripSchemaColList)

display(nflPFFScoutingDataDfCanocical)

# To make Hive Parquet format compatible with Spark Parquet format.
# Set through the SparkSession conf; sqlContext is a deprecated entry point.
spark.conf.set("spark.sql.parquet.writeLegacyFormat", "true")

# Save the files in the bronze directory.
# "overwrite" keeps this cell idempotent on its own: re-running it without the
# cleanup cell replaces the data instead of appending duplicate rows.
nflPFFScoutingDataDfCanocical.write.format("delta").mode("overwrite").save(destDataDirRoot)