### **Import Libraries**

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType,FloatType,DoubleType, TimestampType, DateType
from pyspark.sql.functions import *

### Execute notebook with common/reusable functions 

In [0]:
%run "../01-General/02-CommonFunctions"

### Connect to the storage

In [0]:
# Call the shared connect() helper and keep its return value as the storage root
# path that the following cells build their source/destination paths from.
# NOTE(review): connect() is presumably defined by the %run of 02-CommonFunctions — confirm.
wasbs_path = connect()

Remote blob path: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/


### Create Paths

In [0]:
# Resolve the source and destination roots for the "tracking" dataset
# (bronze -> silver) via the shared route() helper, then echo both paths
# so the run log shows exactly where data is read from and written to.
srcDataDirRoot, destDataDirRoot = route(
    wasbs_path,
    "tracking",
    "bronze",
    "silver",
)

print(f"Source data dir: {srcDataDirRoot}")
print(f"Destination data dir: {destDataDirRoot}")

Source data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/bronze/nfl-2022/
Destination data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/silver/nfl-2022/tracking/


### Read the files from the bronze layer

In [0]:
# Read the bronze tracking data (Delta format) and cache it, because the
# exploration cells and the silver transformation below all reuse this DataFrame.
# srcDataDirRoot may already end with "/", so strip it before appending the
# dataset folder; otherwise the resulting path contains a "//".
nflTrackingBronze = (
    spark.read.format("delta")
    .load(f"{srcDataDirRoot.rstrip('/')}/tracking")
    .cache()
)

# Check the data
display(nflTrackingBronze)

### Explore the data

In [0]:
# Register the bronze DataFrame as a temporary view so that the %sql cells
# below can query it by the name "nflTrackingBronze_View".
nflTrackingBronze.createOrReplaceTempView("nflTrackingBronze_View")

In [0]:
%sql
-- Enumerate every distinct value found in the event column
SELECT DISTINCT
  event
FROM
  nflTrackingBronze_View


event
punt_play
qb_strip_sack
first_contact
snap_direct
field_goal
punt_fake
""
extra_point_missed
fumble_defense_recovered
fumble


In [0]:
%sql
-- Per-player maximum and minimum of dis, restricted to the rows of year 2018.
-- NOTE(review): the schema printed later in this notebook lists year as a string
-- column, so the integer literal below depends on implicit type coercion.
SELECT
  displayName,
  max(dis),
  min(dis)
FROM
  nflTrackingBronze_View
WHERE
  year = 2018
GROUP BY
  displayName

In [0]:
# Count the distinct gameId values per year and show them ordered by year.
games_per_year = (
    nflTrackingBronze
    .groupBy("year")
    .agg(countDistinct("gameId").alias("Number of games"))
    .orderBy("year")
)
display(games_per_year)

year,Number of games
2018,253
2019,255
2020,240
2021,16


### Create new Columns for Tracking DataFrame


## 1. `is_endzone_flag`
- **Description**: This flag indicates whether a player is located within an endzone during a specific moment of the game.
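- **Logic**: The `x` coordinate is used to determine if the player is inside the endzone. If `x` is between 0 and 20 (team's defensive endzone) or between 100 and 120 (opponent's endzone), both ranges inclusive, the flag is set to 1. Otherwise, it is 0. Note that a regulation NFL endzone is 10 yards deep, so if `x` is measured in yards along the 120-yard field, these ranges also cover the 10 yards of playing field next to each endzone.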
- **Usefulness**: This column is crucial for analyzing events that occur within the endzones, such as touchdowns, safeties, or key defensive plays. It helps isolate plays and moments that occur in critical areas of the field.

## 2. `s_ms` (Speed in meters per second)
- **Description**: Converts the player's speed from yards per second to meters per second.
- **Logic**: The player's speed (`s`) is multiplied by 0.9144, which is the conversion factor from yards to meters.
- **Usefulness**: This makes the speed metric compatible with international units (SI units), facilitating easier analysis and comparisons, especially for studies involving physical performance metrics or international audiences.

## 3. `a_ms2` (Acceleration in meters per second squared)
- **Description**: Converts the player's acceleration from yards per second² to meters per second².
- **Logic**: The acceleration (`a`) is multiplied by 0.9144 to convert yards per second² to meters per second².
- **Usefulness**: Similar to speed, this conversion standardizes acceleration measurements to the metric system, making it easier to analyze acceleration data in scientific or international contexts.

## 4. `dis_m` (Distance traveled in meters)
- **Description**: Converts the distance a player travels between frames from yards to meters.
- **Logic**: The distance (`dis`) is multiplied by 0.9144 to convert yards to meters.
- **Usefulness**: Like the other metrics, this standardizes the unit of measurement to meters, making it easier for users who work with the metric system or want to compare player performance with scientific studies that typically use meters.

## 5. `date`
- **Description**: Extracts the date from the `time` column.
- **Logic**: The `to_date` function is used to extract only the date (YYYY-MM-DD) from the timestamp.
- **Usefulness**: Separating the date makes it easier to analyze and group data based on specific days, allowing analysts to study game events or trends across different game dates.

## 6. `hour`
- **Description**: Extracts the hour of the day from the `time` column.
- **Logic**: The `hour` function is used to extract the hour (HH) from the timestamp.
- **Usefulness**: Analyzing game data by the hour can help study the impact of time-of-day factors on player performance, such as fatigue in later hours or higher/lower performance in night games.

---

In [0]:
# Build the silver-layer DataFrame by adding six derived columns to the bronze data.

# Single conversion factor applied to the s, a and dis columns.
YARDS_TO_METERS = 0.9144

x_pos = col("x")
# True when x lies in [0, 20] or in [100, 120] (both ranges inclusive).
# NOTE(review): a regulation NFL endzone is 10 yards deep; if x is measured in
# yards on a 0-120 field the endzones would be [0, 10] and [110, 120] —
# confirm the intended thresholds before relying on this flag.
in_endzone = x_pos.between(0, 20) | x_pos.between(100, 120)

nflTrackingSilver = (
    nflTrackingBronze
    # 1 inside the ranges above, 0 otherwise (including when x is null)
    .withColumn("is_endzone_flag", when(in_endzone, 1).otherwise(0))
    # scaled copies of the speed, acceleration and distance columns
    .withColumn("s_ms", col("s") * YARDS_TO_METERS)
    .withColumn("a_ms2", col("a") * YARDS_TO_METERS)
    .withColumn("dis_m", col("dis") * YARDS_TO_METERS)
    # calendar date and hour-of-day taken from the time column
    .withColumn("date", to_date("time"))
    .withColumn("hour", hour("time"))
)

#check the data
display(nflTrackingSilver)

In [0]:
# Show the schema of the silver DataFrame to verify that the derived columns
# were added and to inspect their data types.
display(nflTrackingSilver.schema)

StructType([StructField('time', TimestampType(), True), StructField('x', DoubleType(), True), StructField('y', DoubleType(), True), StructField('s', DoubleType(), True), StructField('a', DoubleType(), True), StructField('dis', DoubleType(), True), StructField('o', DoubleType(), True), StructField('dir', DoubleType(), True), StructField('event', StringType(), True), StructField('nflId', StringType(), True), StructField('displayName', StringType(), True), StructField('jerseyNumber', IntegerType(), True), StructField('position', StringType(), True), StructField('team', StringType(), True), StructField('frameId', StringType(), True), StructField('gameId', StringType(), True), StructField('playId', StringType(), True), StructField('playDirection', StringType(), True), StructField('year', StringType(), True), StructField('is_endzone_flag', IntegerType(), False), StructField('s_ms', DoubleType(), True), StructField('a_ms2', DoubleType(), True), StructField('dis_m', DoubleType(), True), Struct

### Save the data in the **Silver** Layer

In [0]:
#Delete any residual data from prior executions for an idempotent run
# recurse=True removes the destination directory together with everything under it.
dbutils.fs.rm(destDataDirRoot,recurse=True)

True

In [0]:
#To make Hive Parquet format compatible with Spark Parquet format
# Set through the SparkSession conf API; sqlContext is the legacy entry point.
spark.conf.set("spark.sql.parquet.writeLegacyFormat", "true")

#Save the dataset with the new columns, partitioned by year.
# "overwrite" (instead of "append") keeps this cell idempotent on its own:
# re-running it replaces the previous contents instead of duplicating every row
# when the cleanup cell above was not executed first.
nflTrackingSilver.write.format("delta").mode("overwrite").partitionBy("year").save(destDataDirRoot)