### **Import Libraries**

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType,FloatType,DoubleType, TimestampType, DateType
from pyspark.sql.functions import *

### Execute notebook with common/reusable functions 

In [0]:
%run "../01-General/02-CommonFunctions"

### Connect to the storage

In [0]:
wasbs_path = connect()

Remote blob path: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/


In [0]:
# Check the connection
display(
    dbutils.fs.ls(
        f"{wasbs_path}/raw/nfl-2022/"
    )
)

path,name,size,modificationTime
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/PFFScoutingData.csv,PFFScoutingData.csv,2056451,1728705562000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/games.csv,games.csv,40596,1728705558000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/players.csv,players.csv,175869,1728705559000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/plays.csv,plays.csv,4040236,1728705564000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2018.csv,tracking2018.csv,1736922582,1728706731000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2019.csv,tracking2019.csv,1653130011,1728706711000
wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/tracking2020.csv,tracking2020.csv,1607558364,1728706700000


### Create Paths

In [0]:
# Define source and destination directories
srcDataDirRoot,destDataDirRoot = route(wasbs_path,"tracking","raw","bronze")
print(f"Source data dir: {srcDataDirRoot}")
print(f"Destination data dir: {destDataDirRoot}")

Source data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/raw/nfl-2022/
Destination data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/bronze/nfl-2022/tracking/


### Create Schema for Games files

## Tracking Dataset

This dataset contains player tracking data from NFL games, including time-stamped positional and movement details of each player on the field.

| Column Name      | Description                                                                               |
|------------------|-------------------------------------------------------------------------------------------|
| `time`           | Time stamp of play (yyyy-mm-dd, hh:mm:ss).                                                |
| `x`              | Player position along the long axis of the field, from 0 to 120 yards (numeric).           |
| `y`              | Player position along the short axis of the field, from 0 to 53.3 yards (numeric).         |
| `s`              | Player speed in yards per second (numeric).                                               |
| `a`              | Player acceleration in yards per second squared (numeric).                                |
| `dis`            | Distance traveled from prior time point in yards (numeric).                                |
| `o`              | Player orientation in degrees (0 - 360) (numeric).                                        |
| `dir`            | Angle of player motion in degrees (0 - 360) (numeric).                                    |
| `event`          | Details of tagged events during play (e.g., ball snap, pass release, etc.) (text).         |
| `nflId`          | Unique player identification number (numeric).                                            |
| `displayName`    | Player name (text).                                                                       |
| `jerseyNumber`   | Player's jersey number (numeric).                                                         |
| `position`       | Player's position group (text).                                                           |
| `team`           | Team designation, either "home" or "away" (text).                                         |
| `frameId`        | Frame identifier for each play (numeric).                                                 |
| `gameId`         | Unique game identifier (numeric).                                                         |
| `playId`         | Play identifier, not unique across games (numeric).                                       |
| `playDirection`  | Direction in which the offense is moving (left or right) (text).                          |

In [0]:
#Canonical ordered column list for tracking files
canonicalTripSchemaColList = [
    "time","x","y","s","a","dis","o","dir","event","nflId","displayName",
    "jerseyNumber","position","team","frameId","gameId","playId","playDirection"
]

In [0]:
#Schema for tracking files
nflTrackingSchema = StructType([
    StructField('time', TimestampType(), True), 
    StructField('x', DoubleType(), True), 
    StructField('y', DoubleType(), True), 
    StructField('s', DoubleType(), True), 
    StructField('a', DoubleType(), True), 
    StructField('dis', DoubleType(), True), 
    StructField('o', DoubleType(), True), 
    StructField('dir', DoubleType(), True), 
    StructField('event', StringType(), True), 
    StructField('nflId', IntegerType(), True), 
    StructField('displayName', StringType(), True), 
    StructField('jerseyNumber', IntegerType(), True), 
    StructField('position', StringType(), True),
    StructField('team', StringType(), True), 
    StructField('frameId', StringType(), True), 
    StructField('gameId', IntegerType(), True), 
    StructField('playId', IntegerType(), True), 
    StructField('playDirection', StringType(), True)])


In [0]:
display(nflTrackingSchema)

StructType([StructField('time', TimestampType(), True), StructField('x', DoubleType(), True), StructField('y', DoubleType(), True), StructField('s', DoubleType(), True), StructField('a', DoubleType(), True), StructField('dis', DoubleType(), True), StructField('o', DoubleType(), True), StructField('dir', DoubleType(), True), StructField('event', StringType(), True), StructField('nflId', IntegerType(), True), StructField('displayName', StringType(), True), StructField('jerseyNumber', IntegerType(), True), StructField('position', StringType(), True), StructField('team', StringType(), True), StructField('frameId', StringType(), True), StructField('gameId', IntegerType(), True), StructField('playId', IntegerType(), True), StructField('playDirection', StringType(), True)])

### Storage in Bronze layer

In [0]:
#Delete any residual data from prior executions for an idempotent run
dbutils.fs.rm(destDataDirRoot,recurse=True)

True

In [0]:
# Create a column of year so it can be partitione by year
def getSchemaHomogenizedDataframe(sourceDF):              
    sourceDF = (sourceDF.withColumn("year",substring(col("time"),0, 4)))
    return sourceDF                      

In [0]:
# Iterate over the years
for i in range(2018,2021):
  #Read de csv from the raw data source using the schema and the path defined above 
  nfltrackingDf = (spark.read.format("csv")
                  .option("header", True)
                  .schema(nflPlayersSchema)
                  .option("delimiter",",")
                  .load(f"{srcDataDirRoot}/tracking{i}.csv").cache())

  # Change the "NA" Values for Null
  nfltrackingDf = nfltrackingDf.na.replace("NA", None)

  #Order all columns to align with the canonical schema
  nfltrackingDfCanocical = nfltrackingDf.select(*canonicalTripSchemaColList)
  
  #Create the year column
  nfltrackingDfFormatted = getSchemaHomogenizedDataframe(nfltrackingDfCanocical)

  display(nfltrackingDfFormatted)
  
  #To make Hive Parquet format compatible with Spark Parquet format
  sqlContext.setConf("spark.sql.parquet.writeLegacyFormat", "true")
  
  # Save the files in the bronze directory
  nfltrackingDfFormatted.write.format("delta").mode("append").partitionBy("year").save(destDataDirRoot) 