### **Import Libraries**

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType,FloatType,DoubleType, TimestampType, DateType
from pyspark.sql.functions import *

### Execute notebook with common/reusable functions 

In [0]:
%run "../01-General/02-CommonFunctions"

### Connect to the storage

In [0]:
# connect() is not defined in this notebook — presumably it is provided by the
# CommonFunctions notebook executed above (TODO confirm). Its return value is the
# wasbs:// root that route() uses below to build the source/destination paths.
wasbs_path = connect()

Remote blob path: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/


### Create Paths

In [0]:
# Resolve the bronze (source) and silver (destination) folders for the "games" dataset
srcDataDirRoot, destDataDirRoot = route(wasbs_path, "games", "bronze", "silver")

# Echo both locations, one per line, so the run log shows where data is read and written
print(
    f"Source data dir: {srcDataDirRoot}",
    f"Destination data dir: {destDataDirRoot}",
    sep="\n",
)

Source data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/bronze/nfl-2022/
Destination data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/silver/nfl-2022/games/


### Read the files from the bronze layer

In [0]:
# Read the bronze "games" table in delta format.
# The location is derived from srcDataDirRoot (resolved by route() in the previous cell)
# instead of a hardcoded storage URL, so the notebook keeps working when the storage
# account or container returned by connect() changes.
gamesBronzePath = srcDataDirRoot.rstrip("/") + "/games"
nflGamesBronze = spark.read.format("delta").load(gamesBronzePath).cache()

# Check the data
display(nflGamesBronze)

### Exploration of the Data

In [0]:
# Register the bronze DataFrame as a temporary view so the %sql cell below can query it by name
nflGamesBronze.createOrReplaceTempView("nflGamesBronze_View")

In [0]:
%sql
-- Number of games recorded for each season, most recent season first
SELECT
    season,
    COUNT(gameId) AS Games_Count
FROM nflGamesBronze_View
GROUP BY season
ORDER BY season DESC

season,Games_Count
2020,256
2019,255
2018,253


In [0]:
# Number of games each team played as the home team in the 2020 season
display(
    nflGamesBronze
    .where(col("season") == 2020)
    .groupBy(col("homeTeamAbbr"))
    .agg(count(col("gameId")).alias("Games_Count"))
)

homeTeamAbbr,Games_Count
NYJ,8
CAR,8
LA,8
TB,8
DET,8
TEN,8
BUF,8
BAL,8
LAC,8
NE,8


In [0]:
# How many games did each team play as the visitor in the 2019 season?
# NOTE: the filter below selects season 2019 (not 2020, which the home-team cell above uses).
display(
    nflGamesBronze
    .filter(col("season") == 2019) 
    .groupBy("visitorTeamAbbr")
    .agg(count("gameId").alias("Games_Count"))
)

visitorTeamAbbr,Games_Count
NYJ,8
CAR,8
LA,8
TB,8
OAK,8
DET,8
TEN,8
BUF,8
BAL,8
LAC,8


### Create new Columns for Games DataFrame


### 1. `is_playoff_flag`
- **Description**: A binary column indicating whether a game is part of the playoffs.
- **Logic**: A value of `1` is assigned if the `gameDate` falls within the playoff period of the 2018, 2019, or 2020 season (each season's playoffs are played in January of the following calendar year). Otherwise, the value is `0`.
- **Usefulness**: This column helps differentiate between regular season and playoff games, allowing for more focused analysis of team and player performance during high-stakes playoff games.

### 2. `is_superbowl_flag`
- **Description**: A binary column indicating whether a game is the Super Bowl.
- **Logic**: A value of `1` is assigned if the `gameDate` matches the specific dates of the Super Bowls for the seasons 2018, 2019, and 2020. Otherwise, the value is `0`.
- **Usefulness**: This column is critical for isolating Super Bowl games, enabling specialized analysis of the most important game of the season.

### 3. `is_weekend_flag`
- **Description**: A binary column that flags if a game occurred on a weekend (Saturday or Sunday).
- **Logic**: A value of `1` is assigned if the game was played on a Saturday (`day_of_the_week = 7`) or a Sunday (`day_of_the_week = 1`). Otherwise, the value is `0`.
- **Usefulness**: NFL games are predominantly played on weekends, and this column can be used to analyze game attendance, TV viewership, or player performance on weekend vs. weekday games.

### 4. `game_hour`
- **Description**: Extracts the hour portion from the `gameTimeEastern` field (game start time).
- **Logic**: This column takes the first two characters from `gameTimeEastern`, which represent the hour in Eastern Time.
- **Usefulness**: By isolating the hour of the game, this column enables analysis of game times and how they may influence player performance, viewership, or other time-dependent factors.

### 5. `is_night_hour_flag`
- **Description**: A binary column indicating whether a game was played during night hours.
- **Logic**: A value of `1` is assigned if the `game_hour` falls between 18 (6 PM) and 23 (11 PM), indicating nighttime. Otherwise, the value is `0`.
- **Usefulness**: This column helps identify night games, which often have different conditions (e.g., cooler temperatures, higher viewership) compared to day games. It allows for the segmentation of games based on time-of-day factors.

---

In [0]:

# New columns for the silver layer: playoff / Super Bowl / weekend / night flags and game hour.

# Inclusive playoff date windows (wild-card round through conference championships).
# A season's playoffs are played in January of the following calendar year.
PLAYOFF_WINDOWS = [
    ("2018-01-06", "2018-01-21"),  # 2017 season (kept so existing behaviour is unchanged)
    ("2019-01-05", "2019-01-20"),  # 2018 season
    ("2020-01-04", "2020-01-19"),  # 2019 season
    ("2021-01-09", "2021-01-24"),  # 2020 season (was missing although season 2020 is in the data)
]

# Super Bowl dates. The first entry used to be 2018-01-23, which is not a Super Bowl date.
SUPERBOWL_DATES = [
    "2018-02-04",  # Super Bowl LII  (2017 season)
    "2019-02-03",  # Super Bowl LIII (2018 season)
    "2020-02-02",  # Super Bowl LIV  (2019 season)
    "2021-02-07",  # Super Bowl LV   (2020 season)
]

# Kick-off hours (Eastern, 24h clock) that count as night games, inclusive
NIGHT_HOUR_START, NIGHT_HOUR_END = 18, 23

game_date_col = col("gameDate")

# OR together one condition per playoff window; literals are cast to DATE so the
# comparison with the DateType gameDate column does not rely on implicit string coercion.
is_playoff = lit(False)
for window_start, window_end in PLAYOFF_WINDOWS:
    is_playoff = is_playoff | game_date_col.between(
        lit(window_start).cast("date"), lit(window_end).cast("date")
    )

# Same approach for the individual Super Bowl dates
is_superbowl = lit(False)
for superbowl_date in SUPERBOWL_DATES:
    is_superbowl = is_superbowl | (game_date_col == lit(superbowl_date).cast("date"))

# dayofweek(): 1 = Sunday, 7 = Saturday
is_weekend = dayofweek(game_date_col).isin(1, 7)

# First two characters of gameTimeEastern are the hour; substring() positions are 1-based.
# game_hour stays a string column (unchanged silver schema); it is cast explicitly only
# for the numeric night-hours comparison.
game_hour = substring(col("gameTimeEastern"), 1, 2)
is_night = game_hour.cast("int").between(NIGHT_HOUR_START, NIGHT_HOUR_END)

# when(...).otherwise(0) maps both False and NULL conditions to 0, so the flags are never null
nflGamesSilver = (
    nflGamesBronze
    .withColumn("is_playoff_flag", when(is_playoff, 1).otherwise(0))
    .withColumn("is_superbowl_flag", when(is_superbowl, 1).otherwise(0))
    .withColumn("is_weekend_flag", when(is_weekend, 1).otherwise(0))
    .withColumn("game_hour", game_hour)
    .withColumn("is_night_hour_flag", when(is_night, 1).otherwise(0))
)

# Check the data
display(nflGamesSilver)


In [0]:
# Show the silver DataFrame's schema to verify the new columns and their data types
display(nflGamesSilver.schema)

StructType([StructField('gameId', IntegerType(), True), StructField('season', IntegerType(), True), StructField('week', IntegerType(), True), StructField('gameDate', DateType(), True), StructField('gameTimeEastern', StringType(), True), StructField('homeTeamAbbr', StringType(), True), StructField('visitorTeamAbbr', StringType(), True), StructField('is_playoff_flag', IntegerType(), False), StructField('is_superbowl_flag', IntegerType(), False), StructField('is_weekend_flag', IntegerType(), False), StructField('game_hour', StringType(), True), StructField('is_night_hour_flag', IntegerType(), False)])

### Save the data in the **Silver** Layer

In [0]:
# Delete any residual data from prior executions for an idempotent run.
# recurse=True removes everything under the destination folder; the write cell
# below saves the dataset to this same location again.
dbutils.fs.rm(destDataDirRoot,recurse=True)

True

In [0]:
# To make Hive Parquet format compatible with Spark Parquet format.
# Set through the SparkSession runtime config; sqlContext is the legacy entry point.
spark.conf.set("spark.sql.parquet.writeLegacyFormat", "true")

# Save the dataset with the new columns, partitioned by season.
# mode("overwrite") makes this cell idempotent on its own: re-running it replaces the
# table contents instead of appending a duplicate copy of every row when the cleanup
# cell above was skipped. After a cleanup the result is the same as the former append.
nflGamesSilver.write.format("delta").mode("overwrite").partitionBy("season").save(destDataDirRoot)