### **Import Libraries**

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType,FloatType,DoubleType, TimestampType, DateType
from pyspark.sql.functions import *

### Execute notebook with common/reusable functions 

In [0]:
%run "../01-General/02-CommonFunctions"

### Connect to the storage

In [0]:
wasbs_path = connect()

Remote blob path: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/


### Create Paths

In [0]:
srcDataDirRoot,destDataDirRoot = route(wasbs_path,"players","bronze","silver")
print(f"Source data dir: {srcDataDirRoot}")
print(f"Destination data dir: {destDataDirRoot}")

Source data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/bronze/nfl-2022/
Destination data dir: wasbs://nfl@bgupb202402juanbarriento.blob.core.windows.net/silver/nfl-2022/players/


### Read the files from the bronze layer

In [0]:
# Read the data in delta format
nflPlayersBronze = spark.read.format("delta").load(f"{srcDataDirRoot}/players").cache()

# Check the data
display(nflPlayersBronze)

### Errors in the dataset

In [0]:
#Counting the null values in the height colum to check an error
null_count = nflPlayersBronze.filter(col("height").isNull()).count()
print(null_count)

0


In [0]:
# The height column has some values that are not in the correct format. 
nflPlayersBronze = nflPlayersBronze.withColumn(
    "height",
    when(
        col("height").rlike("^[0-9]{2}$"),   # Change the values that are in the format 11 to 1-1
        regexp_replace(col("height"), "(\\d)(\\d)", "$1-$2")  
    ).otherwise(col("height"))  
)

### Exploration of the Data

In [0]:
#create a view of the data
nflPlayersBronze.createOrReplaceTempView("nflPlayersBronze_View")

In [0]:
%sql
-- How many player are there in each position?
SELECT Position, count(nflid) AS PlayerCount
FROM nflPlayersBronze_View
GROUP BY Position

Position,PlayerCount
K,57
OLB,200
NT,50
T,188
ILB,117
DB,42
FS,117
LB,42
QB,8
MLB,38


In [0]:
# How much a player can weight based on the height?
display(nflPlayersBronze.groupBy('height')
        .agg(max('weight').alias('Max Weight'),
             min('weight').alias('Min Weight'),
             count('height').alias('Number of Players'))
        .orderBy('height', ascending=False))

height,Max Weight,Min Weight,Number of Players
7-9,265,265,1
7-8,345,265,7
7-7,340,252,18
7-6,307,210,12
7-5,331,190,27
7-4,308,204,19
7-3,311,180,20
7-2,240,194,21
7-1,248,185,13
7-0,225,175,9


In [0]:
# How many colleges are there?
display(nflPlayersBronze.groupBy('collegeName')
                .agg(countDistinct('nflid').alias('Number of Players'))
                .orderBy('Number of Players', ascending=False))

323

### Create new Columns for Games DataFrame


### 1. `heightNumeric`
- **Description**: Converts the player's height from feet-inches format (e.g., "6-2") into a decimal format (e.g., 6.2).
- **Logic**: The `split` function breaks the height into feet and inches, then concatenates them with a decimal point between them. The result is cast as a `double` to ensure it is in numeric form.
- **Usefulness**: Converting height into a numeric format simplifies calculations and comparisons. It also makes it easier to use this data for further metric conversions and statistical modeling.

### 2. `heightMeters`
- **Description**: Converts the player's height from feet-inches format to meters.
- **Logic**: A user-defined function (`height_to_meters_udf`) is used to convert the height in decimal format to meters.
- **Usefulness**: Working with height in meters aligns with the metric system, making the data easier to interpret in international contexts or standardized scientific models. This can be particularly useful in studies requiring metric units for player performance and fitness analysis.

### 3. `weightKilograms`
- **Description**: Converts the player's weight from pounds to kilograms.
- **Logic**: The column `weight` is multiplied by 0.45 to convert pounds to kilograms.
- **Usefulness**: This allows weight to be analyzed in the metric system, ensuring consistency across datasets that might require metric units, particularly for studies or reports that involve international comparisons or need standardized measurement units.

### 4. `BMI` (Body Mass Index)
- **Description**: Calculates the Body Mass Index (BMI) for each player.
- **Logic**: BMI is calculated using the formula: BMI = weightKilograms\heightMeters^2

- **Usefulness**: BMI provides a standardized measure for assessing a player's body composition relative to their height and weight. Although BMI has limitations, it can give a basic indicator of fitness levels or whether a player is within certain weight categories for their height, which is useful in athletic performance evaluations.

---

In [0]:
# funtion to convert the feet and inches to meters
def height_to_meters(decimal_height):
    feet = int(decimal_height)
    inches = (decimal_height - feet) * 12
    height_meters = (feet * 0.3048) + (inches * 0.0254)
    return height_meters

# Register the UDF to change to double type
height_to_meters_udf = udf(height_to_meters, DoubleType())

#new columns for the silver layer
nflPlayersSilver = nflPlayersBronze.withColumn(
    "heightNumeric",
    concat(
        split(col("height"), "-")[0],
        lit("."),
        split(col("height"), "-")[1]
    ).cast("double")
).withColumn(
    "heightMeters",
    height_to_meters_udf(col("heightNumeric"))
).withColumn(
    "weightKilograms",
    col("weight") * 0.45
).withColumn(
    "BMI", #Body mass index
    col("weightKilograms") / (col("heightMeters")**2)
)

#check the data
display(nflPlayersSilver)

In [0]:
display(nflPlayersSilver.schema)

StructType([StructField('nflId', IntegerType(), True), StructField('height', StringType(), True), StructField('weight', IntegerType(), True), StructField('birthDate', TimestampType(), True), StructField('collegeName', StringType(), True), StructField('Position', StringType(), True), StructField('displayName', StringType(), True), StructField('heightNumeric', DoubleType(), True), StructField('heightMeters', DoubleType(), True), StructField('weightKilograms', DoubleType(), True), StructField('BMI', DoubleType(), True)])

### Save the data in the **Silver** Layer

In [0]:
#Delete any residual data from prior executions for an idempotent run
dbutils.fs.rm(destDataDirRoot,recurse=True)

True

In [0]:
#To make Hive Parquet format compatible with Spark Parquet format
sqlContext.setConf("spark.sql.parquet.writeLegacyFormat", "true")

#Save the dataset with the new columns
nflPlayersSilver.write.format("delta").mode("append").save(destDataDirRoot) 