# 02. Transformação e Limpeza - Camada Silver

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col, trim, lower, regexp_replace, sum, avg, first, exp, when, row_number, concat, lit, concat_ws
from pyspark.sql.window import Window
from pyspark.sql.types import StringType
import unicodedata

In [0]:
# Load the five bronze tables consumed by this notebook. The names on the
# left are bound, in order, to the tables listed on the right.
(
    stats_df,
    adp_df,
    proj_df,
    player_info_df,
    country_map_df,
) = [
    spark.read.table(bronze_table)
    for bronze_table in (
        "workspace.nba_lakehouse_bronze.nba_stats_bronze",
        "workspace.nba_lakehouse_bronze.nba_adp_bronze",
        "workspace.nba_lakehouse_bronze.nba_projections_bronze",
        "workspace.nba_lakehouse_bronze.kaggle_player_info_bronze",
        "workspace.nba_lakehouse_bronze.country_codes_bronze",
    )
]

In [0]:
# Print the schema of every bronze DataFrame, in the order they were loaded.
for bronze_df in (stats_df, adp_df, proj_df, player_info_df, country_map_df):
    bronze_df.printSchema()

### Funções de Limpeza e Normalização

In [0]:
def remove_accents(input_str):
    """Strip diacritical marks from a string.

    The text is decomposed with Unicode NFKD normalization and every
    combining character is then dropped, so "Dončić" becomes "Doncic".

    Args:
        input_str: Text to normalize, or None.

    Returns:
        The text without combining characters, or None when input_str is None.
    """
    if input_str is None:
        return None
    decomposed = unicodedata.normalize('NFKD', input_str)
    # unicodedata.combining() returns 0 for characters that are not combining marks.
    base_chars = filter(lambda ch: unicodedata.combining(ch) == 0, decomposed)
    return "".join(base_chars)

# Spark UDF wrapper around remove_accents so it can be applied to string columns.
remove_accents_udf = F.udf(f=remove_accents, returnType=StringType())

In [0]:
def clean_stats(df):
    """Reduce the stats table to one row per player and add shooting percentages.

    Steps:
      1. Keep a single row per "Player". Rows whose "Team" matches the
         case-insensitive regex "TOT" are ranked first, so such a row is
         the one kept when it exists.
         NOTE(review): presumably "TOT" marks the combined line of a player
         who changed teams - confirm against the bronze source. When no row
         matches, this ordering does not decide which row is kept.
      2. Build "Player_Key": the accent-stripped name with every character
         that is neither a regex word character nor whitespace removed,
         then lower-cased and trimmed.
      3. Group by ("Player_Key", "Player") taking the first value of each
         stat column, then derive FG_PCT, 3P_PCT and FT_PCT as made/attempted.

    Args:
        df: DataFrame providing Player, Team, Pos, G, MP, PTS, TRB, AST, STL,
            BLK, TOV, FG, FGA, FT, FTA, 3P and 3PA columns.

    Returns:
        DataFrame with Player_Key, Player, Pos, G, MP, PTS, TRB, AST, STL,
        BLK, TOV, FG_PCT, 3P_PCT and FT_PCT.
    """
    # 0 for rows matching "TOT", 1 otherwise, so matching rows get row_number 1.
    tot_first = when(col("Team").rlike("(?i)TOT"), 0).otherwise(1)
    per_player = Window.partitionBy("Player").orderBy(tot_first)
    ranked = df.withColumn("row_num", row_number().over(per_player))
    one_row_per_player = ranked.filter(col("row_num") == 1).drop("row_num")

    key_expr = trim(lower(regexp_replace(col("normalized_player"), r'[^\w\s]', '')))
    keyed = (
        one_row_per_player
        .withColumn("normalized_player", remove_accents_udf(col("Player")))
        .withColumn("Player_Key", key_expr)
    )

    # Columns carried through the aggregation unchanged (same name in and out).
    carried_columns = (
        "Pos", "G", "MP", "PTS", "TRB", "AST", "STL", "BLK", "TOV",
        "FG", "FGA", "FT", "FTA", "3P", "3PA",
    )
    aggregated = keyed.groupBy("Player_Key", "Player").agg(
        *[first(name).alias(name) for name in carried_columns]
    )

    # Percentages are plain ratios of made over attempted, computed with
    # try_divide rather than the "/" operator.
    with_percentages = (
        aggregated
        .withColumn("FG_PCT", F.try_divide(col("FG"), col("FGA")))
        .withColumn("3P_PCT", F.try_divide(col("3P"), col("3PA")))
        .withColumn("FT_PCT", F.try_divide(col("FT"), col("FTA")))
    )

    return with_percentages.select(
        "Player_Key", "Player", "Pos", "G", "MP",
        "PTS", "TRB", "AST", "STL", "BLK", "TOV",
        "FG_PCT", "3P_PCT", "FT_PCT"
    )

In [0]:
def clean_adp(df):
    """Project the ADP table onto the join key and its two ranking columns.

    "Player_Key" is the accent-stripped "Player" name with every character
    that is neither a regex word character nor whitespace removed, then
    lower-cased and trimmed.

    Args:
        df: DataFrame providing Player, Rank and AVG columns.

    Returns:
        DataFrame with Player_Key, ADP_Rank (from Rank) and ADP_Avg (from AVG).
    """
    key_expr = trim(lower(regexp_replace(col("normalized_player"), r'[^\w\s]', '')))
    keyed = (
        df.withColumn("normalized_player", remove_accents_udf(col("Player")))
        .withColumn("Player_Key", key_expr)
    )
    output_columns = [
        col("Player_Key"),
        col("Rank").alias("ADP_Rank"),
        col("AVG").alias("ADP_Avg"),
    ]
    return keyed.select(*output_columns)

In [0]:
def clean_proj(df):
    """Project the projections table onto the join key and renamed stat columns.

    "Player_Key" is the accent-stripped "Player" name with every character
    that is neither a regex word character nor whitespace removed, then
    lower-cased and trimmed.

    Args:
        df: DataFrame providing Player, PTS, TRB, AST, STL, BLK and TOV columns.

    Returns:
        DataFrame with Player_Key and the six stat columns renamed with a
        "Projected_" prefix.
    """
    key_expr = trim(lower(regexp_replace(col("normalized_player"), r'[^\w\s]', '')))
    keyed = (
        df.withColumn("normalized_player", remove_accents_udf(col("Player")))
        .withColumn("Player_Key", key_expr)
    )

    # Source column -> output column, in output order.
    renamed = {
        "PTS": "Projected_PTS",
        "TRB": "Projected_TRB",
        "AST": "Projected_AST",
        "STL": "Projected_STL",
        "BLK": "Projected_BLK",
        "TOV": "Projected_TOV",
    }
    return keyed.select(
        col("Player_Key"),
        *[col(source).alias(target) for source, target in renamed.items()]
    )

In [0]:
def transform_player_info_to_silver(df_bronze, df_country_map):
    """Build the silver player dimension from the bronze player-info table.

    Adds a display name, the normalized join key, a headshot URL and a flag
    URL, then selects and renames the dimension columns.

    Args:
        df_bronze: DataFrame providing fname, lname, playerid, country,
            position, height, weight, birthday, school, draft_year,
            draft_round and draft_number columns.
        df_country_map: DataFrame providing Name and Code columns; it is
            left-joined on country == Name.

    Returns:
        DataFrame with Player_Key, Player_ID_NBA, Player_Name, Image_URL,
        Flag_URL, Position, Height_Str, Weight_kg, Birthdate, Country,
        School, Draft_Year, Draft_Round and Draft_Number.
    """
    # Player_Name is "fname lname". Player_Key is that name accent-stripped,
    # with every character that is neither a regex word character nor
    # whitespace removed, then lower-cased and trimmed.
    df_with_key = df_bronze.withColumn(
        "Player_Name",
        concat_ws(' ', col("fname"), col("lname"))
    ).withColumn(
        "Player_Key",
        trim(lower(regexp_replace(remove_accents_udf(col("Player_Name")), r'[^\w\s]', '')))
    )

    df_with_url = df_with_key.withColumn(
        "Image_URL",
        concat(
            lit("https://cdn.nba.com/headshots/nba/latest/1040x760/"),
            col("playerid"),
            lit(".png")
        )
    )

    # Flag file names are written in lowercase (the fallback "un.png" below
    # follows the same convention), so the code is lower-cased before it is
    # interpolated. The casing stored in the country map is not visible from
    # this notebook; lower() is a no-op when it is already lowercase.
    flag_code = lower(col("Code"))

    # Left join keeps players whose country has no match; they get the
    # fallback flag because Code is then NULL.
    df_with_flag = df_with_url.join(
        df_country_map,
        df_with_url.country == df_country_map.Name,
        "left"
    ).withColumn(
        "Flag_URL",
        when(col("Code").isNotNull(),
             concat(lit("https://flagcdn.com/w40/"), flag_code, lit(".png")))
        .otherwise(lit("https://flagcdn.com/w40/un.png"))
    )

    df_silver_players = df_with_flag.select(
        col("Player_Key"),
        col("playerid").alias("Player_ID_NBA"),
        col("Player_Name"),
        col("Image_URL"),
        col("Flag_URL"),
        col("position").alias("Position"),
        col("height").alias("Height_Str"),
        col("weight").alias("Weight_kg"),
        col("birthday").alias("Birthdate"),
        col("country").alias("Country"),
        col("school").alias("School"),
        col("draft_year").alias("Draft_Year"),
        col("draft_round").alias("Draft_Round"),
        col("draft_number").alias("Draft_Number")
    )

    return df_silver_players

### Execução da Limpeza e Transformação

In [0]:
# Apply each cleaning function to its bronze DataFrame.
stats_clean, adp_clean, proj_clean = (
    clean_stats(stats_df),
    clean_adp(adp_df),
    clean_proj(proj_df),
)

# Player dimension, built from the player-info table and the country map.
df_silver_player_dims = transform_player_info_to_silver(
    df_bronze=player_info_df,
    df_country_map=country_map_df,
)

### Junção e Geração da Tabela Fato Silver

In [0]:
# Attach ADP and projection columns to the cleaned stats by Player_Key.
_stats_with_adp = stats_clean.join(adp_clean, on="Player_Key", how="left")
_stats_adp_proj = _stats_with_adp.join(proj_clean, on="Player_Key", how="left")

# Keep only players that have an ADP rank.
_ranked_only = _stats_adp_proj.filter(col("ADP_Rank").isNotNull())

# Last-season stats get a "Past_Season_" prefix; projections and ADP columns
# keep the names assigned by their cleaning functions.
_fact_columns = [
    "Player_Key", "Player", "Pos", "G",
    col("PTS").alias("Past_Season_PTS"),
    col("TRB").alias("Past_Season_TRB"),
    col("AST").alias("Past_Season_AST"),
    col("STL").alias("Past_Season_STL"),
    col("BLK").alias("Past_Season_BLK"),
    col("TOV").alias("Past_Season_TOV"),
    "Projected_PTS", "Projected_TRB", "Projected_AST",
    "Projected_STL", "Projected_BLK", "Projected_TOV",
    "ADP_Rank", "ADP_Avg",
]
silver_df = _ranked_only.select(*_fact_columns)

In [0]:
# Preview the first 10 rows of the fact table.
_silver_preview = silver_df.limit(10)
display(_silver_preview)

### Exportação para Camada Silver

In [0]:
def save_dataframe_as_table(df, table_name, mode="overwrite"):
    """Write a DataFrame as a Delta table and print a confirmation message.

    Args:
        df: DataFrame to persist.
        table_name: Name of the target table passed to saveAsTable.
        mode: Save mode passed to the writer; defaults to "overwrite".

    Returns:
        None.
    """
    writer = df.write.mode(mode).format("delta")
    # The overwriteSchema option is set for every mode, not only "overwrite".
    writer = writer.option("overwriteSchema", "true")
    writer.saveAsTable(table_name)
    print(f"Tabela '{table_name}' salva com sucesso.")

In [0]:
# Persist the fact table first, then the player dimension.
for _frame, _target_table in (
    (silver_df, "workspace.nba_lakehouse_silver.fato_ranking_fantasy"),
    (df_silver_player_dims, "workspace.nba_lakehouse_silver.dim_jogadores_silver"),
):
    save_dataframe_as_table(_frame, _target_table)