- # Silver layer
    - ## Cleaned, deduplicated, typed, and validated

In [None]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

# Describe a local Spark session with the Delta Lake SQL extension and catalog
# registered, then let the delta helper finish the configuration before the
# session is created (or an existing one is reused).
builder = (
    SparkSession.builder
    .appName("LocalDeltaLake")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
from pyspark.sql.functions import col, to_date

# Parquet locations, relative to the notebook's working directory:
# the bronze data that is read, and the silver data that is written.
bronze_input = "../data/bronze/openpowerlifting_bronze.parquet"
silver_output = "../data/silver/openpowerlifting_silver.parquet"

def run_etl(input_path: str, output_path: str):
    """Build the silver dataset from the bronze Parquet data.

    Reads ``input_path`` with the notebook-level ``spark`` session, parses
    the ``Date`` column into a date-typed ``date`` column, drops every row
    that has a NULL in one of the required columns, and overwrites
    ``output_path`` with the result in Parquet format.

    Args:
        input_path: Location of the bronze Parquet data to read.
        output_path: Location the silver Parquet data is written to; any
            data already there is overwritten.

    Returns:
        The cleaned DataFrame that was written to ``output_path``.
    """
    df = spark.read.parquet(input_path)

    required_cols = ["Name", "Sex", "Event", "Equipment", "Age", "date"]

    df_clean = (
        # Parse the date BEFORE filtering: values that to_date cannot parse
        # come back as NULL (with ANSI mode off), so filtering afterwards also
        # removes rows with an unusable date instead of writing them to silver.
        # It also guarantees the "date" column named in required_cols exists
        # regardless of the session's case-sensitivity setting.
        df.withColumn("date", to_date(col("Date"), "yyyy-MM-dd"))
          .na.drop(subset=required_cols)
    )

    df_clean.write.format("parquet").mode("overwrite").save(output_path)

    return df_clean



# Run the bronze -> silver step; df_silver is the DataFrame inspected by the cells below.
df_silver = run_etl(bronze_input, silver_output)

# Exploratory data analysis (EDA)

In [None]:
# Print the column names and data types of the silver DataFrame.
df_silver.printSchema()

In [None]:
# Print the number of rows in the silver DataFrame.
print(df_silver.count())

In [None]:
# Number of NULL values in each column of the silver DataFrame.
# The functions module is imported under an alias so that Spark's sum() does
# not replace Python's built-in sum() for the rest of the notebook.
from pyspark.sql import functions as F

df_silver.select([
    # isNull() -> boolean, cast to 0/1, summed to give the NULL count per column.
    F.sum(F.col(c).isNull().cast("int")).alias(c)
    for c in df_silver.columns
]).show()


In [None]:
# Row counts per Sex value, most frequent first.
(
    df_silver
    .groupBy("Sex")
    .count()
    .sort("count", ascending=False)
    .show()
)

In [None]:
# Summary statistics for Age, followed by the row count at each Age value
# listed in ascending Age order.
df_silver.describe("Age").show()
(
    df_silver
    .groupBy("Age")
    .count()
    .sort("Age")
    .show()
)


In [None]:
# Row counts per AgeClass value, most frequent first.
(
    df_silver
    .groupBy("AgeClass")
    .count()
    .sort("count", ascending=False)
    .show()
)

In [None]:
# Row counts per BirthYearClass value, most frequent first.
(
    df_silver
    .groupBy("BirthYearClass")
    .count()
    .sort("count", ascending=False)
    .show()
)

In [None]:
# Row counts per Event value (no particular order).
(
    df_silver
    .groupBy("Event")
    .count()
    .show()
)

In [None]:
# Row counts per Equipment value (no particular order).
(
    df_silver
    .groupBy("Equipment")
    .count()
    .show()
)

In [None]:
# Row counts per Tested value (no particular order).
(
    df_silver
    .groupBy("Tested")
    .count()
    .show()
)

In [None]:
# The 20 most frequent Federation values, then the 20 most frequent
# ParentFederation values, each with its row count.
(
    df_silver
    .groupBy("Federation")
    .count()
    .sort("count", ascending=False)
    .show(20)
)
(
    df_silver
    .groupBy("ParentFederation")
    .count()
    .sort("count", ascending=False)
    .show(20)
)


In [None]:
# Summary statistics for the best-lift, total, and score columns.
df_silver.describe(
    "Best3SquatKg",
    "Best3BenchKg",
    "Best3DeadliftKg",
    "TotalKg",
    "Dots",
    "Wilks",
).show()

In [None]:
# Collect the best-lift, total, and score columns into a pandas DataFrame on the
# driver and display their pairwise correlation matrix.
pdf = (
    df_silver
    .select(
        "Best3SquatKg",
        "Best3BenchKg",
        "Best3DeadliftKg",
        "TotalKg",
        "Dots",
        "Wilks",
    )
    .toPandas()
)
pdf.corr()

In [None]:
# Most frequent Country values, then the 20 most frequent MeetCountry values,
# each with its row count.
(
    df_silver
    .groupBy("Country")
    .count()
    .sort("count", ascending=False)
    .show()
)
(
    df_silver
    .groupBy("MeetCountry")
    .count()
    .sort("count", ascending=False)
    .show(20)
)


In [None]:
# The 20 most frequent State values, then the 20 most frequent MeetState
# values, each with its row count.
(
    df_silver
    .groupBy("State")
    .count()
    .sort("count", ascending=False)
    .show(20)
)
(
    df_silver
    .groupBy("MeetState")
    .count()
    .sort("count", ascending=False)
    .show(20)
)
