# **PROJET DATA ENGINEERING**

## ARCHITECTURE DU PROJET

In [0]:
# Unity Catalog coordinates of the volume that hosts the whole project.
CATALOG = "workspace"
SCHEMA = "xhadeezeydia"
VOLUME = "capstoneipsl"

# `IF NOT EXISTS` makes this safe to re-run: the volume is created only once.
spark.sql(f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.{VOLUME}")

# Every path used by the notebook is derived from these two roots.
VOLUME_ROOT = f"/Volumes/{CATALOG}/{SCHEMA}/{VOLUME}"
PROJECT_ROOT = f"{VOLUME_ROOT}/ecommerce_project"

# Folder layout of the project, expressed relative to PROJECT_ROOT.
DIRECTORIES = [
    # Bronze layer: raw main source and enrichment source
    "data/bronze/main",
    "data/bronze/enrich",
    # Silver layer: cleaned and joined datasets
    "data/silver/main_clean",
    "data/silver/enrich_clean",
    "data/silver/joined",
    # Gold layer: marts, aggregates and exports
    "data/gold/marts",
    "data/gold/aggregates",
    "data/gold/exports",
    # Source code
    "src/ingestion",
    "src/transforms",
    "src/quality",
    "src/utils",
    # Orchestration
    "notebooks",
    "configs",
    # Reports
    "reports/data_quality",
    "reports/benchmarks",
]

# Create each folder of the layout and log its absolute path.
for d in DIRECTORIES:
    path = "/".join((PROJECT_ROOT, d))
    dbutils.fs.mkdirs(path)
    print(f"âœ“ Created: {path}")

# Final expression of the cell: list the content of the data folder.
dbutils.fs.ls("/".join((PROJECT_ROOT, "data")))

## L'Ingestion et l'Amplification (BRONZE)

In [0]:
# 1. Load source 1: every CSV in the Bronze "main" folder is read into a single
#    DataFrame (first row used as header, column types inferred by Spark).
df_raw = spark.read.csv(
    f"{PROJECT_ROOT}/data/bronze/main/*.csv",
    header=True,
    inferSchema=True,
)

# 2. Amplification: append the raw data to itself 14 times,
#    i.e. 1 original + 14 copies = 15x the source volume.
df_amplified = df_raw
for _ in range(14):
    df_amplified = df_amplified.unionAll(df_raw)

# 3. Persist the amplified dataset as Parquet, replacing any previous run.
bronze_main_path = f"{PROJECT_ROOT}/data/bronze/main/full_data.parquet"
df_amplified.write.parquet(bronze_main_path, mode="overwrite")

# 4. Source 2 (enrichment): small static reference table built in memory.
enrich_data = [
    ("electronics", "High-Tech", 0.20),
    ("appliances", "Home", 0.15),
    ("computers", "IT", 0.18),
]
df_enrich = spark.createDataFrame(
    enrich_data,
    ["category_code_prefix", "category_department", "margin_rate"],
)
df_enrich.write.parquet(
    f"{PROJECT_ROOT}/data/bronze/enrich/static_ref.parquet",
    mode="overwrite",
)

# 5. Size check: add up the Parquet files found in the output folder
#    and convert the total from bytes to GB (1024**3 bytes).
size_gb = sum(
    f.size
    for f in dbutils.fs.ls(bronze_main_path)
    if f.name.endswith(".parquet")
) / (1024**3)
print(f"âœ… BRONZE VALIDE : {size_gb:.2f} GB | Format: Parquet")

In [0]:
# Raise the amplification factor to 20x the source: 1 original + 19 unions.
df_amplified = df_raw
for _ in range(19):
    df_amplified = df_amplified.unionAll(df_raw)

# Overwrite the Bronze Parquet dataset written by the previous cell.
df_amplified.write.parquet(bronze_main_path, mode="overwrite")

# Measure the output folder again: total size of its Parquet files, in GB.
size_gb = sum(
    f.size
    for f in dbutils.fs.ls(bronze_main_path)
    if f.name.endswith(".parquet")
) / (1024**3)
print(f"ðŸš€ NOUVELLE TAILLE BRONZE : {size_gb:.2f} GB")

In [0]:
# Row count of the amplified DataFrame built above (count() runs a Spark job).
df_amplified.count()


## Zone SILVER

### Nettoyage Métier

In [0]:
from pyspark.sql.functions import col

# --- OPTIMISATION 1: column pruning
# Only the columns listed here are read from the Bronze Parquet dataset.
needed_columns = [
    "event_time",
    "event_type",
    "product_id",
    "category_id",
    "category_code",
    "brand",
    "price",
    "user_id",
    "user_session",
]

# Pruned load of the Bronze dataset.
df_bronze = (
    spark.read
    .parquet(f"{PROJECT_ROOT}/data/bronze/main/full_data.parquet")
    .select(*needed_columns)
)

# Business filter: keep rows with a user, a product and a strictly positive price.
df_cleaned = (
    df_bronze
    .filter(col("user_id").isNotNull())
    .filter(col("product_id").isNotNull())
    .filter(col("price") > 0)
)

# --- OPTIMISATION 2: repartition the rows by event type.
df_cleaned = df_cleaned.repartition("event_type")

print(f"âœ… Nettoyage mÃ©tier terminÃ©.")
# count() is an action, so this line triggers the read/filter/repartition job.
print(f"Lignes restantes : {df_cleaned.count()}")

### Standardisation (Formatage Propre)

In [0]:
from pyspark.sql.functions import lower, trim, col

# Text standardisation: trim surrounding whitespace and lowercase each
# categorical text column, replacing the column in place.
df_silver = df_cleaned
for text_column in ("event_type", "category_code", "brand"):
    df_silver = df_silver.withColumn(text_column, lower(trim(col(text_column))))

# Type standardisation: price is stored as a double.
df_silver = df_silver.withColumn("price", col("price").cast("double"))


print("âœ… Standardisation terminÃ©e. L'optimisation est gÃ©rÃ©e par le moteur Photon/Serverless.")

### Les 8 Checks Qualité + Sauvegarde Optimisée

In [0]:
from pyspark.sql.functions import count, when, countDistinct, col

# Denominator shared by all eight percentages below.
total_rows = df_silver.count()

# Every indicator divides by total_rows, so an empty dataset cannot be scored.
# Stop here rather than divide by zero and then overwrite the saved report and
# the Silver dataset with nothing.
if total_rows == 0:
    raise ValueError("df_silver is empty: quality checks and Silver write aborted.")


def _pct_of_rows(condition):
    """Return a Column holding the percentage of rows where `condition` is true."""
    # when() yields NULL for rows that do not match, and count() skips NULLs,
    # so the numerator is the number of matching rows.
    return (count(when(condition, True)) / total_rows) * 100


# 1. The eight quality indicators, computed in a single aggregation.
#    Note: chk_1 and chk_2 repeat conditions already enforced by the business
#    filter applied when df_cleaned was built.
quality_metrics = df_silver.select(
    _pct_of_rows(col("user_id").isNotNull()).alias("chk_1_user_complete"),
    _pct_of_rows(col("price") > 0).alias("chk_2_price_pos"),
    _pct_of_rows(col("event_type").isNotNull()).alias("chk_3_evt_present"),
    _pct_of_rows(col("event_time").isNotNull()).alias("chk_4_date_valid"),
    _pct_of_rows(col("brand").isNotNull()).alias("chk_5_brand_filled"),
    _pct_of_rows(col("category_code").isNotNull()).alias("chk_6_cat_filled"),
    # Ratio of distinct session ids to rows, as a percentage.
    ((countDistinct("user_session") / total_rows) * 100).alias("chk_7_unique_sessions"),
    # The pattern is anchored at BOTH ends: rlike() succeeds on a partial match,
    # so without the trailing `$` any value that merely starts with a hex digit
    # or a dash was counted as well-formed.
    _pct_of_rows(col("user_session").rlike("^[0-9a-fA-F-]+$")).alias("chk_8_session_format"),
)

print("ðŸ“Š RAPPORT DE QUALITÃ‰ (SILVER) :")
quality_metrics.show()

# 2. Save the quality report, replacing the one from any previous run.
quality_metrics.write.mode("overwrite").parquet(f"{PROJECT_ROOT}/reports/data_quality/silver_report.parquet")

# 3. Save the Silver dataset.
# OPTIMISATION 3: partition the output by 'event_type' so that readers filtering
# on the event type (view, cart, purchase) only scan the matching folder.
# NOTE(review): the project layout creates a `data/silver/main_clean` directory,
# but this writes to the sibling `main_clean.parquet` path - confirm which one
# downstream readers expect before changing it.
silver_path = f"{PROJECT_ROOT}/data/silver/main_clean.parquet"

df_silver.write.mode("overwrite") \
    .partitionBy("event_type") \
    .parquet(silver_path)

print(f"âœ… DonnÃ©es Silver sauvegardÃ©es en Parquet avec partitionnement dans : {silver_path}")