## 1. Ingestion (Bronze)

In [1]:
# import
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from mysql.connector import errorcode
from tools.database import DatabaseManager
from datetime import datetime


In [2]:
start_time = datetime.now

# --- CONFIGURATION SYST√àME (Obligatoire pour Windows) ---
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# D√©sactivation temporaire du recyclage des workers (√©vite les erreurs de sockets sur Windows)
os.environ['PYSPARK_PYTHON_WORKER_REUSE'] = "0"

# --- PR√âPARATION MYSQL ---
db_tools = DatabaseManager(user="root", password="root")
db_tools.setup_database("openfood_db")
jdbc_url, connection_props = db_tools.get_jdbc_params("openfood_db")

üóëÔ∏è Ancienne base 'openfood_db' supprim√©e.
‚úÖ Base de donn√©es 'openfood_db' cr√©√©e √† neuf.
‚úÖ Table 'silver_products' pr√™te dans 'openfood_db'.


In [3]:
# --- INITIALISATION SPARK ---
# --- CHEMINS ---
jdbc_jar = r".\driver\mysql-connector-j-8.0.33\mysql-connector-j-8.0.33.jar"

# --- INITIALISATION SESSION ---
spark = SparkSession.builder \
    .appName("OpenFoodFacts_ETL") \
    .master("local[1]") \
    .config("spark.driver.host", "127.0.0.1") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .config("spark.python.worker.timeout", "120") \
    .config("spark.jars", jdbc_jar) \
    .config("spark.driver.extraClassPath", jdbc_jar) \
    .config("spark.executor.extraClassPath", jdbc_jar) \
    .getOrCreate()

print("Session Spark cr√©√©e avec succ√®s !")

# --- TEST DE DIAGNOSTIC ---
try:
    print("Test de communication Python-Java...")
    spark.createDataFrame([(1, "test")], ["id", "val"]).collect()
    print("‚úÖ Communication OK !")
except Exception as e:
    print("‚ùå √âchec de communication interne :", e)

Session Spark cr√©√©e avec succ√®s !
Test de communication Python-Java...
‚úÖ Communication OK !


In [4]:
csv_path = "./data/donnees_echantillon.csv"

bronze_df = spark.read \
    .option("header", "true") \
    .option("sep", "\t") \
    .option("quote", '"') \
    .option("escape", '"') \
    .option("multiLine", "true") \
    .option("mode", "PERMISSIVE") \
    .csv(csv_path)

bronze_df.printSchema()
print(bronze_df.head())

root
 |-- code: string (nullable = true)
 |-- url: string (nullable = true)
 |-- creator: string (nullable = true)
 |-- created_t: string (nullable = true)
 |-- created_datetime: string (nullable = true)
 |-- last_modified_t: string (nullable = true)
 |-- last_modified_datetime: string (nullable = true)
 |-- last_modified_by: string (nullable = true)
 |-- last_updated_t: string (nullable = true)
 |-- last_updated_datetime: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- abbreviated_product_name: string (nullable = true)
 |-- generic_name: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- packaging: string (nullable = true)
 |-- packaging_tags: string (nullable = true)
 |-- packaging_en: string (nullable = true)
 |-- packaging_text: string (nullable = true)
 |-- brands: string (nullable = true)
 |-- brands_tags: string (nullable = true)
 |-- brands_en: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- categories_tags: s

In [5]:
bronze_df.show(10)

+-----------+--------------------+---------------+----------+--------------------+---------------+----------------------+-----------------+--------------+---------------------+--------------------+------------------------+------------+--------+---------+--------------+------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+------------+----------+--------------------+-------------------------+--------------------+--------------------+--------------------+---------+--------------+------------------------+------+-----------+---------------+---------------+----------------+--------------------+---------------+--------------------+--------------------+-------------------------+---------+------------+------+-----------+---------+----------------+----------------+-----------------+-----------+---------+--------------------+--------------------+----------------+----------------+----------

In [6]:
print("=== Nombre de lignes ===")
print(bronze_df.count())

=== Nombre de lignes ===
418676


In [7]:
print("=== Nombre de colonnes ===")
print(len(bronze_df.columns))

=== Nombre de colonnes ===
215


In [8]:
print("=== Sch√©ma inf√©r√© ===")
bronze_df.printSchema()

=== Sch√©ma inf√©r√© ===
root
 |-- code: string (nullable = true)
 |-- url: string (nullable = true)
 |-- creator: string (nullable = true)
 |-- created_t: string (nullable = true)
 |-- created_datetime: string (nullable = true)
 |-- last_modified_t: string (nullable = true)
 |-- last_modified_datetime: string (nullable = true)
 |-- last_modified_by: string (nullable = true)
 |-- last_updated_t: string (nullable = true)
 |-- last_updated_datetime: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- abbreviated_product_name: string (nullable = true)
 |-- generic_name: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- packaging: string (nullable = true)
 |-- packaging_tags: string (nullable = true)
 |-- packaging_en: string (nullable = true)
 |-- packaging_text: string (nullable = true)
 |-- brands: string (nullable = true)
 |-- brands_tags: string (nullable = true)
 |-- brands_en: string (nullable = true)
 |-- categories: string (nullable = true

## 2. Nettoyage & qualit√© (Silver)

In [9]:
# --- 1. LISTING DES COLONNES NECESSAIRE ---
cols_needed = [
    "code", "product_name", "brands", "main_category","categories_en", "countries_en",
    "last_modified_t", "nutriscore_grade", 
    "energy-kcal_100g", "fat_100g", "saturated-fat_100g", "sugars_100g", 
    "salt_100g", "proteins_100g", "fiber_100g", "sodium_100g", "completeness"
]

# On filtre les colonnes pour ne prendre que celles qui existent r√©ellement dans le fichier source
existing_cols = [c for c in cols_needed if c in bronze_df.columns]
silver_df = bronze_df.select(*existing_cols)

# --- 2. RENOMMAGE (Suppression des tirets pour MySQL) ---
for c in silver_df.columns:
    if "-" in c:
        silver_df = silver_df.withColumnRenamed(c, c.replace("-", "_"))

# --- VERIFICATION ---
print(f"=== Nombre de colonnes : {len(silver_df.columns)} ===")
print(f"=== Nombre de lignes : {silver_df.count()} ===")
print(silver_df.columns)
silver_df.select("main_category").show(20)

=== Nombre de colonnes : 17 ===
=== Nombre de lignes : 418676 ===
['code', 'product_name', 'brands', 'main_category', 'categories_en', 'countries_en', 'last_modified_t', 'nutriscore_grade', 'energy_kcal_100g', 'fat_100g', 'saturated_fat_100g', 'sugars_100g', 'salt_100g', 'proteins_100g', 'fiber_100g', 'sodium_100g', 'completeness']
+--------------------+
|       main_category|
+--------------------+
|en:fruit-and-vege...|
|                null|
|        en:beverages|
|                null|
|                null|
|         en:biscuits|
|                null|
|        en:undefined|
|                null|
|           en:breads|
|                null|
|            en:sodas|
|                null|
|                null|
|                null|
|en:plant-based-be...|
|en:fruits-and-veg...|
|                null|
|   en:salmon-fillets|
|                null|
+--------------------+
only showing top 20 rows



In [10]:
from pyspark.sql.functions import col, regexp_replace, lower, trim, udf
import unicodedata
from pyspark.sql.types import StringType

# Liste des colonnes √† ne PAS toucher
cols_to_exclude = ["countries_en", "main_category", "categories_en"]

# 1. Pr√©paration de l'UDF pour la normalisation ASCII
clean_ascii_udf = udf(
    lambda text: 
        unicodedata.normalize('NFD', unicodedata.normalize('NFKC', text))
        .encode('ascii', 'ignore')
        .decode('utf-8') if text is not None else None, 
    StringType()
)


# 2. Identification des colonnes texte dans silver_df
string_cols = [c for c, t in silver_df.dtypes if t == "string" and c not in cols_to_exclude]

# 3. Application du bloc de nettoyage complet
for col_name in string_cols:
    silver_df = silver_df.withColumn(col_name, 
        # Suppression des espaces multiples et mise en forme finale
        regexp_replace(
            lower(
                trim(
                    # Filtre alphanum√©rique (garde lettres, chiffres et espaces)
                    regexp_replace(
                        # Normalisation ASCII (via l'UDF)
                        clean_ascii_udf(col(col_name)), 
                        "[^a-zA-Z0-9 ]", ""
                    )
                )
            ), 
            "\\s+", " "
        )
    )

silver_df.show(50)

+-----------+--------------------+--------------------+--------------------+--------------------+---------------+---------------+----------------+----------------+----------------+------------------+--------------+---------------+---------------+---------------+----------------+------------+
|       code|        product_name|              brands|       main_category|       categories_en|   countries_en|last_modified_t|nutriscore_grade|energy_kcal_100g|        fat_100g|saturated_fat_100g|   sugars_100g|      salt_100g|  proteins_100g|     fiber_100g|     sodium_100g|completeness|
+-----------+--------------------+--------------------+--------------------+--------------------+---------------+---------------+----------------+----------------+----------------+------------------+--------------+---------------+---------------+---------------+----------------+------------+
|11110022844|kroger squeezable...|              kroger|en:fruit-and-vege...|Plant-based foods...|  United States|     158

In [None]:
silver_df = silver_df.withColumn(
    "countries_en",
    split(col("countries_en"), ",\s*")
)

In [12]:
silver_df = silver_df.withColumn("main_category", 
    regexp_replace(
        split(col("main_category"), "-").getItem(0), 
        "^[a-z]{2}:", ""
    )
)

cols = ["countries_en", "main_category", "categories_en"]

for col_name in cols:
    silver_df = silver_df.withColumn(col_name,
        clean_ascii_udf(lower(trim(col(col_name)))))

In [13]:
# --- TRAITEMENT DES COLONNES ---
from pyspark.sql.functions import col, when, lower, lit, coalesce, substring

# --- CONFIGURATION ---
text_columns = [
    ("categories_en", "categories", "non classe"),
    ("main_category", None, "non classe"),
    ("brands", None, "marque inconnue"),
    ("countries_en", None, "pays inconue"),
    ("nutriscore_grade", None, "non classe"),
]

invalid_vals = ["undefined", "null", "unknown", "none", "n/a", ""]

# --- TRAITEMENT ---
for main_col, fallback_col, default_val in text_columns:
    if main_col in silver_df.columns:
        
        # 1. D√©finition de la condition de validit√© (r√©utilisable)
        def get_valid_col(c):
            return when((col(c).isNotNull()) & (~lower(col(c)).isin(invalid_vals)), col(c))

        # 2. Logique de remplacement (Coalesce g√®re la priorit√© : Main > Fallback > Default)
        if fallback_col and fallback_col in silver_df.columns:
            silver_df = silver_df.withColumn(
                main_col,
                coalesce(get_valid_col(main_col), get_valid_col(fallback_col), lit(default_val))
            ).drop(fallback_col)
        else:
            silver_df = silver_df.withColumn(
                main_col,
                coalesce(get_valid_col(main_col), lit(default_val))
            )

        # 3. Troncature √† 255 caract√®res
        silver_df = silver_df.withColumn(main_col, substring(col(main_col), 1, 255))

In [14]:
# --- 4. D√âDUPLICATION FINALE ---
if "code" in silver_df.columns:
    # On trie par date de modification d√©croissante (la plus r√©cente en premier)
    # Puis on supprime les doublons bas√©s sur le code-barres
    silver_df = silver_df.orderBy(col("last_modified_t").desc()) \
                         .dropDuplicates(["code"])

print("‚úÖ Nettoyage termin√© : cat√©gories simplifi√©es et donn√©es normalis√©es.")

‚úÖ Nettoyage termin√© : cat√©gories simplifi√©es et donn√©es normalis√©es.


In [15]:
# On d√©finit des seuils biologiques/physiques pour chaque nutriment.
# Si une valeur d√©passe ces bornes (ex: > 100g de sucre pour 100g de produit), 
# on la remplace par NULL pour ne pas fausser les moyennes statistiques plus tard.

# 1. Filtrage par seuils
nutrient_bounds = {
    "energy_kcal_100g": (0, 1000),
    "fat_100g": (0, 100),
    "saturated_fat_100g": (0, 100),
    "sugars_100g": (0, 100),
    "salt_100g": (0, 100),
    "proteins_100g": (0, 100),
    "fiber_100g": (0, 100),
    "sodium_100g": (0, 40),
    "completeness": (0, 1)
}

for col_name, (min_val, max_val) in nutrient_bounds.items():
    if col_name in silver_df.columns:
         silver_df = silver_df.withColumn(
            col_name,
            when((col(col_name) >= min_val) & (col(col_name) <= max_val), col(col_name))
            .otherwise(None)
         )

# 2. Calcul des colonnes d'estimation
if "sodium_100g" in silver_df.columns:
    silver_df = silver_df.withColumn("salt_est", col("sodium_100g") * 2.5)

if "salt_100g" in silver_df.columns:
    silver_df = silver_df.withColumn("sodium_est", col("salt_100g") / 2.5)

# 3. Remplissage des donn√©es manquantes
if "salt_100g" in silver_df.columns and "salt_est" in silver_df.columns:
    silver_df = silver_df.withColumn("salt_100g", coalesce(col("salt_100g"), col("salt_est")))

if "sodium_100g" in silver_df.columns and "sodium_est" in silver_df.columns:
    silver_df = silver_df.withColumn("sodium_100g", coalesce(col("sodium_100g"), col("sodium_est")))

# 4. Conversion kcal ‚Üí kJ
if "energy_kcal_100g" in silver_df.columns:
    silver_df = silver_df.withColumn("energy_kj_100g", col("energy_kcal_100g") * 4.184)

# 5. Formatage final (Arrondi)
# On ajoute energy_kj_100g √† la liste des colonnes √† arrondir
cols_to_round = list(nutrient_bounds.keys()) + ["energy_kj_100g"]
for col_name in cols_to_round: 
    if col_name in silver_df.columns: 
        silver_df = silver_df.withColumn(col_name, round(col(col_name), 1))

# 6. Nettoyage
silver_df = silver_df.drop("salt_est", "sodium_est")

silver_df.show(5)

+-------------+--------------------+------------------+-------------+--------------------+-------------+---------------+----------------+----------------+--------+------------------+-----------+---------+-------------+----------+-----------+------------+--------------+
|         code|        product_name|            brands|main_category|       categories_en| countries_en|last_modified_t|nutriscore_grade|energy_kcal_100g|fat_100g|saturated_fat_100g|sugars_100g|salt_100g|proteins_100g|fiber_100g|sodium_100g|completeness|energy_kj_100g|
+-------------+--------------------+------------------+-------------+--------------------+-------------+---------------+----------------+----------------+--------+------------------+-----------+---------+-------------+----------+-----------+------------+--------------+
|0891039000808|        meatball sub| lunch box kitchen|   non classe|          non classe|united states|     1750104771|      non classe|            null|    null|              null|       n

In [16]:
# convertion du unix en timestamp et on garde uniquement que la date

if "last_modified_t" in silver_df.columns:
    silver_df = silver_df.withColumn(
        "last_modified_ts",
        to_timestamp(col("last_modified_t").cast("double"))
    ).withColumn(
        "last_modified_date",
        to_date(col("last_modified_ts"))
    )

silver_df.select("last_modified_date").show(50, truncate=False)

+------------------+
|last_modified_date|
+------------------+
|2025-10-09        |
|2025-06-16        |
|2020-04-22        |
|2020-04-22        |
|2020-04-22        |
|2025-10-02        |
|2020-04-22        |
|2024-06-19        |
|2024-06-19        |
|2023-05-24        |
|2025-05-20        |
|2025-05-28        |
|2022-12-21        |
|2025-06-10        |
|2025-05-20        |
|2025-10-10        |
|2025-05-31        |
|2022-03-04        |
|2024-09-14        |
|2025-06-01        |
|2020-04-22        |
|2024-08-12        |
|2023-08-28        |
|2023-08-28        |
|2024-11-08        |
|2020-04-23        |
|2020-04-23        |
|2023-04-28        |
|2023-04-10        |
|2025-09-09        |
|2023-01-01        |
|2024-11-12        |
|2025-08-14        |
|2025-05-28        |
|2023-01-05        |
|2022-10-05        |
|2020-04-22        |
|2025-11-21        |
|2022-04-23        |
|2021-04-26        |
|2020-04-23        |
|2020-04-23        |
|2024-06-19        |
|2024-06-19        |
|2024-06-19  

In [17]:
# 1. On supprime les doublons
w = Window.partitionBy("code").orderBy(col("last_modified_t").cast("long").desc())

silver_dedup = silver_df.withColumn("rn", row_number().over(w)) \
    .filter(col("rn") == 1) \
    .drop("rn")

# 2. On enl√®ve les NULL et les cha√Ænes de caract√®res vides
silver_final = silver_dedup.filter(
    (col("code").isNotNull()) & 
    (col("code") != "") & 
    (col("code") != "null")
)

# 3. Petit rapport de qualit√©
print(f"Lignes avant nettoyage : {silver_dedup.count()}")
print(f"Lignes apr√®s nettoyage (code valide) : {silver_final.count()}")

Lignes avant nettoyage : 418651
Lignes apr√®s nettoyage (code valide) : 418651


In [18]:
silver_final.show(200)

+-------------+--------------------+--------------------+---------------+--------------------+--------------------+---------------+----------------+----------------+--------+------------------+-----------+---------+-------------+----------+-----------+------------+--------------+-------------------+------------------+
|         code|        product_name|              brands|  main_category|       categories_en|        countries_en|last_modified_t|nutriscore_grade|energy_kcal_100g|fat_100g|saturated_fat_100g|sugars_100g|salt_100g|proteins_100g|fiber_100g|sodium_100g|completeness|energy_kj_100g|   last_modified_ts|last_modified_date|
+-------------+--------------------+--------------------+---------------+--------------------+--------------------+---------------+----------------+----------------+--------+------------------+-----------+---------+-------------+----------+-----------+------------+--------------+-------------------+------------------+
|0891039000808|        meatball sub|   l

In [19]:
# Initialisation (la connexion self.conn se cr√©e ici)
db_tools = DatabaseManager(user="root", password="root")

# 2. Param√®tres pour Spark
jdbc_url, connection_props = db_tools.get_jdbc_params("openfood_db")

# 3. √âcriture (via Spark JDBC)
try:
    # On force le mode "overwrite" pour √©craser la table si elle existe d√©j√†
    silver_final.write.jdbc(
        url=jdbc_url,
        table="silver_products",
        mode="overwrite", 
        properties=connection_props
    )
    print("‚úÖ Transfert Spark vers MySQL r√©ussi !")
except Exception as e:
    print(f"‚ùå Erreur Spark JDBC : {e}")

‚úÖ Transfert Spark vers MySQL r√©ussi !


## 3. Dimensions (Gold)

In [20]:
# --- DIM_TIME ---
df_dim_time = silver_final.select("last_modified_t").distinct() \
    .withColumn("ts_date", from_unixtime(col("last_modified_t")).cast("date")) \
    .select(
        col("last_modified_t").alias("time_sk"),
        col("ts_date").alias("date"),
        year("ts_date").alias("year"),
        month("ts_date").alias("month"),
        dayofmonth("ts_date").alias("day"),
        weekofyear("ts_date").alias("week"),
        weekofyear("ts_date").alias("iso_week")
    )

# --- DIM_BRAND ---
df_dim_brand = silver_final.select(col("brands").alias("brand_name")) \
    .filter(col("brand_name").isNotNull() & (col("brand_name") != "")) \
    .distinct() \
    .withColumn("brand_name", substring(col("brand_name"), 1, 500))

# --- DIM_CATEGORY ---
df_dim_category = silver_final.select(
    substring(lower(trim(col("categories_en"))), 1, 500).alias("category_name"),
    substring(lower(trim(col("main_category"))), 1, 500).alias("parent_category_sk")
) \
.filter(col("category_name").isNotNull() & (col("category_name") != "")) \
.dropDuplicates(["category_name"]) # <--- Unicit√© garantie pour MySQL

# --- DIM_COUNTRY ---
df_dim_country = silver_final.select(
    col("countries_en").alias("countries_name")) \
.filter(col("countries_name").isNotNull())

In [21]:
# 1. Configuration JDBC
jdbc_url, connection_props = db_tools.get_jdbc_params("openfood_db")

try:
    # --- NETTOYAGE ---
    print("üßπ Truncate des tables Gold...")
    # On utilise une connexion directe pour le truncate
    conn = db_tools._get_connection("openfood_db")
    cursor = conn.cursor()
    cursor.execute("SET FOREIGN_KEY_CHECKS = 0;")
    tables = ["dim_time", "dim_brand", "dim_category", "dim_country"]
    for t in tables: cursor.execute(f"TRUNCATE TABLE {t};")
    cursor.execute("SET FOREIGN_KEY_CHECKS = 1;")
    conn.commit()
    cursor.close()
    
    # --- ETAPE 1 : Dimensions Ind√©pendantes ---
    print("üöÄ Insertion des dimensions de base (Time, Brand, Category, Country)...")
    df_dim_time.write.jdbc(url=jdbc_url, table="dim_time", mode="append", properties=connection_props)
    df_dim_brand.write.jdbc(url=jdbc_url, table="dim_brand", mode="append", properties=connection_props)
    df_dim_category.write.jdbc(url=jdbc_url, table="dim_category", mode="append", properties=connection_props)
    df_dim_country.write.jdbc(url=jdbc_url, table="dim_country", mode="append", properties=connection_props)

except Exception as e:
    print(f"‚ùå Erreur critique lors de l'alimentation Gold : {e}")

üßπ Truncate des tables Gold...
üöÄ Insertion des dimensions de base (Time, Brand, Category, Country)...
‚ùå Erreur critique lors de l'alimentation Gold : An error occurred while calling o751.jdbc.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 101.0 failed 1 times, most recent failure: Lost task 0.0 in stage 101.0 (TID 58) (pc_portable_val executor driver): java.sql.BatchUpdateException: Data truncation: Invalid JSON text: "Invalid value." at position 0 in value for column 'dim_country.countries_name'.
	at com.mysql.cj.jdbc.exceptions.SQLError.createBatchUpdateException(SQLError.java:224)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchSerially(ClientPreparedStatement.java:816)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchInternal(ClientPreparedStatement.java:418)
	at com.mysql.cj.jdbc.StatementImpl.executeBatch(StatementImpl.java:795)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.s

In [22]:
jdbc_url, connection_props = db_tools.get_jdbc_params("openfood_db")
df_mysql_brand = spark.read.jdbc(url=jdbc_url, table="dim_brand", properties=connection_props) 
df_mysql_category = spark.read.jdbc(url=jdbc_url, table="dim_category", properties=connection_props) 

# --- 2. Construction de DIM_PRODUCT ---
df_dim_product = silver_final.select(
    "code", 
    "product_name", 
    "brands", 
    "main_category", 
    col("countries_en").alias("countries_multi_name") # Retrait du 's' ici
)

# On r√©cup√®re les SK des marques
df_dim_product = df_dim_product.join(
    df_mysql_brand.select("brand_sk", "brand_name"),
    df_dim_product.brands == df_mysql_brand.brand_name,
    "left"
)

# On r√©cup√®re les SK des cat√©gories
df_dim_product = df_dim_product.join(
    df_mysql_category.select("category_sk", "category_name"),
    df_dim_product.main_category == df_mysql_category.category_name,
    "left"
)

# S√©lection finale
df_dim_product_final = df_dim_product.select(
    col("code"),
    col("product_name"),
    col("brand_sk"),
    col("category_sk").alias("primary_category_sk"),
    col("countries_multi_name") # Plus besoin d'alias ici, le nom est d√©j√† bon
)

In [23]:
jdbc_url, connection_props = db_tools.get_jdbc_params("openfood_db")

try:
    # --- NETTOYAGE ---
    print("üßπ Truncate des tables Gold...")
    # On utilise une connexion directe pour le truncate
    conn = db_tools._get_connection("openfood_db")
    cursor = conn.cursor()
    cursor.execute("SET FOREIGN_KEY_CHECKS = 0;")
    tables = ["dim_product"]
    for t in tables: cursor.execute(f"TRUNCATE TABLE {t};")
    cursor.execute("SET FOREIGN_KEY_CHECKS = 1;")
    conn.commit()
    cursor.close()
    
    # --- ETAPE 1 : Dimensions Ind√©pendantes ---
    print("üöÄ Insertion de la dimension de Product")
    df_dim_product_final.write.jdbc(url=jdbc_url, table="dim_product", mode="append", properties=connection_props)

except Exception as e:
    print(f"‚ùå Erreur critique lors de l'alimentation Gold : {e}")

üßπ Truncate des tables Gold...
üöÄ Insertion de la dimension de Product
‚ùå Erreur critique lors de l'alimentation Gold : An error occurred while calling o803.jdbc.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 114.0 failed 1 times, most recent failure: Lost task 0.0 in stage 114.0 (TID 66) (pc_portable_val executor driver): java.sql.BatchUpdateException: Data truncation: Invalid JSON text: "Invalid value." at position 0 in value for column 'dim_product.countries_multi_name'.
	at com.mysql.cj.jdbc.exceptions.SQLError.createBatchUpdateException(SQLError.java:224)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchSerially(ClientPreparedStatement.java:816)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeBatchInternal(ClientPreparedStatement.java:418)
	at com.mysql.cj.jdbc.StatementImpl.executeBatch(StatementImpl.java:795)
	at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.savePartition(JdbcUtils.scala:735)
	at org.apache.s

In [24]:
# On recharge la table dim_product depuis MySQL pour avoir les product_sk
df_mysql_product = spark.read.jdbc(url=jdbc_url, table="dim_product", properties=connection_props) 

# --- 3. Construction de FACT_NUTRITION_SNAPSHOT ---
df_fact = silver_final.select(
    "code", "last_modified_t", "energy_kcal_100g", "fat_100g", "saturated_fat_100g",
    "sugars_100g", "salt_100g", "proteins_100g", "fiber_100g", "sodium_100g",
    "nutriscore_grade", "completeness"
)

df_fact_final = df_fact.join(
    df_mysql_product.select("product_sk", "code"),
    "code",
    "inner"
).select(
    col("product_sk"),
    col("last_modified_t").alias("time_sk"),
    "energy_kcal_100g", "fat_100g", "saturated_fat_100g", "sugars_100g", 
    "salt_100g", "proteins_100g", "fiber_100g", "sodium_100g", 
    "nutriscore_grade", 
    col("completeness").alias("completeness_score")
)

In [None]:
jdbc_url, connection_props = db_tools.get_jdbc_params("openfood_db")

try:
    # --- NETTOYAGE ---
    print("üßπ Truncate des tables Gold...")
    # On utilise une connexion directe pour le truncate
    conn = db_tools._get_connection("openfood_db")
    cursor = conn.cursor()
    cursor.execute("SET FOREIGN_KEY_CHECKS = 0;")
    tables = ["fact_nutrition_snapshot"]
    for t in tables: cursor.execute(f"TRUNCATE TABLE {t};")
    cursor.execute("SET FOREIGN_KEY_CHECKS = 1;")
    conn.commit()
    cursor.close()
    
    # --- ETAPE 1 : Dimensions Ind√©pendantes ---
    print("üöÄ Insertion de la table de fait")
    df_fact_final.write.jdbc(url=jdbc_url, table="fact_nutrition_snapshot", mode="append", properties=connection_props)

    print("‚ú® ARCHITECTURE GOLD TERMINEE AVEC SUCC√àS !")

    end_time = datetime.now()

except Exception as e:
    print(f"‚ùå Erreur critique lors de l'alimentation Gold : {e}")

üßπ Truncate des tables Gold...
üöÄ Insertion de la table de fait


In [None]:
metrics = {
    "source": "OpenFoodFacts CSV",
    "timestamp": datetime.now().isoformat(),
    "duree_minutes": (end_time - start_time).seconds / 60,
    "nombre_lignes_initial": bronze_df.count(),
    "nombre_lignes_traitees": silver_final.count(),
    "nombre_lignes_rejetees": bronze_df.count() - silver_final.count(),
    "taux_completude_moyen": df_fact.agg({"completeness_score": "avg"}).first()[0],
    "pct_nutriscore": silver_final.filter(col("nutriscore_grade").isNotNull()).count() / silver_final.count(),
    "nb_sugars_anomalies": silver_final.filter(col("sugars_100g") > 100).count(),
    "status": "SUCCESS"
}

In [None]:
import json
import os
from datetime import datetime

output_dir = "./metrics"
os.makedirs(output_dir, exist_ok=True)

# file name with timestamp
run_ts = datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f"{output_dir}/metrics_{run_ts}.json"

# write JSON
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(metrics, f, ensure_ascii=False, indent=2)

print(f"Metrics saved in : {output_path}")

In [None]:
spark.stop()