# 🛒 Retail Data Quality Pipeline  
 
- Génération de données synthétiques retail (magasins, produits, ventes, stocks)
- Envoi direct vers MinIO (bucket `retail-raw`)
- Validation avec Great Expectations
--- 
# Iceberg + Spark + Great Expectations + MinIO  
## 🔗 Stack :  
- ✅ Données synthétiques (Faker) (magasins, produits, ventes, stocks)
- ✅ Stockage MinIO (`s3://retail-raw/`)  
- ✅ Tables Iceberg (`retail.raw.*`)  
- ✅ Validation qualité (GX + Spark)  
- ✅ Dashboard qualité inline 

In [None]:
import os
import sys
import pandas as pd
import numpy as np
from faker import Faker
from faker.providers import BaseProvider
import random
from datetime import datetime, timedelta
import uuid
import boto3
from io import BytesIO
import pyarrow.parquet as pq
import pyarrow as pa


# MinIO configuration.
# Endpoint and credentials are read from the environment; the fallbacks are
# local-development defaults and should not be relied on elsewhere.
MINIO_ENDPOINT = os.getenv("S3_ENDPOINT_URL", "http://minio:9000")
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID", "minio")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", "minio123")
BUCKET_RAW = "retail-raw"
BUCKET_RETAIL_LAKEHOUSE = "retail-lakehouse"

s3_client = boto3.client(
    "s3",
    endpoint_url=MINIO_ENDPOINT,
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
)

# Create the buckets that do not exist yet.
for b in [BUCKET_RAW, BUCKET_RETAIL_LAKEHOUSE]:
    try:
        s3_client.head_bucket(Bucket=b)
    except s3_client.exceptions.ClientError as err:
        # Only a "not found" answer means the bucket is missing. Any other
        # service error (e.g. access denied) is re-raised instead of being
        # mistaken for a missing bucket.
        error_code = str(err.response.get("Error", {}).get("Code", ""))
        if error_code not in ("404", "NoSuchBucket", "NotFound"):
            raise
        s3_client.create_bucket(Bucket=b)
        print(f"üÜï Bucket '{b}' cr√©√©.")

# 🎭 Faker + Retail Provider

In [None]:
# French-locale Faker instance. Faker's generator and the stdlib `random`
# module are both seeded with 42.
fake = Faker("fr_FR")
Faker.seed(42)
random.seed(42)


class RetailProvider(BaseProvider):
    """Faker provider adding retail-specific generators (category, brand)."""

    # Category labels and their relative draw weights, index-aligned.
    _CATEGORIES = ["√âpicerie", "√âlectronique", "Textile", "Maison", "Beaut√©"]
    _CATEGORY_WEIGHTS = [30, 20, 20, 15, 15]

    # Brand pools per category; a category without an entry yields "Generic".
    _BRANDS_BY_CATEGORY = {
        "√âpicerie": ["Carrefour", "Monoprix", "BioCoop", "Lidl"],
        "√âlectronique": ["Apple", "Samsung", "Sony", "Logitech"],
        "Textile": ["Zara", "H&M", "Nike", "Adidas"],
    }

    def product_category(self):
        """Return one category label drawn with the configured weights."""
        (picked,) = random.choices(
            self._CATEGORIES, weights=self._CATEGORY_WEIGHTS
        )
        return picked

    def brand(self, category):
        """Return a random brand for *category*, or "Generic" if unlisted."""
        pool = self._BRANDS_BY_CATEGORY.get(category)
        if pool is None:
            pool = ["Generic"]
        return random.choice(pool)


fake.add_provider(RetailProvider)


# Data generation (kept small so it runs quickly)
print("üèóÔ∏è G√©n√©ration des donn√©es...")

# Stores: 10 rows with ids 1..10.
stores = []
for i in range(1, 11):
    # Draw the city once so the store name and its "city" column agree
    # (two separate draws produced names pointing at a different city).
    city = fake.city()
    stores.append({
        "store_id": i,
        "name": f"Magasin {city}",
        "city": city,
        "country": "France",
        "opening_date": fake.date_this_decade(),
        "surface_m2": random.randint(200, 5000),
    })
df_stores = pd.DataFrame(stores)

# Products: 100 rows with ids 1..100 (small on purpose, for speed).
products = []
for i in range(1, 101):
    cat = fake.product_category()
    name = fake.catch_phrase()[:30]  # names are capped at 30 characters
    brand = fake.brand(cat)
    cost_price = round(random.uniform(1, 100), 2)
    # The list price floor is derived from THIS product's cost (at least 2,
    # and at least 10% above cost). The floor previously used the cost of
    # the preceding product, so a list price could fall below its own cost.
    min_list_price = max(2, cost_price * 1.1)
    products.append({
        "product_id": i,
        "name": name,
        "category": cat,
        "brand": brand,
        "cost_price": cost_price,
        "list_price": round(random.uniform(min_list_price, 200), 2),
    })
df_products = pd.DataFrame(products)

# Employee generation
print("üë∑ G√©n√©ration des employ√©s...")

# 30 employees with ids 1..30, each assigned to a random store id in 1..10.
# Key order matters: it fixes the order in which random values are drawn.
employees = [
    {
        "employee_id": emp_id,
        "store_id": random.randint(1, 10),
        "first_name": fake.first_name(),
        "last_name": fake.last_name(),
        "hire_date": fake.date_between(start_date="-3y", end_date="today"),
        "job_title": random.choice(
            ["Vendeur", "Caissier", "Chef de rayon", "Responsable magasin"]
        ),
    }
    for emp_id in range(1, 31)
]
df_employees = pd.DataFrame(data=employees)

# %%
# Improved generation, meant to give dashboards something to show
print("üèóÔ∏è G√©n√©ration de donn√©es r√©alistes (7 jours, comportements m√©tier)...")

from datetime import timedelta

# 7 days of sales: 2025-11-28 (a Friday) through 2025-12-04.
base_date = datetime(2025, 11, 28).date()
sales = []

# date.weekday() values for Friday, Saturday and Sunday.
WEEKEND_WEEKDAYS = (4, 5, 6)

# Index employees by store once instead of filtering the full list per sale.
# Each per-store list keeps the original employee order.
employees_by_store = {}
for _employee in employees:
    employees_by_store.setdefault(_employee["store_id"], []).append(_employee)

for day_offset in range(7):
    sale_date = base_date + timedelta(days=day_offset)

    # Decide "weekend" from the calendar date. Offsets 4/5/6 from a Friday
    # are Tuesday..Thursday, so testing the offset boosted the wrong days.
    is_weekend = sale_date.weekday() in WEEKEND_WEEKDAYS

    # Weekend days get twice the volume of the other days.
    daily_volume = 300 if is_weekend else 150

    for _ in range(daily_volume):
        p = random.choice(products)
        store_id = random.randint(1, 10)
        eligible_emps = employees_by_store.get(store_id)
        # A store with no employee falls back to the first employee overall,
        # so that sale references an employee from another store.
        emp = random.choice(eligible_emps) if eligible_emps else employees[0]

        # Modelled behaviours:
        # - a discount on weekend days
        # - a narrower price band for one category
        # - occasional under-pricing by some job titles

        # 1. Base discount: 15% on weekend days, none otherwise.
        base_discount = 0.15 if is_weekend else 0.0

        # 2. Lower bound of the price ratio (relative to list price): high
        #    for the grocery category, lower for every other category.
        if p["category"] == "√âpicerie":
            cost_ratio = random.uniform(0.85, 0.95)
        else:
            cost_ratio = random.uniform(0.4, 0.7)

        # 3. "Vendeur"/"Caissier" sales have a 20% chance of being priced
        #    below that lower bound (intentional bad data).
        is_novice = emp["job_title"] in ["Vendeur", "Caissier"]
        if is_novice and random.random() < 0.2:
            price_ratio = random.uniform(cost_ratio * 0.7, cost_ratio)
        else:
            price_ratio = random.uniform(cost_ratio, 1.0)

        # Final unit price, and a quantity skewed towards single items.
        unit_price = round(p["list_price"] * (1 - base_discount) * price_ratio, 2)
        quantity = random.choices([1, 2, 3, 4], weights=[60, 25, 10, 5])[0]

        sales.append({
            "sale_id": len(sales) + 1,  # sequential ids starting at 1
            "store_id": store_id,
            "product_id": p["product_id"],
            "employee_id": emp["employee_id"],
            "quantity": quantity,
            "unit_price": unit_price,
            "sale_date": sale_date,
        })

df_sales = pd.DataFrame(sales)
print(f"‚úÖ {len(df_sales):,} ventes g√©n√©r√©es sur 7 jours (dont week-end boost√©).")

# Inventory (daily snapshots)

# Group sale dates by product once, so each snapshot only scans the sales of
# the product at hand instead of the whole sales list.
sale_dates_by_product = {}
for _sale in sales:
    sale_dates_by_product.setdefault(_sale["product_id"], []).append(_sale["sale_date"])

inventory = []
for day_offset in range(8):  # 8 snapshots: the day before base_date .. base_date + 6
    snapshot_date = base_date + timedelta(days=day_offset - 1)
    for p in products:
        # The base stock is drawn afresh for every (snapshot, product) pair.
        base_stock = random.randint(5, 50)
        # Number of sale rows (not units) for this product up to the snapshot.
        product_dates = sale_dates_by_product.get(p["product_id"], ())
        sold = sum(1 for sold_on in product_dates if sold_on <= snapshot_date)
        # Simplified depletion: half the sale count, never below zero.
        qty = max(0, base_stock - sold // 2)
        inventory.append({
            "inventory_id": str(uuid.uuid4()),
            "product_id": p["product_id"],
            "quantity": qty,
            "last_updated": snapshot_date,
        })

df_inventory = pd.DataFrame(inventory)

print(f"‚úÖ {len(df_stores)} magasins, {len(df_products)} produits, {len(df_sales)} ventes, {len(df_employees)} employ√©s g√©n√©r√©s.")


# Upload to MinIO
def upload_df_to_minio(df, key, bucket=None):
    """Serialize a DataFrame to Parquet in memory and store it as an S3 object.

    Args:
        df: pandas DataFrame to serialize.
        key: Object key to write inside the bucket.
        bucket: Destination bucket name. When omitted, ``BUCKET_RAW`` is used,
            which matches the previous fixed behaviour.
    """
    target_bucket = BUCKET_RAW if bucket is None else bucket
    buf = BytesIO()
    pq.write_table(pa.Table.from_pandas(df), buf)
    # getvalue() returns the whole buffer whatever the stream position is,
    # so no rewind is needed before reading it.
    s3_client.put_object(Bucket=target_bucket, Key=key, Body=buf.getvalue())
    print(f"üì§ {key} ‚Üí s3://{target_bucket}")

# Upload each generated table as one Parquet object, in a fixed order.
for object_key, frame in (
    ("stores.parquet", df_stores),
    ("products.parquet", df_products),
    ("sales.parquet", df_sales),
    ("employees.parquet", df_employees),
    ("inventory.parquet", df_inventory),
):
    upload_df_to_minio(frame, object_key)


# 🚀 Initialisation Spark + Iceberg

In [None]:
print("üîß D√©marrage de Spark avec Iceberg...")
from pyspark.sql import SparkSession

# Reuse an active session if there is one, otherwise start "RetailGX".
# NOTE(review): no Iceberg catalog or S3 settings are passed here; presumably
# they come from the environment's Spark defaults. Confirm.
session_builder = SparkSession.builder.appName("RetailGX")
spark = session_builder.getOrCreate()

print("‚úÖ Spark + Iceberg pr√™t.")

# Namespace creation and Iceberg ingestion
spark.sql("CREATE NAMESPACE IF NOT EXISTS retail.raw")
print("‚úÖ Namespace retail cr√©√©.")

# Each raw Parquet object becomes (or replaces) the Iceberg table of the
# same name under retail.raw. One loop replaces five copy-pasted stanzas;
# the table order and printed messages are unchanged.
for table_name in ("stores", "products", "sales", "employees", "inventory"):
    (
        spark.read.parquet(f"s3a://{BUCKET_RAW}/{table_name}.parquet")
        .writeTo(f"retail.raw.{table_name}")
        .using("iceberg")
        .createOrReplace()
    )
    print(f"‚úÖ Tables Iceberg retail.raw.{table_name} cr√©√©es.")

print("‚úÖ Tables Iceberg cr√©√©es.")

# 🎉 Fin

In [None]:

# Stop the Spark session, then print where the outputs can be found.
spark.stop()
closing_lines = (
    "\nüéâ Pipeline termin√© !",
    f"‚û°Ô∏è  Donn√©es brutes : s3://{BUCKET_RAW}/",
    "‚û°Ô∏è  Tables Iceberg : retail.raw.*",
    "‚û°Ô∏è  Dashboard Superset : connecte-toi √† Trino ‚Üí `retail.raw`",
)
for summary_line in closing_lines:
    print(summary_line)