# Ingestion de fichiers en tables Iceberg
Ce notebook parcourt récursivement un dossier, lit chaque fichier (CSV ou Parquet) et crée une table Iceberg dans un catalogue S3 (MinIO).

In [None]:
from pyspark.sql import SparkSession

# -----------------------------------------------------
# 1. Start the Spark session
# -----------------------------------------------------
# getOrCreate() returns the already-active session when there is one,
# otherwise it builds a new session named "IcebergNotebook".
# NOTE(review): no Iceberg catalog or S3/MinIO settings are passed here;
# presumably they come from the cluster's Spark defaults — confirm.
session_builder = SparkSession.builder.appName("IcebergNotebook")
spark = session_builder.getOrCreate()

# Keep the notebook output readable: only ERROR-level Spark logs are shown.
spark_context = spark.sparkContext
spark_context.setLogLevel("ERROR")

print("Spark session initialis√©e avec Iceberg et MinIO.")

In [None]:
# -----------------------------------------------------
# 2. Initialise the boto3 client for MinIO and list the source files
# -----------------------------------------------------

import boto3

# NOTE(review): the endpoint and credentials are hard-coded; they look like
# local development defaults — confirm they never target a shared deployment.
s3 = boto3.client(
    "s3",
    endpoint_url="http://minio:9000",
    aws_access_key_id="minio",
    aws_secret_access_key="minio123",
    region_name="eu-west-1",
)

bucket = "raw"
prefix = "unzipped/"

print(f"Listing des fichiers dans s3://{bucket}/{prefix}")

# A single list_objects_v2 call returns at most 1000 keys, so every page of
# the listing is walked to avoid silently dropping files.
paginator = s3.get_paginator("list_objects_v2")

files = []
found_objects = False
for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
    contents = page.get("Contents", [])
    if contents:
        found_objects = True
    # Keys ending in "/" are folder placeholders, not files.
    files.extend(
        obj["Key"] for obj in contents if not obj["Key"].endswith("/")
    )

# Stop the cell when nothing at all exists under the prefix.
if not found_objects:
    print("Aucun fichier trouv√©.")
    raise SystemExit

print("Fichiers trouv√©s :", files)


In [None]:
# -----------------------------------------------------
# 3. Ingest into Iceberg
# -----------------------------------------------------
import re

spark.sql('CREATE NAMESPACE IF NOT EXISTS lakehouse.raw').show()
print("Namespace Iceberg cr√©√©e : raw")

# Table written during this run -> object key it was loaded from. Used to
# report when two files resolve to the same table and the later one
# replaces the earlier one.
written_tables = {}

for key in files:
    filename = key.split("/")[-1]

    # Compare extensions case-insensitively so e.g. "DATA.CSV" is ingested.
    lowered = filename.lower()
    is_csv = lowered.endswith(".csv")

    if not (is_csv or lowered.endswith(".parquet")):
        print(f"Format non support√© : {filename}")
        continue

    # The table name is the text before the first dot. Every character
    # outside [0-9A-Za-z_] becomes "_" so the result is usable as an
    # unquoted identifier ("-" and " " map to "_" exactly as before).
    table_name = re.sub(r"[^0-9A-Za-z_]", "_", filename.split(".")[0])

    # A file name that starts with a dot (".csv") leaves nothing to name
    # the table with; skip it instead of failing on "lakehouse.raw.".
    if not table_name:
        print(f"Nom de table vide pour : {filename}")
        continue

    file_path = f"s3a://{bucket}/{key}"

    # Read with Spark; CSV files are read with their first line as header.
    if is_csv:
        df = spark.read.option("header", "true").csv(file_path)
    else:
        df = spark.read.parquet(file_path)

    full_table_name = f"lakehouse.raw.{table_name}"

    # createOrReplace() overwrites an existing table, so make the overwrite
    # visible when it is caused by another file from this same run.
    previous_key = written_tables.get(full_table_name)
    if previous_key is not None:
        print(
            f"Attention : {key} remplace la table {full_table_name} "
            f"issue de {previous_key}"
        )

    df.writeTo(full_table_name).createOrReplace()
    written_tables[full_table_name] = key

    print(f"Table Iceberg cr√©√©e : {full_table_name}")
print("üéØ Toutes les op√©rations ont √©t√© ex√©cut√©es.")