In [1]:
from pyspark.sql import SparkSession

def list_bronze_files():
    """List the files in the ``bronze`` bucket and save the names to disk.

    Starts a Spark session configured for an S3-compatible endpoint at
    ``http://127.0.0.1:9000``, loads ``s3a://bronze/`` with the ``binaryFile``
    source, and keeps only the last ``/``-separated component of each path.
    The names are printed to stdout and written, one per line, to
    ``bronze_files_list.txt`` in the current working directory.

    Errors are reported on stdout rather than raised: a failure while
    reading from the object store and a failure while writing the local
    file each produce their own message.

    Returns:
        None.
    """
    from pyspark.sql.functions import element_at, split

    output_path = "bronze_files_list.txt"

    # NOTE(review): endpoint and credentials are hard-coded; they look like
    # local-development MinIO defaults -- confirm before reusing elsewhere.
    spark = (
        SparkSession.builder
        .appName("List Bronze Files")
        .config(
            "spark.jars.packages",
            "org.apache.hadoop:hadoop-aws:3.4.1,"
            "com.amazonaws:aws-java-sdk-bundle:1.12.772",
        )
        .config("spark.hadoop.fs.s3a.endpoint", "http://127.0.0.1:9000")
        .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
        .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config(
            "spark.hadoop.fs.s3a.impl",
            "org.apache.hadoop.fs.s3a.S3AFileSystem",
        )
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
        .getOrCreate()
    )

    # Set log level to WARN to reduce noise
    spark.sparkContext.setLogLevel("WARN")

    print("Listing files in bronze layer...")

    # Only the Spark work lives in this try block, so the "reading from
    # MinIO" message is never printed for an unrelated local failure.
    try:
        df = spark.read.format("binaryFile").load("s3a://bronze/")
        df_files = df.select(
            element_at(split(df["path"], "/"), -1).alias("filename")
        )
        filenames = [row["filename"] for row in df_files.collect()]
    except Exception as e:
        print(f"Error reading from MinIO: {e}")
        return
    finally:
        # The names are plain Python strings by now; Spark is no longer
        # needed, so release it whether or not the read succeeded.
        spark.stop()

    print(f"\nFound {len(filenames)} file(s) in bronze layer:\n")
    for name in filenames:
        print(f"  - {name}")

    # Also write to a file. The encoding is explicit so that object names
    # containing non-ASCII characters do not depend on the platform locale.
    try:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(f"Files in bronze layer ({len(filenames)} total):\n\n")
            f.writelines(f"{name}\n" for name in filenames)
    except OSError as e:
        print(f"Error writing {output_path}: {e}")
        return

    print("\nFile list also saved to bronze_files_list.txt")

# Run the listing only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    list_bronze_files()


Listing files in bronze layer...

Found 1 file(s) in bronze layer:

  - employees.json

File list also saved to bronze_files_list.txt


NameError: name 'spark' is not defined