In [1]:
from pyspark.sql import SparkSession

def check_bronze_schema_safe():
    spark = SparkSession.builder \
        .appName("Check Bronze Schema Debug") \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.4.1,com.amazonaws:aws-java-sdk-bundle:1.12.772") \
        .config("spark.hadoop.fs.s3a.endpoint", "http://127.0.0.1:9000") \
        .config("spark.hadoop.fs.s3a.access.key", "minioadmin") \
        .config("spark.hadoop.fs.s3a.secret.key", "minioadmin") \
        .config("spark.hadoop.fs.s3a.path.style.access", "true") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .getOrCreate()

    spark.sparkContext.setLogLevel("WARN")

    path = "s3a://bronze/"
    print("Attempting to read JSON with multiline=true...")

    try:
        # FIX: Added multiline=true
        df = spark.read \
            .option("inferSchema", "true") \
            .option("multiline", "true") \
            .json(path)
            
        print("\n--- Schema Structure ---")
        df.printSchema()

        print("\n--- Data Preview ---")
        df.show(5, truncate=False)

    except Exception as e:
        print(f"\nError reading JSON: {e}")
        print("\n--- DEBUGGING: RAW FILE CONTENT ---")
        # If JSON fails, let's read it as simple text to see what the file actually looks like
        try:
            df_text = spark.read.text(path)
            print("Here is what the raw file looks like (first 10 lines):")
            df_text.show(10, truncate=False)
        except Exception as text_e:
            print(f"Could not read as text either: {text_e}")

    finally:
        spark.stop()

if __name__ == "__main__":
    check_bronze_schema_safe()

Attempting to read JSON with multiline=true...

--- Schema Structure ---
root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- employee_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)


--- Data Preview ---
+-----------------------------+---+----------------+-----------+-------------------+---------------------------------+
|address                      |age|department      |employee_id|name               |skills                           |
+-----------------------------+---+----------------+-----------+-------------------+---------------------------------+
|{Kolkata, India, West Bengal}|28 |Data Engineering|101        |Somnath Chakraborty|[Python, Azure, PySpark]         |
|{Bangalore, India, 

#STOP_THE SESSION

In [2]:
# The print statement is indented once (4 spaces)
try:
    spark.stop()
    print("✅ SparkSession successfully stopped.")
except NameError:
    print("⚠️ Error: 'spark' object not found. Ensure your SparkSession was initialized.")
# ...

⚠️ Error: 'spark' object not found. Ensure your SparkSession was initialized.


In [4]:
from pyspark.sql import SparkSession
import os

def check_version():
    """
    Initializes a basic Spark session and prints the Spark version 
    to help diagnose Scala version compatibility issues with Delta Lake.
    """
    print("--- Initializing Basic Spark Session ---")

    try:
        # Create a SparkSession without any specific external packages
        spark = SparkSession.builder \
            .appName("VersionChecker") \
            .getOrCreate()
        
        # Set log level to reduce console clutter
        spark.sparkContext.setLogLevel("ERROR")

        # 1. Print the official Spark version
        spark_version = spark.version
        print(f"\n✅ Detected Spark Version: {spark_version}")
        
        # 2. Infer the likely Scala compatibility
        # Spark 3.3 and older often use Scala 2.12. Spark 3.4+ often uses Scala 2.12 or 2.13.
        print("\n--- Compatibility Guidance ---")
        if spark_version.startswith("3.4") or spark_version.startswith("3.5"):
            print("Spark 3.4/3.5 can be built with Scala 2.12 or 2.13. You likely need `_2.12` or `_2.13` in your Delta package name.")
        elif spark_version.startswith("3."):
             print("Spark 3.x (older than 3.4) is typically built with Scala 2.12. Try `io.delta:delta-spark_2.12:2.4.0`.")
        else:
             print("Please check the official Delta Lake documentation for the package corresponding to this specific Spark version.")

        print("\n--- Full Build Info (may contain Scala version) ---")
        # This often reveals the specific build details including Scala version
        print(spark.sparkContext.getConf().get('spark.executor.extraJavaOptions'))
        
        spark.stop()

    except Exception as e:
        print(f"\n❌ Failed to initialize Spark or get version info: {e}")
        print("Ensure PySpark is correctly installed and configured.")

if __name__ == "__main__":
    check_version()

--- Initializing Basic Spark Session ---

✅ Detected Spark Version: 4.0.1

--- Compatibility Guidance ---
Please check the official Delta Lake documentation for the package corresponding to this specific Spark version.

--- Full Build Info (may contain Scala version) ---
-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-modules=jdk.incubator.vector --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-