In [1]:
# --- JUPYTER NOTEBOOK CELL ---

import os
import sys

from pyspark.sql import SparkSession

# --- Configuration for JDBC Driver ---
# Maven coordinates of the PostgreSQL JDBC driver (version pinned).
POSTGRES_JDBC_PACKAGE = "org.postgresql:postgresql:42.5.0"

# --- 1. Define Connection Parameters ---
# Connection settings are read from environment variables so that no secret is
# stored in the notebook source or its history:
#   POSTGRES_JDBC_URL  - JDBC URL      (default: local `spark_db` database)
#   POSTGRES_USER      - database user (default: `spark_user`)
#   POSTGRES_PASSWORD  - database password (no default on purpose)
POSTGRES_URL = os.environ.get(
    "POSTGRES_JDBC_URL", "jdbc:postgresql://localhost:5432/spark_db"
)
POSTGRES_PROPERTIES = {
    "user": os.environ.get("POSTGRES_USER", "spark_user"),
    "password": os.environ.get("POSTGRES_PASSWORD", ""),
    "driver": "org.postgresql.Driver",
}
TABLE_NAME = "sample_data"

# Surface a missing password early instead of leaving the reader to decode a
# driver-level authentication error later on.
if not POSTGRES_PROPERTIES["password"]:
    print(
        "WARNING: POSTGRES_PASSWORD is not set; "
        "the database connection may be rejected."
    )

# End-to-end smoke test: start Spark, write a small DataFrame to PostgreSQL
# over JDBC, read it back, and check that the rows survived the round trip.

# Pre-bind the name so the `finally` clause only ever stops a session created
# by this run, never a stale `spark` left in the kernel by an earlier cell.
spark = None

try:
    print("1. Setting up SparkSession...")

    # The JDBC driver is supplied through `spark.jars.packages`.
    # NOTE(review): getOrCreate() hands back an already-running session when
    # one exists, in which case this setting may not take effect -- restart
    # the kernel if the driver class cannot be found.
    spark = (
        SparkSession.builder
        .appName("PostgresIntegration_Jupyter")
        .config("spark.jars.packages", POSTGRES_JDBC_PACKAGE)
        .getOrCreate()
    )

    print("...SparkSession created successfully with JDBC driver.")

    # --- 2. Create a Sample DataFrame ---
    data = [("A", 10), ("B", 20), ("C", 30)]
    columns = ["key", "value"]
    source_df = spark.createDataFrame(data, columns)
    print("\nSample DataFrame to write to Postgres:")
    source_df.show()

    # --- 3. Write Data to PostgreSQL (Load) ---
    # "overwrite" replaces whatever is currently stored in TABLE_NAME.
    print(f"\nWriting data to PostgreSQL table '{TABLE_NAME}'...")
    source_df.write \
        .format("jdbc") \
        .option("url", POSTGRES_URL) \
        .option("dbtable", TABLE_NAME) \
        .mode("overwrite") \
        .options(**POSTGRES_PROPERTIES) \
        .save()
    print("...Data successfully written.")

    # --- 4. Read Data from PostgreSQL (Extract) ---
    print(f"\nReading data back from PostgreSQL table '{TABLE_NAME}'...")
    read_df = spark.read \
        .format("jdbc") \
        .option("url", POSTGRES_URL) \
        .option("dbtable", TABLE_NAME) \
        .options(**POSTGRES_PROPERTIES) \
        .load()

    # Actually verify the round trip before claiming it. Rows are compared
    # order-insensitively because the read does not return them in the order
    # they were written.
    expected_rows = sorted(data)
    actual_rows = sorted(tuple(row) for row in read_df.collect())
    if actual_rows != expected_rows:
        raise ValueError(
            f"Round-trip mismatch: wrote {expected_rows}, read back {actual_rows}"
        )

    print("...Data successfully read and verified:")
    read_df.show()
    print("\nPySpark-PostgreSQL Integration Test: SUCCESS!")

except Exception as e:
    print("\nPySpark-PostgreSQL Integration Test: FAILED!")
    # Print the specific error for debugging
    print(f"Error: {e}")
    # Re-raise rather than sys.exit(1): in a notebook SystemExit hides the real
    # traceback, while an uncaught exception still yields a non-zero exit
    # status when this code is run as a plain script.
    raise

finally:
    # Stop the Spark Session (only if this run managed to create one)
    if spark is not None:
        spark.stop()
        print("\nSpark Session stopped.")

1. Setting up SparkSession...


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/somnath/.ivy2.5.2/cache
The jars for the packages stored in: /home/somnath/.ivy2.5.2/jars
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-27dc6d01-1344-454a-93b0-d7c9badf497d;1.0
	confs: [default]
	found org.postgresql#postgresql;42.5.0 in central
	found org.checkerframework#checker-qual;3.5.0 in central
:: resolution report :: resolve 181ms :: artifacts dl 7ms
	:: modules in use:
	org.checkerframework#checker-qual;3.5.0 from central in [default]
	org.postgresql#postgresql;42.5.0 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------

...SparkSession created successfully with JDBC driver.

Sample DataFrame to write to Postgres:


                                                                                

+---+-----+
|key|value|
+---+-----+
|  A|   10|
|  B|   20|
|  C|   30|
+---+-----+


Writing data to PostgreSQL table 'sample_data'...


                                                                                

...Data successfully written.

Reading data back from PostgreSQL table 'sample_data'...
...Data successfully read and verified:
+---+-----+
|key|value|
+---+-----+
|  B|   20|
|  A|   10|
|  C|   30|
+---+-----+


PySpark-PostgreSQL Integration Test: SUCCESS!

Spark Session stopped.
