In [1]:
# Load the `dotenv` IPython extension and run its `%dotenv` magic.
# NOTE(review): presumably this populates os.environ from a local .env file;
# later cells read the S3_* and POSTGRES_* variables via os.getenv — confirm
# the .env file defines them.
%load_ext dotenv
%dotenv

In [2]:
import logging
from typing import Optional
import os

from pyspark.sql import SparkSession

# Grab this notebook's logger, then make sure INFO-level records are emitted
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


class SimpleSparkSession:
    """Lazily builds and caches a single SparkSession for Jupyter notebooks.

    The constructor only records settings; nothing touches Spark until
    ``build_session()`` / ``get_session()`` is called. The created session is
    cached on the instance and returned on every subsequent call until
    ``stop_session()`` is invoked.

    Args:
        app_name: Spark application name.
        master: Spark master URL.
        spark_config: Extra ``key -> value`` Spark options. They are applied
            before the ``packages`` and S3 options, so those dedicated
            arguments win when the same key is set in both places.
        enable_hive_support: Call ``enableHiveSupport()`` on the builder.
        s3_bucket_name: Stored on the instance; not used by ``build_session``.
        s3_endpoint: Custom S3 endpoint (for S3-compatible stores).
        s3_access_key: S3 access key. S3 options are only applied when both
            the access key and the secret key are provided.
        s3_secret_key: S3 secret key.
        s3_region: Region written to ``fs.s3a.endpoint.region`` (only when
            ``s3_endpoint`` is set).
        s3_path_style_access: Enable path-style addressing and the related
            S3A options.
        postgres_config: Stored on the instance; not used by
            ``build_session``.
        packages: Maven coordinates joined into ``spark.jars.packages``.

    Attributes:
        jdbc_driver_path: Optional classpath entry. When set (after
            construction) it is added to the driver and executor
            ``extraClassPath``.
    """

    _S3A_PREFIX = "spark.hadoop.fs.s3a."

    def __init__(
            self,
            app_name: str = "Jupyter Spark Session",
            master: str = "local[*]",
            spark_config: Optional[dict] = None,
            enable_hive_support: bool = False,
            # S3 configuration
            s3_bucket_name: Optional[str] = None,
            s3_endpoint: Optional[str] = None,
            s3_access_key: Optional[str] = None,
            s3_secret_key: Optional[str] = None,
            s3_region: str = "us-east-1",
            s3_path_style_access: bool = True,
            # PostgreSQL configuration
            postgres_config: Optional[dict] = None,
            # Package configuration
            packages: Optional[list] = None
    ):
        self.app_name = app_name
        self.master = master
        self.spark_config = spark_config or {}
        self.enable_hive_support = enable_hive_support

        # S3 config
        self.s3_bucket_name = s3_bucket_name
        self.s3_endpoint = s3_endpoint
        self.s3_access_key = s3_access_key
        self.s3_secret_key = s3_secret_key
        self.s3_region = s3_region
        self.s3_path_style_access = s3_path_style_access

        # PostgreSQL config
        self.postgres_config = postgres_config
        self.jdbc_driver_path: Optional[str] = None

        # Packages
        self.packages = packages or []

        self._session = None

    def build_session(self) -> "SparkSession":
        """Build the SparkSession on first call and return the cached one afterwards."""
        if self._session is not None:
            return self._session

        # Base options applied to every session.
        # NOTE(review): the security-manager flag is presumably required by
        # newer JDKs — confirm against the JDK in use.
        builder = (
            SparkSession.builder
            .appName(self.app_name)
            .master(self.master)
            .config("spark.sql.execution.arrow.pyspark.enabled", "true")
            .config("spark.driver.extraJavaOptions", "-Djava.security.manager=allow")
            .config("spark.executor.extraJavaOptions", "-Djava.security.manager=allow")
        )

        # Add Hive support if requested
        if self.enable_hive_support:
            builder = builder.enableHiveSupport()

        if self.jdbc_driver_path:
            builder = builder.config("spark.driver.extraClassPath", self.jdbc_driver_path)
            builder = builder.config("spark.executor.extraClassPath", self.jdbc_driver_path)

        # User-supplied options (may override the base options above)
        for key, value in self.spark_config.items():
            builder = builder.config(key, value)

        # Configure packages
        if self.packages:
            builder = builder.config("spark.jars.packages", ",".join(self.packages))

        builder = self._apply_s3_config(builder)

        # Lazy %-formatting: the message is only rendered if INFO is enabled.
        logger.info(
            "Building Spark session with app name: %s, master: %s",
            self.app_name, self.master,
        )
        # getOrCreate() is used, so an already-active session may be returned
        # instead of a fresh one.
        self._session = builder.getOrCreate()

        return self._session

    def _uses_plain_http_endpoint(self) -> bool:
        """True when a custom endpoint is set and it is not an ``https://`` URL."""
        endpoint = (self.s3_endpoint or "").strip().lower()
        return bool(endpoint) and not endpoint.startswith("https://")

    def _apply_s3_config(self, builder):
        """Add the S3A options to ``builder`` when both credentials are present."""
        has_key = bool(self.s3_access_key)
        has_secret = bool(self.s3_secret_key)
        if not (has_key and has_secret):
            if has_key != has_secret:
                # Previously this case was skipped without any hint.
                logger.warning(
                    "Only one of s3_access_key / s3_secret_key was provided; "
                    "skipping S3 configuration"
                )
            return builder

        prefix = self._S3A_PREFIX
        builder = builder.config(prefix + "access.key", self.s3_access_key)
        builder = builder.config(prefix + "secret.key", self.s3_secret_key)
        builder = builder.config(prefix + "aws.credentials.provider",
                                 "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

        # Config for non-AWS S3
        if self.s3_endpoint:
            builder = builder.config(prefix + "endpoint", self.s3_endpoint)
            builder = builder.config(prefix + "endpoint.region", self.s3_region)

        # Path style access for non-AWS implementations
        if self.s3_path_style_access:
            builder = builder.config(prefix + "path.style.access", "true")
            builder = builder.config(prefix + "impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            builder = builder.config(prefix + "multiobjectdelete.enable", "false")
            # Only turn TLS off for a custom non-https endpoint (e.g. a local
            # S3-compatible store). With no custom endpoint, or an https://
            # one, the connector's default is left untouched so credentials
            # are not sent over plain HTTP just because path-style is on.
            if self._uses_plain_http_endpoint():
                builder = builder.config(prefix + "connection.ssl.enabled", "false")

        return builder

    def get_session(self) -> "SparkSession":
        """Get the current SparkSession or create a new one"""
        return self.build_session()

    def stop_session(self) -> None:
        """Stop the current Spark session if it exists and drop the cached reference."""
        if self._session is not None:
            self._session.stop()
            self._session = None
            logger.info("Spark session stopped")

In [3]:
# Create the notebook-wide Spark session; S3 and Postgres secrets are read from
# environment variables rather than written into the notebook.
spark = SimpleSparkSession(
    app_name="Data Analysis Notebook",
    enable_hive_support=False,
    # Maven coordinates passed to the session as extra packages
    packages=[
        "org.postgresql:postgresql:42.5.4",
        "org.apache.hadoop:hadoop-aws:3.3.4",
        "com.amazonaws:aws-java-sdk-bundle:1.12.426",
    ],
    # S3-compatible object store settings
    s3_bucket_name="traffy-troffi",
    s3_endpoint=os.environ.get("S3_ENDPOINT"),
    s3_region="garage",
    s3_access_key=os.environ.get("S3_ACCESS_KEY"),
    s3_secret_key=os.environ.get("S3_SECRET_KEY"),
    s3_path_style_access=True,
    # PostgreSQL connection properties
    postgres_config=dict(
        user=os.environ.get("POSTGRES_USER"),
        password=os.environ.get("POSTGRES_PASSWORD"),
        driver="org.postgresql.Driver",
        currentSchema="public",
    ),
).get_session()

INFO:__main__:Building Spark session with app name: Data Analysis Notebook, master: local[*]
25/05/08 16:42:09 WARN Utils: Your hostname, MacBook-Pro-khxng-Wichayada.local resolves to a loopback address: 127.0.0.1; using 10.203.69.21 instead (on interface en0)
25/05/08 16:42:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/namin/.ivy2/cache
The jars for the packages stored in: /Users/namin/.ivy2/jars
org.postgresql#postgresql added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-59cd2946-0e31-4bc8-aba2-9c003c503197;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/namin/Library/Caches/pypoetry/virtualenvs/traffy-troffi-sQpa58eM-py3.12/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.postgresql#postgresql;42.5.4 in central
	found org.checkerframework#checker-qual;3.5.0 in central
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.426 in central
downloading https://repo1.maven.org/maven2/org/postgresql/postgresql/42.5.4/postgresql-42.5.4.jar ...
	[SUCCESSFUL ] org.postgresql#postgresql;42.5.4!postgresql.jar (1444ms)
downloading https://repo1.maven.org/maven2/org/checkerframework/checker-qual/3.5.0/checker-qual-3.5.0.jar ...
	[SUCCESSFUL ] org.checkerframework#checker-qual;3.5.0!checker-qual.jar (388ms)
:: resolution report :: resolve 1904ms :: artifacts dl 1838ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.426 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.checkerframework#checker-qual;3.5.0 from central in [default]
	org.postgresql#postgresql;42.5.4 from central in [default]
	org.w

In [4]:
# Print the schema of the `traffy_fondue` table.
# Credentials are read from the environment (same POSTGRES_* variables the
# session cell uses) instead of being hard-coded; a missing variable fails
# fast with a KeyError rather than sending an empty credential to the driver.
spark.read.jdbc(
    table='traffy_fondue',
    url="jdbc:postgresql://localhost:5432/traffy-troffi",
    properties={
        "user": os.environ["POSTGRES_USER"],
        "password": os.environ["POSTGRES_PASSWORD"],
        "driver": "org.postgresql.Driver",
        "currentSchema": "public",
    },
).printSchema()

root
 |-- ticket_id: string (nullable = true)
 |-- complaint: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- image: string (nullable = true)
 |-- image_after: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- district: string (nullable = true)
 |-- subdistrict: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- categories_idx: array (nullable = true)
 |    |-- element: float (containsNull = true)



25/05/08 16:42:28 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [5]:
import pyspark.sql.functions as F

In [6]:
# Show tickets from district 'ราชเทวี' whose categories include 'ถนน'.
# Credentials are read from the environment (same POSTGRES_* variables the
# session cell uses) instead of being hard-coded; a missing variable fails
# fast with a KeyError.
spark.read.jdbc(
    table='traffy_fondue',
    url="jdbc:postgresql://localhost:5432/traffy-troffi",
    properties={
        "user": os.environ["POSTGRES_USER"],
        "password": os.environ["POSTGRES_PASSWORD"],
        "driver": "org.postgresql.Driver",
        "currentSchema": "public",
    },
).filter(
    (F.col("district") == 'ราชเทวี') & (F.array_contains('categories', 'ถนน'))
).show()

                                                                                

+-----------+--------------------+--------------------+--------------------+--------------------+--------+---------+--------+-----------+--------------------+--------------------+
|  ticket_id|           complaint|           timestamp|               image|         image_after|latitude|longitude|district|subdistrict|          categories|      categories_idx|
+-----------+--------------------+--------------------+--------------------+--------------------+--------+---------+--------+-----------+--------------------+--------------------+
|2024-8AKXPM|มีบ้านปล่อยน้ำจาก...|2024-11-06 08:35:...|https://storage.g...|https://storage.g...|13.75058|100.53849| ราชเทวี|ถนนเพชรบุรี|               [ถนน]|               [9.0]|
|2024-KPTM7X|พบคนไร้บ้านนอนที่...|2024-06-14 20:21:...|https://storage.g...|https://storage.g...|13.76059|100.53852| ราชเทวี|   ถนนพญาไท|[ทางเท้า, คนจรจัด...|    [10.0, 3.0, 9.0]|
|2024-MP49WU|14/5/67 ตามที่เคย...|2024-08-03 14:19:...|https://storage.g...|https://storage.g...|13.

In [None]:
# Count all rows in the `traffy_fondue` table.
# Credentials are read from the environment (same POSTGRES_* variables the
# session cell uses) instead of being hard-coded; a missing variable fails
# fast with a KeyError.
spark.read.jdbc(
    table='traffy_fondue',
    url="jdbc:postgresql://localhost:5432/traffy-troffi",
    properties={
        "user": os.environ["POSTGRES_USER"],
        "password": os.environ["POSTGRES_PASSWORD"],
        "driver": "org.postgresql.Driver",
        "currentSchema": "public",
    },
).count()

356257