In [None]:
# Load the dotenv IPython extension and run it so variables from a .env file
# end up in the process environment (presumably the S3_* / POSTGRES_* values
# that later cells read with os.getenv — confirm against the project's .env).
%load_ext dotenv
%dotenv

In [None]:
import logging
import os
from typing import Optional

from pyspark.sql import SparkSession

# Configure logging at INFO level for the notebook.
logging.basicConfig(level=logging.INFO)
# Module-level logger; SimpleSparkSession uses it to report session start/stop.
logger = logging.getLogger(__name__)


class SimpleSparkSession:
    """Simple Spark session builder for Jupyter notebooks.

    The constructor only records settings. ``build_session`` / ``get_session``
    turn them into a ``SparkSession`` and cache it on the instance, so repeated
    calls return the same object until ``stop_session`` is called.

    Args:
        app_name: Spark application name.
        master: Spark master URL.
        spark_config: Extra ``{key: value}`` Spark options. They are applied
            after the built-in defaults and before the package / S3 options.
        enable_hive_support: Call ``enableHiveSupport()`` on the builder.
        s3_bucket_name: Stored on the instance; not used when building.
        s3_endpoint: Custom S3A endpoint (for non-AWS object stores).
        s3_access_key: S3 access key. S3 options are only emitted when both
            the access key and the secret key are set.
        s3_secret_key: S3 secret key.
        s3_region: Region sent as ``fs.s3a.endpoint.region`` when an endpoint
            is configured.
        s3_path_style_access: Emit the path-style access option set.
        postgres_config: Stored on the instance; not used when building.
        packages: Maven coordinates joined into ``spark.jars.packages``.
    """

    def __init__(
            self,
            app_name="Jupyter Spark Session",
            master="local[*]",
            spark_config=None,
            enable_hive_support=False,
            # S3 configuration
            s3_bucket_name=None,
            s3_endpoint=None,
            s3_access_key=None,
            s3_secret_key=None,
            s3_region="us-east-1",
            s3_path_style_access=True,
            # PostgreSQL configuration
            postgres_config=None,
            # Package configuration
            packages=None
    ):
        self.app_name = app_name
        self.master = master
        self.spark_config = spark_config or {}
        self.enable_hive_support = enable_hive_support

        # S3 config
        self.s3_bucket_name = s3_bucket_name
        self.s3_endpoint = s3_endpoint
        self.s3_access_key = s3_access_key
        self.s3_secret_key = s3_secret_key
        self.s3_region = s3_region
        self.s3_path_style_access = s3_path_style_access

        # PostgreSQL config
        self.postgres_config = postgres_config
        # Optional path to a JDBC driver jar; when set (by assigning the
        # attribute) it is added to the driver and executor class paths.
        self.jdbc_driver_path: Optional[str] = None

        # Packages
        self.packages = packages or []

        # Cached session, created lazily by build_session()
        self._session = None

    def _s3_plaintext_endpoint(self):
        """Return True when SSL should be switched off for the S3A connector.

        SSL is only disabled for a custom endpoint that does not explicitly use
        ``https://``. Without a custom endpoint the connector keeps its default,
        so credentials are never sent over plain HTTP by accident.
        """
        endpoint = (self.s3_endpoint or "").strip().lower()
        return bool(endpoint) and not endpoint.startswith("https://")

    def _collect_s3_config(self):
        """Return the S3A options as an ordered list of ``(key, value)`` pairs.

        The list is empty unless both the access key and the secret key are set.
        """
        if not (self.s3_access_key and self.s3_secret_key):
            return []

        options = [
            ("spark.hadoop.fs.s3a.access.key", self.s3_access_key),
            ("spark.hadoop.fs.s3a.secret.key", self.s3_secret_key),
            ("spark.hadoop.fs.s3a.aws.credentials.provider",
             "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider"),
        ]

        # Config for non-AWS S3
        if self.s3_endpoint:
            options.append(("spark.hadoop.fs.s3a.endpoint", self.s3_endpoint))
            options.append(("spark.hadoop.fs.s3a.endpoint.region", self.s3_region))

        # Path style access for non-AWS implementations
        if self.s3_path_style_access:
            options.append(("spark.hadoop.fs.s3a.path.style.access", "true"))
            if self._s3_plaintext_endpoint():
                options.append(("spark.hadoop.fs.s3a.connection.ssl.enabled", "false"))
            options.append(("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"))
            options.append(("spark.hadoop.fs.s3a.multiobjectdelete.enable", "false"))

        return options

    def _collect_config(self):
        """Return every Spark option as an ordered list of ``(key, value)`` pairs.

        Order matters because a later entry overrides an earlier one with the
        same key: defaults, JDBC class path, user ``spark_config``, packages,
        then S3 options.
        """
        options = [
            ("spark.sql.execution.arrow.pyspark.enabled", "true"),
            # JVM flag passed to driver and executors
            # NOTE(review): presumably required by newer JDKs — confirm.
            ("spark.driver.extraJavaOptions", "-Djava.security.manager=allow"),
            ("spark.executor.extraJavaOptions", "-Djava.security.manager=allow"),
        ]

        if self.jdbc_driver_path:
            options.append(("spark.driver.extraClassPath", self.jdbc_driver_path))
            options.append(("spark.executor.extraClassPath", self.jdbc_driver_path))

        # Add all user-supplied configuration options
        options.extend(self.spark_config.items())

        # Configure packages
        if self.packages:
            options.append(("spark.jars.packages", ",".join(self.packages)))

        options.extend(self._collect_s3_config())
        return options

    def build_session(self):
        """Build and return a SparkSession (cached after the first call)."""
        if self._session is not None:
            return self._session

        # A half-supplied key pair would otherwise be ignored without a trace.
        if bool(self.s3_access_key) != bool(self.s3_secret_key):
            logger.warning(
                "Only one of s3_access_key / s3_secret_key is set; skipping S3 configuration"
            )

        builder = SparkSession.builder.appName(self.app_name).master(self.master)

        # Add Hive support if requested (before the options below, so a user
        # supplied spark_config entry can still override it)
        if self.enable_hive_support:
            builder = builder.enableHiveSupport()

        for key, value in self._collect_config():
            builder = builder.config(key, value)

        # Build the session
        logger.info("Building Spark session with app name: %s, master: %s", self.app_name, self.master)
        self._session = builder.getOrCreate()

        return self._session

    def get_session(self):
        """Get the current SparkSession or create a new one."""
        return self.build_session()

    def stop_session(self):
        """Stop the current Spark session if it exists and drop the cached reference."""
        if self._session is not None:
            self._session.stop()
            self._session = None
            logger.info("Spark session stopped")

In [None]:
# Create the notebook-wide Spark session. Secrets are read from the
# environment; a variable that is not set comes through as None.
spark = SimpleSparkSession(
    app_name="Data Analysis Notebook",
    enable_hive_support=False,
    # Connector jars resolved by Spark at start-up (PostgreSQL JDBC + S3A)
    packages=[
        "org.postgresql:postgresql:42.5.4",
        "org.apache.hadoop:hadoop-aws:3.3.4",
        "com.amazonaws:aws-java-sdk-bundle:1.12.426",
    ],
    # Object storage settings
    s3_bucket_name="traffy-troffi",
    s3_endpoint=os.environ.get("S3_ENDPOINT"),
    s3_access_key=os.environ.get("S3_ACCESS_KEY"),
    s3_secret_key=os.environ.get("S3_SECRET_KEY"),
    s3_region="garage",
    s3_path_style_access=True,
    # PostgreSQL connection properties handed to the builder
    postgres_config=dict(
        user=os.environ.get("POSTGRES_USER"),
        password=os.environ.get("POSTGRES_PASSWORD"),
        driver="org.postgresql.Driver",
        currentSchema="public",
    ),
).get_session()

In [None]:
# Load the complaints table over JDBC.
# Credentials come from the environment instead of being written into the
# notebook: the user falls back to "postgres", while a missing
# POSTGRES_PASSWORD fails immediately with a KeyError rather than producing an
# obscure driver error later.
df = spark.read.jdbc(
    url="jdbc:postgresql://localhost:5432/traffy-troffi",
    table="traffy_fondue",
    properties={
        "user": os.getenv("POSTGRES_USER", "postgres"),
        "password": os.environ["POSTGRES_PASSWORD"],
        "driver": "org.postgresql.Driver",
        "currentSchema": "public",
    },
)
df.printSchema()

In [None]:
import pyspark.sql.functions as F

In [None]:
# Show the rows for one district whose `categories` array contains one
# specific category; the two chained conditions are combined with AND.
(
    df
    .where(F.col("district") == 'ราชเทวี')
    .where(F.array_contains('categories', 'ถนน'))
    .show()
)

In [None]:
# Work on a ~1% random sample so the per-row Python UDF further down stays fast.
# A fixed seed keeps the sample stable between runs.
# NOTE(review): stability also depends on the JDBC source returning rows in a
# consistent order/partitioning — confirm if exact reproducibility matters.
short_df = df.sample(fraction=0.01, seed=42)
short_df.show()

In [29]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType
from shapely.geometry import Point
import geopandas as gpd

# Obtain a Spark session handle for this section
spark = (
    SparkSession.builder
    .appName("GeospatialProcessing")
    .getOrCreate()
)

# Struct type returned by the lookup UDF: two nullable string fields
return_schema = (
    StructType()
    .add("e_subdistrict", StringType(), True)
    .add("e_district", StringType(), True)
)


def _strip_admin_prefix(name, prefix):
    """Return ``name`` without a leading ``prefix`` and surrounding whitespace.

    Only a prefix at the start is removed; the same characters appearing later
    in the name are left intact. Non-string values (e.g. a missing property)
    yield None.
    """
    if not isinstance(name, str):
        return None
    text = name.strip()
    if text.startswith(prefix):
        text = text[len(prefix):]
    return text.strip()


# Find the region (subdistrict polygon) that contains a point
def point_in_region(lat, lon, regions_list):
    """Look up the subdistrict and district whose polygon contains a point.

    Args:
        lat: Latitude; anything ``float()`` accepts, or None.
        lon: Longitude; anything ``float()`` accepts, or None.
        regions_list: Iterable of dicts with a ``"geometry"`` entry (an object
            exposing ``contains(point)``) and a ``"properties"`` dict holding
            the ``"SNAME"`` (subdistrict) and ``"DNAME"`` (district) names.

    Returns:
        ``(subdistrict, district)`` of the first region containing the point,
        with the leading "แขวง" / "เขต" prefix removed; ``(None, None)`` when a
        coordinate is missing or not numeric, or when no region contains the
        point. A name that is missing or not a string comes back as None.

    Malformed region records (e.g. no ``"geometry"`` key) raise instead of
    being reported as "no match", so a broken region file is noticed.
    """
    if lat is None or lon is None:
        return None, None

    # Bad coordinate values are an expected data problem: treat as "no match".
    try:
        x, y = float(lon), float(lat)
    except (TypeError, ValueError):
        return None, None

    point = Point(x, y)  # the point is built as (lon, lat)
    for region in regions_list:
        geometry = region["geometry"]
        # Skip records without a geometry instead of failing on them
        if geometry is None or not geometry.contains(point):
            continue

        properties = region["properties"]
        return (
            _strip_admin_prefix(properties.get("SNAME"), "แขวง"),
            _strip_admin_prefix(properties.get("DNAME"), "เขต"),
        )
    return None, None


# Load the subdistrict polygons from the GeoJSON file
regions_gdf = gpd.read_file("../data/subdistricts.geojson")

# Flatten every row into a plain dict: the geometry object plus a dict of all
# remaining columns, so the list can be serialized and broadcast
regions_data = [
    {
        "geometry": row.geometry,
        "properties": {
            column: cell
            for column, cell in row.items()
            if column != 'geometry'
        },
    }
    for _, row in regions_gdf.iterrows()
]

# Broadcast the region list so the UDF can read it through `.value`
regions_broadcast = spark.sparkContext.broadcast(regions_data)


# Python function behind the UDF: returns a (subdistrict, district) pair
def check_point_wrapper(lat, lon):
    """Run ``point_in_region`` for one point against the broadcast region list."""
    return point_in_region(lat, lon, regions_broadcast.value)


# Wrap the lookup function as a Spark UDF that returns the two-field struct
point_in_region_udf = udf(check_point_wrapper, return_schema)

# Add the lookup result for every sampled row as a struct column
result_df = short_df.withColumn(
    "location_info",
    point_in_region_udf(short_df.latitude, short_df.longitude),
)

# Promote both struct fields to top-level columns, then drop the struct itself
final_df = result_df.select(
    "*",
    *[
        result_df["location_info"].getField(field).alias(field)
        for field in ("e_subdistrict", "e_district")
    ],
).drop("location_info")

# Show results
final_df.show()

+-----------+--------------------+--------------------+--------------------+--------------------+--------+---------+---------+--------------+--------------------+-----------------+--------------+----------+
|  ticket_id|           complaint|           timestamp|               image|         image_after|latitude|longitude| district|   subdistrict|          categories|   categories_idx| e_subdistrict|e_district|
+-----------+--------------------+--------------------+--------------------+--------------------+--------+---------+---------+--------------+--------------------+-----------------+--------------+----------+
|2024-8G9PBH|ขอลอกท่อรามอินทรา...|2024-08-08 07:40:...|https://storage.g...|https://storage.g...|13.86775|100.61015|   บางเขน|    อนุสาวรีย์|       [ท่อระบายน้ำ]|           [11.0]|    อนุสาวรีย์|    บางเขน|
|2024-A23C77|ทางก่อสร้างแจ้งว่...|2024-05-01 11:47:...|https://storage.g...|https://storage.g...|13.76625|100.34874| ทวีวัฒนา|      ทวีวัฒนา|       [ความปลอดภัย]|          

In [30]:
# Preview only the first few region records; dumping the whole list floods the
# notebook output without adding information.
regions_data[:3]

[{'geometry': <POLYGON ((100.653 13.886, 100.653 13.886, 100.653 13.886, 100.653 13.886, 1...>,
  'properties': {'PNAME': 'กรุงเทพมหานคร',
   'SUBDIST_ID': 64,
   'DCODE': '1005',
   'AREA_': 22922521.274935,
   'SUB_CODE': '0508',
   'PERIMETER': 24078.875,
   'DNAME': 'เขตบางเขน',
   'OBJECTID': 52.0,
   'SNAME': 'แขวงท่าแร้ง',
   'SCODE': '100508',
   'SCODE_BMA': '100502',
   'PCODE': '10',
   'SUBDIST_': 65,
   'water': 3007}},
 {'geometry': <POLYGON ((100.788 13.879, 100.788 13.879, 100.788 13.879, 100.788 13.879, 1...>,
  'properties': {'PNAME': 'กรุงเทพมหานคร',
   'SUBDIST_ID': 66,
   'DCODE': '1046',
   'AREA_': 17349298.283536,
   'SUB_CODE': '4605',
   'PERIMETER': 19713.927734,
   'DNAME': 'เขตคลองสามวา',
   'OBJECTID': 55.0,
   'SNAME': 'แขวงทรายกองดินใต้',
   'SCODE': '104605',
   'SCODE_BMA': '104605',
   'PCODE': '10',
   'SUBDIST_': 68,
   'water': 6214}},
 {'geometry': <POLYGON ((100.763 13.869, 100.763 13.869, 100.763 13.869, 100.763 13.869, 1...>,
  'properties': {'

In [32]:
# Stop the Spark session; cells that use `spark` will not work after this
# until a session is created again.
spark.stop()