# SNOWPARK CONNECT FOR APACHE SPARK *TM*

In this demo we will see how you can:
- Connect to the Snowpark Connect server
- Execute simple PySpark code
- Create nested table structures and write to Snowflake
- Generate synthetic data for support cases and write to Snowflake
- Create a spark UDF, register it and invoke it directly
- Create a Snowflake Python function and invoke it with SQL Passthrough from Spark
 


Establish the Spark connection and start the Snowpark Connect server.

In [None]:
from snowflake import snowpark_connect
from snowflake.snowpark.context import get_active_session

# Fetch the Snowpark session via get_active_session() and print it as a quick sanity check.
session = get_active_session()
print(session)

# Initialise the Spark session through Snowpark Connect; `spark` is used by every later cell.
spark = snowpark_connect.server.init_spark_session()

Query structured data from a Snowflake table inside Spark

In [None]:
# Run SHOW TABLES through Spark SQL, keep at most 10 of the returned rows, and print them.
df = spark.sql("show tables").limit(10)
df.show()

Below is an extremely simple example. If you would like to go further, try these [PySpark examples](https://docs.google.com/document/d/1F9mmoSP4DuObNREvbQ5lClrExrBThbB_Ww6_7NsI0es/edit?tab=t.ltuj1iuzoic1#heading=h.wl5zn2ai10cn) or explore guides from sites such as [Kaggle](https://www.kaggle.com/code/kkhandekar/apache-spark-beginner-tutorial).

In [None]:
from pyspark.sql import Row

# Build a tiny three-row DataFrame (integer column `a`, float column `b`) and print it.
sample_rows = [
    Row(a=1, b=2.0),
    Row(a=2, b=3.0),
    Row(a=4, b=5.0),
]
df = spark.createDataFrame(sample_rows)
df.show()


#### Generate synthetic support case data using PySpark
The below code defines a schema and then populates it with various data types, including random strings, dates, and boolean values.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, TimestampType, BooleanType
from pyspark.sql.functions import lit, rand, expr, date_add, to_timestamp
import random
from datetime import datetime, timedelta



# Column layout of the synthetic support-case table, written as one
# "NAME TYPE" pair per line, separated by commas.
schema = """
    CASE_ID STRING,
    CATEGORY STRING,
    CASE_TITLE STRING,
    CASE_DESCRIPTION STRING,
    LAST_UPDATE TIMESTAMP,
    STATUS STRING,
    DATE_CREATED TIMESTAMP,
    DATE_CLOSED TIMESTAMP,
    REQUIRED_ESCALATION BOOLEAN
"""

# --- Knobs for the synthetic data generator ---

# Number of support cases to fabricate.
num_records = 1000

# Pool of case categories.
categories = [
    "Technical Issue",
    "Billing Inquiry",
    "Feature Request",
    "Account Management",
    "Bug Report",
    "General Question",
]

# Pool of case lifecycle states.
statuses = [
    "Open",
    "In Progress",
    "Resolved",
    "Closed",
    "Pending Customer",
]

# Title patterns; the single `{}` slot is filled with a subject placeholder.
case_titles_templates = [
    "Problem with {}",
    "Issue accessing {}",
    "Request for new feature: {}",
    "Account update required for {}",
    "Bug in {} module",
    "Question about {} functionality",
]

# Description patterns; the single `{}` slot is filled with a subject placeholder.
description_templates = [
    "User reported that {} is not working as expected. Needs investigation.",
    "Customer is unable to {} after recent update.",
    "Details: User needs assistance with {}. Please provide guidance.",
    "Troubleshooting steps taken: {}. Still experiencing the issue.",
    "New feature request: {}. Describe desired functionality and benefits.",
    "Error observed: {}. Stack trace attached if available.",
]

# Function to generate a random datetime within a range
def random_date(start_date, end_date):
    """Return a random datetime in the closed interval [start_date, end_date].

    The offset from ``start_date`` is drawn uniformly in whole seconds over the
    full span, so the result is never earlier than ``start_date`` and never
    later than ``end_date``.

    Args:
        start_date: Lower bound (``datetime``).
        end_date: Upper bound (``datetime``).

    Returns:
        ``start_date`` itself when ``start_date >= end_date`` or when the span
        is shorter than one second; otherwise ``start_date`` plus a random
        whole number of seconds that does not overshoot ``end_date``.
    """
    # Empty or inverted range: nothing to sample, fall back to the lower bound.
    if start_date >= end_date:
        return start_date

    # Truncate to whole seconds so that start_date + offset can never pass end_date.
    span_seconds = int((end_date - start_date).total_seconds())

    # randrange(span + 1) yields 0..span inclusive; a sub-second span gives offset 0.
    return start_date + timedelta(seconds=random.randrange(span_seconds + 1))


# Generate the synthetic support cases one row at a time.
data = []
current_time = datetime.now()
start_creation_date = current_time - timedelta(days=365)  # Cases are created within the last year

for i in range(num_records):
    case_id = f"CASE-{100000 + i}"
    category = random.choice(categories)

    # Subject inserted into the title and description templates. It is drawn from
    # four kinds so the generated text actually varies. The previous case-sensitive
    # template matching ("Module"/"Service"/"Feature") never matched any template,
    # so every case ended up with a "User Account NNN" placeholder.
    placeholder = random.choice([
        f"Module {random.randint(1, 10)}",
        f"Service {chr(65 + random.randint(0, 5))}",
        f"Feature {random.randint(1, 20)}",
        f"User Account {random.randint(100, 999)}",
    ])

    case_title = random.choice(case_titles_templates).format(placeholder)
    case_description = random.choice(description_templates).format(placeholder)

    # Creation time is sampled between one year ago and now.
    date_created = random_date(start_creation_date, current_time)

    date_closed = None
    status = random.choice(statuses)

    # Only finished cases get a closing time: 1 minute to 30 days after creation,
    # capped at the current time.
    if status in ("Resolved", "Closed"):
        date_closed_earliest = date_created + timedelta(minutes=1)
        date_closed_latest = date_created + timedelta(days=30)
        effective_date_closed_end = min(date_closed_latest, current_time)

        if date_closed_earliest < effective_date_closed_end:
            date_closed = random_date(date_closed_earliest, effective_date_closed_end)
        else:
            # Window is empty (case created moments ago): close it at the cap.
            date_closed = effective_date_closed_end

    # Last update falls between creation and closing (or now, for cases still open).
    if date_closed is not None:
        last_update = random_date(date_created, date_closed)
    else:
        last_update = random_date(date_created, current_time)

    # Roughly one case in four requires escalation.
    required_escalation = random.choice([True, False, False, False])

    data.append((case_id, category, case_title, case_description, last_update, status, date_created, date_closed, required_escalation))

# Create the DataFrame from the tuples, then rename the columns using the
# names (first token of each entry) listed in `schema`.
df = spark.createDataFrame(data).toDF(*[field.strip().split()[0] for field in schema.strip().split(",")])

# Show a sample of the generated data
print(f"\nGenerated {num_records} records.")
df.show(5, truncate=False)



# Persist the generated cases, replacing any existing Support_Cases table.
df.write.mode("overwrite").saveAsTable("Support_Cases")

### Spark UDF
The code snippet below reads data from a Snowflake table, applies a custom Spark User-Defined Function (UDF) to categorize text, and then writes the transformed data back to a new Snowflake table.

In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import Row

from pyspark.sql.functions import col, unix_timestamp, when, regexp_replace,lower
from pyspark.sql.types import StringType

from pyspark.sql.functions import udf

def udf_parse_case_description(input_desc):
    """Map a case description to an intent label using keyword matching.

    The description is lower-cased and checked against an ordered list of
    keyword groups; the first group with any keyword occurring as a substring
    decides the label.

    Args:
        input_desc: Free-text case description, or ``None``.

    Returns:
        ``"unknown"`` for ``None`` input, the label of the first matching
        keyword group, ``"general_inquiry"`` when nothing matches, or
        ``"error"`` if processing the input raises an exception.
    """
    # Order matters: earlier groups take precedence over later ones.
    intent_rules = (
        # Billing/Payment
        ("billing_payment",
         ("refund", "charged twice", "duplicate charges", "invoice", "payment",
          "billing error", "subscription", "credit card", "transaction")),
        # Shipping/Delivery
        ("shipping_delivery",
         ("not delivered", "missing package", "stuck in transit", "tracking",
          "shipment", "delivery", "late", "lost")),
        # Technical Issue
        ("technical_issue",
         ("not working", "error", "bug", "malfunction", "crashed", "login",
          "password reset", "connectivity", "performance", "glitch", "broken")),
        # Account Management
        ("account_management",
         ("account access", "update profile", "change email", "close account",
          "password", "username", "profile update")),
        # Product Inquiry/Feature Request
        ("product_inquiry_feature_request",
         ("availability", "warranty", "discount", "feature request",
          "compatibility", "specs", "how to", "information about")),
        # High Priority/Escalation
        ("high_priority_escalation",
         ("urgent", "critical", "escalate", "immediate attention", "severe",
          "blocking")),
    )

    try:
        # Missing descriptions get their own label instead of failing.
        if input_desc is None:
            return "unknown"

        text = input_desc.lower()
        for intent, keywords in intent_rules:
            for keyword in keywords:
                if keyword in text:
                    return intent

        # General Inquiry - fallback when no keyword group matched.
        return "general_inquiry"
    except Exception:
        # Any failure (e.g. a non-string input) is reported as a label, not raised.
        return "error"

# Wrap the Python classifier as a Spark UDF that returns a string.
parse_case_udf = udf(udf_parse_case_description, StringType())
# Alternative: register it by name so it can be called from Spark SQL.
#spark.udf.register("udf_parse_case_description", udf_parse_case_description, StringType())

# Load the source table and add an INTENT column computed from CASE_DESCRIPTION.
table_name = "SUPPORT_CASES"
df = spark.read.table(table_name)
intent_column = parse_case_udf(col("CASE_DESCRIPTION"))
df1 = df.withColumn("INTENT", intent_column)

# Alternative using the SQL-registered UDF:
#df1 = spark.sql(f"SELECT *, udf_parse_case_description(CASE_DESCRIPTION) AS INTENT FROM {table_name}")

# Persist the enriched rows, replacing any existing Transformed_Cases table.
writer = df1.write.mode("overwrite")
writer.saveAsTable("Transformed_Cases")



We can query the Transformed_Cases table with familiar Spark SQL here 

In [None]:
# Read the transformed table back with Spark SQL and print a preview.
transformed_query = "select * from TRANSFORMED_CASES"
df = spark.sql(transformed_query)
df.show()

Create a simple Python UDF in Snowflake to categorize text data stored in tables. 

In [None]:
# Create (or replace) the Snowflake Python function udf_parse_review through the
# Snowpark session. The handler `parse` contains the same keyword rules as the
# Spark UDF udf_parse_case_description: None -> "unknown", first matching keyword
# group -> its label, no match -> "general_inquiry", any exception -> "error".
session.sql("""CREATE OR REPLACE FUNCTION udf_parse_review(input STRING)
RETURNS STRING
LANGUAGE PYTHON
RUNTIME_VERSION = '3.9'
HANDLER = 'parse'
AS
$$
def parse(input_desc):
    try:
        if input_desc is None: # Handle None inputs gracefully
            return "unknown"
            
        desc = input_desc.lower()

        # Billing/Payment
        if any(keyword in desc for keyword in ["refund", "charged twice", "duplicate charges", "invoice", "payment", "billing error", "subscription", "credit card", "transaction"]):
            return "billing_payment"
        
        # Shipping/Delivery
        elif any(keyword in desc for keyword in ["not delivered", "missing package", "stuck in transit", "tracking", "shipment", "delivery", "late", "lost"]):
            return "shipping_delivery"
            
        # Technical Issue
        elif any(keyword in desc for keyword in ["not working", "error", "bug", "malfunction", "crashed", "login", "password reset", "connectivity", "performance", "glitch", "broken"]):
            return "technical_issue"

        # Account Management
        elif any(keyword in desc for keyword in ["account access", "update profile", "change email", "close account", "password", "username", "profile update"]):
            return "account_management"

        # Product Inquiry/Feature Request
        elif any(keyword in desc for keyword in ["availability", "warranty", "discount", "feature request", "compatibility", "specs", "how to", "information about"]):
            return "product_inquiry_feature_request"
            
        # High Priority/Escalation
        elif any(keyword in desc for keyword in ["urgent", "critical", "escalate", "immediate attention", "severe", "blocking"]):
            return "high_priority_escalation"
        
        # General Inquiry - Fallback
        else:
            return "general_inquiry"

    except Exception as e:
        
        return "error"
$$;
""").collect()

### Enable SQL Passthrough for Snowpark Connect 
Configure the Spark session to allow direct passthrough of SQL queries to Snowflake. When `snowpark.connect.sql.passthrough` is set to `True`, `spark.sql()` calls send the SQL statement directly to the Snowflake backend for execution without PySpark's local processing. This is particularly useful for leveraging Snowflake's native capabilities in Spark code.


In [None]:
# Turn on SQL passthrough so the spark.sql() statement below is sent to Snowflake as-is.
spark.conf.set("snowpark.connect.sql.passthrough", True)

# Create a view that pairs each case description from SUPPORT_CASES with the
# intent label returned by the Snowflake function udf_parse_review.
spark.sql("""
    CREATE OR REPLACE VIEW REVIEW_INTENT_VIEW AS
    SELECT 
        CASE_DESCRIPTION,
        udf_parse_review(CASE_DESCRIPTION) AS INTENT
    FROM SUPPORT_CASES
""")

Query the newly created view

In [None]:
# Read the view through the Spark reader and print up to 100 rows.
spark.read.table('REVIEW_INTENT_VIEW').show(100)

## End of Notebook
