# Simple Spark Data Example

This notebook demonstrates:
1. Creating a Spark session with executors
2. Creating sample data
3. Writing data to the `/data` shared volume
4. Reading the data back
5. Performing basic transformations

This tests both the Spark connectivity and the shared storage mount.

In [None]:
# Import required libraries
import os
import random
import sys
import tempfile
from datetime import datetime, timedelta

# Add the utils directory to the path
sys.path.append('/opt/teehr')

from simple_spark_helper import create_spark_session

In [None]:
# Uncomment to test write access to the in-cluster MinIO bucket.
# os.environ["AWS_ACCESS_KEY_ID"] = "minioadmin"
# os.environ["AWS_SECRET_ACCESS_KEY"] = "minioadmin123"

In [None]:
# Request a small Spark session (2 executors, 1 core and 1g each) for testing
session_options = dict(
    app_name="TEEHR-Data-Example",
    executor_instances=2,
    executor_memory="1g",
    executor_cores=1,
    driver_memory="1g",
)
spark = create_spark_session(**session_options)

# Announce readiness, then show where the Spark web UI can be reached
print("\n🎉 Spark session ready!")
ui_url = spark.sparkContext.uiWebUrl
print(f"Spark UI: {ui_url}")

In [None]:
# Test basic Spark functionality
print("📊 Testing basic Spark operations...")

# Use a dedicated, seeded generator so the sample values are identical on
# every Restart & Run All, without touching the global `random` state.
SAMPLE_SEED = 42
NUM_SAMPLE_ROWS = 1000
rng = random.Random(SAMPLE_SEED)

# Build (id, name, value) tuples on the driver; value is a float in [10, 100]
data = [
    (i, f"name_{i}", rng.uniform(10.0, 100.0))
    for i in range(NUM_SAMPLE_ROWS)
]
columns = ["id", "name", "value"]

df = spark.createDataFrame(data, columns)

# count() and show() are actions, so they exercise the Spark session end to end
print(f"✅ Created DataFrame with {df.count()} rows")
print("\n📋 Sample data:")
df.show(10)

In [None]:
# Check that /data directory is accessible
data_dir = "/data"
print(f"📁 Checking data directory: {data_dir}")

if os.path.isdir(data_dir):
    print("✅ Data directory exists")
    try:
        print(f"   Directory contents: {os.listdir(data_dir)}")
    except OSError as e:
        # A directory can exist without being listable; report it and carry on
        print(f"❌ Could not list directory contents: {e}")

    # Test write permissions with a uniquely named temporary file. A fixed
    # file name would overwrite (and then delete) any real file of that name
    # on the shared volume and would race with other users running this
    # notebook at the same time. The file is removed when the `with` exits.
    try:
        with tempfile.NamedTemporaryFile(
            mode="w", dir=data_dir, prefix="test_write_", suffix=".txt"
        ) as probe:
            probe.write("test")
            probe.flush()  # force the bytes out so a failing write surfaces here
        print("✅ Write permissions confirmed")
    except OSError as e:
        print(f"❌ Write permission test failed: {e}")
elif os.path.exists(data_dir):
    print(f"❌ {data_dir} exists but is not a directory!")
else:
    print("❌ Data directory does not exist!")

In [None]:
# Debug: Check detailed permissions and ownership
import subprocess
import stat


def _print_listing(path, heading, failure_prefix):
    """Print the output of ``ls -la path`` under ``heading``.

    Anything ``ls`` writes to stderr is printed as well. This is a best-effort
    diagnostic: any error is reported after ``failure_prefix`` instead of
    being raised.
    """
    try:
        result = subprocess.run(['ls', '-la', path], capture_output=True, text=True)
        print(f"\n{heading} (ls -la {path}):")
        print(result.stdout)
        if result.stderr:
            print(f"   Errors: {result.stderr}")
    except Exception as e:
        print(f"   {failure_prefix}: {e}")


print(f"🔍 Detailed investigation of {data_dir}:")

# Check permissions and ownership
try:
    stat_info = os.stat(data_dir)
    mode = stat.filemode(stat_info.st_mode)
    uid = stat_info.st_uid
    gid = stat_info.st_gid

    print(f"   Permissions: {mode}")
    print(f"   Owner UID: {uid}")
    print(f"   Group GID: {gid}")

    # Check current user. `pwd` is imported inside the try so that a missing
    # module is reported by the handler below rather than aborting the cell.
    import pwd
    current_uid = os.getuid()
    current_gid = os.getgid()

    try:
        user_info = pwd.getpwuid(current_uid)
        username = user_info.pw_name
    except KeyError:
        # getpwuid raises KeyError when the UID has no passwd entry
        username = "unknown"

    print(f"   Current user: {username} (UID: {current_uid}, GID: {current_gid})")

    # Check what the current process is allowed to do with the directory
    readable = os.access(data_dir, os.R_OK)
    writable = os.access(data_dir, os.W_OK)
    executable = os.access(data_dir, os.X_OK)

    print(f"   Directory access: Read={readable}, Write={writable}, Execute={executable}")

except Exception as e:
    print(f"   Error getting stat info: {e}")

# Try to see what's inside with ls -la
_print_listing(data_dir, "📂 Directory listing", "Could not run ls command")

# Check parent directory permissions too
parent_dir = os.path.dirname(data_dir.rstrip('/'))
if parent_dir and parent_dir != data_dir:
    _print_listing(
        parent_dir, "📁 Parent directory listing", "Could not list parent directory"
    )

In [None]:
# Persist the DataFrame under /data in Parquet format
output_path = "/data/spark_example_data"

print(f"💾 Writing data to: {output_path}")

# Snappy-compressed Parquet, written in "overwrite" mode
parquet_writer = df.write.mode("overwrite").option("compression", "snappy")
parquet_writer.parquet(output_path)

print("✅ Data written successfully!")

# Inspect what the write produced on the driver-visible filesystem
if os.path.exists(output_path):
    files = os.listdir(output_path)
    print(f"📂 Files created: {files}")

    # Report the size of every regular file in the output directory
    for name in files:
        full_path = os.path.join(output_path, name)
        if not os.path.isfile(full_path):
            continue
        print(f"   📄 {name}: {os.path.getsize(full_path):,} bytes")

In [None]:
# Load the Parquet output back into a new DataFrame
print(f"📖 Reading data back from: {output_path}")

df_read = spark.read.parquet(output_path)

# Counting forces Spark to actually read the files
rows_read = df_read.count()
print(f"✅ Successfully read {rows_read} rows")

print("\n🔍 Schema of read data:")
df_read.printSchema()

print("\n📋 Sample of read data:")
df_read.show(10)

In [None]:
# Perform some basic transformations to test Spark processing
print("🔄 Performing data transformations...")

# Import the functions module under an alias. Importing round/max/min by name
# would replace the Python builtins of the same name in the notebook namespace
# for every cell that runs afterwards.
from pyspark.sql import functions as F

# Add computed columns: the value rounded to 2 decimals, and a low/medium/high
# bucket based on the thresholds 30 and 70
df_transformed = df_read.withColumn(
    "value_rounded", F.round(F.col("value"), 2)
).withColumn(
    "category",
    F.when(F.col("value") < 30, "low")
    .when(F.col("value") < 70, "medium")
    .otherwise("high"),
)

print("✅ Added computed columns")
df_transformed.show(10)

# Perform aggregations: row count and avg/min/max of value per category
print("\n📊 Computing aggregations...")
stats = df_transformed.groupBy("category").agg(
    F.count("*").alias("count"),
    F.avg("value").alias("avg_value"),
    F.min("value").alias("min_value"),
    F.max("value").alias("max_value"),
)

print("📈 Statistics by category:")
stats.show()

In [None]:
# Test multiple write formats to /data
print("📝 Testing different file formats...")

# Take a smaller sample for format testing. Ordering by id first makes the
# sample deterministic: limit() on an unordered DataFrame may return a
# different set of rows each time the plan is evaluated, and this DataFrame is
# evaluated once per write below.
sample_df = df_transformed.orderBy("id").limit(100)

formats_to_test = {
    "csv": "/data/spark_example.csv",
    "json": "/data/spark_example.json",
    "parquet": "/data/spark_example.parquet"
}

# One writer per format. CSV and JSON are coalesced to a single partition;
# CSV additionally gets a header row.
format_writers = {
    "csv": lambda frame, target: (
        frame.coalesce(1).write.mode("overwrite").option("header", "true").csv(target)
    ),
    "json": lambda frame, target: (
        frame.coalesce(1).write.mode("overwrite").json(target)
    ),
    "parquet": lambda frame, target: (
        frame.write.mode("overwrite").parquet(target)
    ),
}

for format_name, path in formats_to_test.items():
    try:
        writer = format_writers.get(format_name)
        if writer is None:
            # Without this check an unrecognised format would write nothing
            # and still be reported as a success.
            raise ValueError(f"no writer defined for format {format_name!r}")
        writer(sample_df, path)

        print(f"✅ {format_name.upper()} written to {path}")

        # Check file size: Spark writes a directory of part files, so add up
        # the regular files directly inside it
        if os.path.isdir(path):
            total_size = sum(
                os.path.getsize(os.path.join(path, f))
                for f in os.listdir(path)
                if os.path.isfile(os.path.join(path, f))
            )
            print(f"   📊 Total size: {total_size:,} bytes")

    except Exception as e:
        # Best effort: report the failure and continue with the next format
        print(f"❌ Failed to write {format_name}: {e}")

In [None]:
# Summarise the contents of /data and describe the running Spark application
print("\n🧹 Cleaning up and summary...")

# Top level of /data: number of files for directories, byte size for files
print("\n📂 Final /data directory contents:")
for item in os.listdir("/data"):
    item_path = os.path.join("/data", item)
    if not os.path.isdir(item_path):
        size = os.path.getsize(item_path)
        print(f"   📄 {item} ({size:,} bytes)")
        continue
    file_count = sum(
        1
        for child in os.listdir(item_path)
        if os.path.isfile(os.path.join(item_path, child))
    )
    print(f"   📁 {item}/ ({file_count} files)")

# Application details as reported by the SparkContext
print("\n🎯 Spark Application Summary:")
summary_fields = (
    ("Application ID", "applicationId"),
    ("Application Name", "appName"),
    ("Master", "master"),
    ("Spark UI", "uiWebUrl"),
)
for label, attribute in summary_fields:
    print(f"   - {label}: {getattr(spark.sparkContext, attribute)}")
# Note: Executor count not available in this Spark version

In [None]:
# Select the `iceberg` catalog, then create the `teehr` namespace if missing
spark.sql("USE iceberg;")
namespace_ddl = "CREATE NAMESPACE IF NOT EXISTS teehr;"
spark.sql(namespace_ddl)

In [None]:
# Recreate the sample table from scratch and read it back through SQL
iceberg_table = "iceberg.teehr.sample"
spark.sql(f"DROP TABLE IF EXISTS {iceberg_table}")
sample_df.writeTo(iceberg_table).create()
table_rows = spark.sql(f"SELECT * FROM {iceberg_table}")
table_rows.show()

In [None]:
# Shut the Spark session down, then recap what the notebook exercised
print("🛑 Stopping Spark session...")
spark.stop()
print("✅ Spark session stopped successfully!")

print("\n🎉 Example completed successfully!")
print("\n📝 What we tested:")
tested_items = (
    "Spark session creation with Kubernetes executors",
    "Data creation and basic transformations",
    "Writing data to shared /data volume",
    "Reading data back from /data volume",
    "Multiple file formats (CSV, JSON, Parquet)",
    "Aggregations and distributed processing",
    "Persistent storage verification",
)
for tested_item in tested_items:
    print(f"   ✅ {tested_item}")