# Getting Started with Spark
This notebook demonstrates basic Spark operations on the data platform.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg

# Create the Spark session used by every cell below.
# getOrCreate() hands back an already-active session when one exists, so
# re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("GettingStarted")
    .master("spark://spark-master:7077")  # master URL this notebook connects to
    .getOrCreate()
)

print(f"✅ Spark {spark.version} session created successfully!")

In [None]:
# Build a small in-memory employee dataset to experiment with.
# Each tuple is one row; its fields line up positionally with `columns`.
data = [
    (1, "Alice", 25, "Engineering"),
    (2, "Bob", 30, "Sales"),
    (3, "Charlie", 35, "Engineering"),
    (4, "Diana", 28, "Marketing"),
    (5, "Eve", 32, "Engineering"),
]

columns = ["id", "name", "age", "department"]

# Only column names are supplied, so Spark infers the column types from the data.
df = spark.createDataFrame(data, schema=columns)

print("📊 Sample DataFrame:")
df.show()

In [None]:
# Aggregate per department: headcount and mean age, largest departments first.
print("📈 Department Statistics:")
dept_stats = (
    df.groupBy("department")
    .agg(
        count("*").alias("count"),
        avg("age").alias("avg_age"),
    )
    # Departments with the same headcount are ordered by name so the table
    # comes out in the same order on every run.
    .orderBy(col("count").desc(), col("department").asc())
)

dept_stats.show()

In [None]:
# Keep only the rows whose department is exactly "Engineering".
print("🔍 Engineering employees:")
engineering = df.filter(col("department") == "Engineering")
engineering.show()

In [None]:
# Persist the sample DataFrame as Parquet under an s3a:// path (MinIO).
# `output_path` is reused by the read-back cell.
output_path = "s3a://test/sample_data"
print(f"💾 Writing data to MinIO: {output_path}")

# "overwrite" replaces whatever already exists at output_path, which keeps
# this cell safe to re-run.
(
    df.write
    .mode("overwrite")
    .parquet(output_path)
)

print("✅ Data written successfully!")

In [None]:
# Load the Parquet data back from `output_path` and display it to confirm
# the write/read round trip.
print("📖 Reading data from MinIO...")
df_read = spark.read.parquet(output_path)
df_read.show()
print("✅ Data read successfully!")

In [None]:
# Shut down the Spark session now that the walkthrough is finished.
# Re-run the session-creation cell before running any earlier cell again.
spark.stop()
print("👋 Spark session stopped")