# In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg

# Smoke test for a local PySpark installation: start a session, build a small
# in-memory DataFrame, run a few basic DataFrame operations, then shut down.

# Create Spark Session
print("Creating Spark Session...")
spark = (
    SparkSession.builder
    .appName("PySpark Test")
    .master("local[*]")
    .getOrCreate()
)

# Everything that uses the session runs inside try/finally so the session is
# stopped even when one of the steps below raises.
try:
    # "\u2713" is the check mark glyph; it is written as an escape so the
    # source stays pure ASCII and cannot be corrupted by a wrong file codec.
    print("\u2713 Spark Session created successfully!")
    print(f"Spark Version: {spark.version}")
    print("-" * 50)

    # Create sample data: one tuple per row, in the order given by `columns`.
    data = [
        ("Alice", 25, "Engineer", 75000),
        ("Bob", 30, "Manager", 85000),
        ("Charlie", 35, "Engineer", 80000),
        ("Diana", 28, "Analyst", 70000),
        ("Eve", 32, "Manager", 90000),
    ]

    columns = ["Name", "Age", "Job", "Salary"]

    # Create DataFrame
    print("\nCreating DataFrame...")
    df = spark.createDataFrame(data, columns)

    print("\n1. Show all data:")
    df.show()

    print("\n2. Show schema:")
    df.printSchema()

    print("\n3. Filter: Engineers only")
    df.filter(col("Job") == "Engineer").show()

    print("\n4. Select specific columns:")
    df.select("Name", "Salary").show()

    print("\n5. Average salary by Job:")
    df.groupBy("Job").agg(avg("Salary").alias("Average_Salary")).show()

    print("\n6. Add new column (Salary in thousands):")
    df.withColumn("Salary_K", col("Salary") / 1000).show()
finally:
    # Stop Spark
    spark.stop()

# Reached only when every step above succeeded; "\U0001F389" is the
# party-popper emoji, escaped for the same reason as the check mark.
print("\n\u2713 PySpark test completed successfully! \U0001F389")