In [None]:
# ---------------------------------------------------------
# Persisting DataFrame to Amazon S3 (Full Working Example)
# ---------------------------------------------------------

from pyspark.sql import SparkSession
import boto3

# ✅ 1. Initialize Spark with AWS and Hadoop configs
# All session settings live in one mapping so they can be read (and edited)
# in a single place before the session is built.
s3a_settings = {
    # Pull the Hadoop S3A connector and the AWS SDK bundle at startup.
    # NOTE(review): hadoop-aws should match the Hadoop version bundled with
    # your Spark distribution — confirm before changing either version.
    "spark.jars.packages": "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.661",
    # Route s3a:// URIs through the S3A FileSystem implementation.
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # Address buckets as <endpoint>/<bucket> rather than <bucket>.<endpoint>.
    "spark.hadoop.fs.s3a.path.style.access": "true",
    # Resolve credentials through the AWS SDK default chain
    # (environment, ~/.aws/credentials, ...).
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    # Optional: S3 endpoint to talk to (here the regional ap-south-1 endpoint).
    "spark.hadoop.fs.s3a.endpoint": "s3.ap-south-1.amazonaws.com",
}

builder = SparkSession.builder.appName("PersistExample")
for setting_name, setting_value in s3a_settings.items():
    builder = builder.config(setting_name, setting_value)

spark = builder.getOrCreate()

print("✅ SparkSession initialized successfully.")

# ✅ 2. Create sample DataFrame
# Records are declared as compact tuples and expanded into one dict per
# employee, keyed by column name, before being handed to Spark.
columns = ("name", "role", "region")
rows = [
    ("Aisha", "Manager", "APAC"),
    ("Rahul", "Developer", "EMEA"),
    ("Fatima", "Analyst", "APAC"),
]
data = [dict(zip(columns, row)) for row in rows]

# No explicit schema is passed, so Spark infers it from the dicts.
df = spark.createDataFrame(data)
df.show()

# ✅ 3. Define your S3 output path
# The credentials Spark resolves must be allowed to write to this bucket.
output_path = "s3a://training-bucket/etl/employees/"

print("\nPersisting DataFrame to S3...")

# ✅ 4. Write DataFrame to S3 in Parquet format
# mode="overwrite" replaces any data that already exists at output_path.
df.write.parquet(output_path, mode="overwrite")
print("✅ Data persisted to S3 successfully!")

# ✅ 5. Simulate and print what happens internally
# Purely explanatory output: nothing below touches Spark or S3.
write_steps = (
    "Spark divides the DataFrame into partitions.",
    "Each executor writes its partition as a .parquet file.",
    "Files are uploaded to your S3 bucket using the Hadoop S3A connector.",
    "Spark writes a _SUCCESS marker after all files are uploaded.",
)

print("\nWhat Happens Internally:")
numbered_steps = "\n".join(
    f"{number}. {step}" for number, step in enumerate(write_steps, start=1)
)
# Leading and trailing newlines keep a blank line before and after the list.
print(f"\n{numbered_steps}\n")

# ✅ 6. (Optional) Verify files in S3 using boto3
# These must point at the same location as output_path used for the write.
bucket = "training-bucket"
prefix = "etl/employees/"

# Tracks whether the listing actually confirmed objects under the prefix, so
# the final summary does not claim a verification that never happened.
verified = False

try:
    # Only the network calls are guarded; verification is best-effort and a
    # failure here must not abort the script after a successful write.
    s3 = boto3.client("s3")
    response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
except Exception as e:
    print("\n⚠️ Could not verify S3 contents (simulation mode):", e)
else:
    # "Contents" is absent from the response when nothing matches the prefix.
    # NOTE: a single list_objects_v2 call returns at most 1,000 keys.
    keys = [obj["Key"] for obj in response.get("Contents", [])]
    if keys:
        print("Files in S3:")
        for key in keys:
            print(" -", key)
        verified = True
    else:
        print(f"\n⚠️ No objects found under s3://{bucket}/{prefix}")

# ✅ 7. Summary message
# Reaching this point means the Spark write did not raise; only the optional
# boto3 check may have failed or come back empty.
if verified:
    print("\nAll Done! Data has been persisted and verified successfully.")
else:
    print("\nAll Done! Data has been persisted, but the S3 listing could not be verified.")
