# 📊 Samsung Sales ETL Pipeline
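This notebook runs an ETL pipeline over mock Samsung sales data using PySpark: it loads the sales, product-catalog and store-region CSV files, joins them, computes the revenue of each sale, aggregates units sold and revenue by model and country, and writes the summary to Parquet.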

In [None]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, round as _round

# Start the Spark session used by every later cell; getOrCreate() reuses a
# session that is already running instead of creating a second one.
spark = (
    SparkSession.builder
    .appName("SamsungSalesETL")
    .getOrCreate()
)


In [None]:

# Read the three input CSV files. Each is read with its first row as the
# header and with Spark inferring the column types from the data.
sales_df = spark.read.csv("sales_data.csv", header=True, inferSchema=True)
product_df = spark.read.csv("product_catalog.csv", header=True, inferSchema=True)
region_df = spark.read.csv("store_regions.csv", header=True, inferSchema=True)

# Quick visual check of what was loaded: only the first 5 sales rows, and the
# default-sized preview for the two lookup tables.
sales_df.show(5)
product_df.show()
region_df.show()


In [None]:

# Enrich each sale with its product attributes (matched on "model") and then
# with its store's region attributes (matched on "store_id").
# Left joins keep every sales row even when no product/region row matches.
sales_product_df = sales_df.join(product_df, "model", "left")
full_df = sales_product_df.join(region_df, "store_id", "left")


In [None]:

# Revenue of a single sale row: quantity x unit price, rounded to 2 decimals.
line_revenue = _round(col("quantity") * col("unit_price"), 2)
full_df = full_df.withColumn("total_revenue", line_revenue)

# Preview the new column next to the values it was derived from.
preview_columns = ["sale_id", "model", "quantity", "unit_price", "total_revenue"]
full_df.select(*preview_columns).show(5)


In [None]:

# Aggregates computed for every (model, country) pair:
#   - total_units_sold: sum of the quantities
#   - total_revenue:    sum of the per-sale revenue, rounded to 2 decimals
units_sold = _sum("quantity").alias("total_units_sold")
revenue = _round(_sum("total_revenue"), 2).alias("total_revenue")

summary_df = full_df.groupBy("model", "country").agg(units_sold, revenue)
summary_df.show()


In [None]:

# Persist the per-model/per-country summary as Parquet; "overwrite" replaces
# any output already present at this path.
summary_df.write.parquet("data/processed/sales_summary.parquet", mode="overwrite")
