You have two DataFrames:
df_sales: sale_id, product_id, sale_date, amount
df_products: product_id, product_name, category
Write PySpark code to:

1. Join the two DataFrames on product_id.
2. Filter for sales from the last 30 days (use current_date() as today's date).
3. Calculate the total sales amount per category.
4. Find the top 3 categories by total sales.
5. Include a column showing each category's percentage of total sales.

In [None]:
%python

from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, DateType, FloatType
from pyspark.sql.functions import col, date_sub, broadcast, current_date, dense_rank, round, sum as agg_sum, lit


# Obtain a session named for this exercise; getOrCreate() returns the already
# active session when there is one instead of building a second.
spark = (
    SparkSession.builder
    .appName("practice_questions")
    .getOrCreate()
)

# Explicit schema for the sales file. StructType.add() appends a nullable
# field, which matches StructField's default nullability.
sales_schema = (
    StructType()
    .add("sale_id", StringType())
    .add("product_id", StringType())
    .add("sale_date", DateType())
    .add("amount", FloatType())
)

# Explicit schema for the products file; all three columns are strings.
products_schema = (
    StructType()
    .add("product_id", StringType())
    .add("product_name", StringType())
    .add("category", StringType())
)

# Load both raw inputs with the explicit schemas declared above, so the column
# types are fixed by this script rather than inferred from the files.
# The sales CSV has a header row; both files are addressed with the "dbfs:/" scheme.
sales_df = (
    spark.read.format("csv")
    .schema(sales_schema)
    .option("header", "True")
    .load("dbfs:/raw/data/sales.csv")
)
# The reader takes the StructType (products_schema), not a DataFrame.
products_df = (
    spark.read.format("parquet")
    .schema(products_schema)
    .load("dbfs:/raw/data/products.parquet")
)

# Keep only sales from the last 30 days. date_sub(current_date(), 30) is the
# date 30 days before today, so the comparison must be ">=" against that cutoff.
# Filtering before the join keeps older rows out of the join input.
cutoff_date = date_sub(current_date(), 30)
sales_df = sales_df.filter(col("sale_date") >= cutoff_date)

# Inner join on product_id drops sales whose product has no match.
# The broadcast hint assumes products_df is small enough to ship to every
# executor -- TODO confirm against the real table size.
joined_df = sales_df.join(broadcast(products_df), "product_id", "inner")

# Column pruning: the aggregation that follows reads only category and amount.
joined_df = joined_df.select("category", "amount")

# Total sales amount per category (one output row per category).
joined_df = joined_df.groupBy("category").agg(agg_sum("amount").alias("total_sales"))

# Grand total across all categories, pulled to the driver as a single scalar.
# It is None when the filtered data is empty; the result frame is empty too in
# that case, so the percentage expression is never evaluated on a row.
total_sale_sum = joined_df.agg(agg_sum("total_sales")).collect()[0][0]

# Un-partitioned window: every category is ranked against all the others,
# highest total first.
window_spec = Window.orderBy(col("total_sales").desc())

# Add both derived columns to the same frame so the percentage survives the
# ranking step. The percentage is relative to ALL categories, not just the
# top three, because it is computed before the rank filter.
ranked_df = (
    joined_df
    .withColumn("percentage_sum", round(col("total_sales") * 100 / lit(total_sale_sum), 2))
    .withColumn("category_rank", dense_rank().over(window_spec))
)

# dense_rank gives tied categories the same rank, so a tie inside the top
# three can yield more than three rows.
top_df = ranked_df.filter(col("category_rank") <= 3).orderBy("category_rank")
top_df.show()