In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum

In [0]:
spark = SparkSession.builder.appName("ContinentRevenue").getOrCreate()

In [0]:
# Update the paths below
sales_path =r"/Volumes/sample/default/retail_sales/retail sales.csv"
continent_path =r"/Volumes/sample/default/retail_sales/Countries by continents.csv"

In [0]:
sales_df = spark.read.csv(sales_path, header=True, inferSchema=True)
continent_df = spark.read.csv(continent_path, header=True, inferSchema=True)

In [0]:
# Revenue calculation
sales_df = sales_df.withColumn("Revenue", col("Unit Price") * col("Units Sold"))

In [0]:
# Rename columns for join
sales_df = sales_df.withColumnRenamed("Country", "country")
continent_df = continent_df.withColumnRenamed("Country", "country").withColumnRenamed("Continent", "continent")

In [0]:
# Join and aggregate
joined_df = sales_df.join(continent_df, on="country", how="inner")
continent_revenue_df = joined_df.groupBy("continent").agg(_sum("Revenue").alias("Total_Revenue"))
continent_revenue_df.orderBy("continent").show(truncate=False)

+-------------+--------------------+
|continent    |Total_Revenue       |
+-------------+--------------------+
|Africa       |3.2503011620000005E7|
|Asia         |2.773613263E7       |
|Europe       |3.627485863000001E7 |
|North America|1.4813742040000001E7|
|Oceania      |6935976.66          |
+-------------+--------------------+

