In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, month, year, avg

#  Initialize Spark Session

In [2]:
spark = SparkSession.builder.appName("RetailSalesCapstoneWeek3").getOrCreate()


# Load the large sales data

In [6]:
from google.colab import files
uploaded = files.upload()

Saving sales_data_large.csv to sales_data_large.csv


In [7]:
df = spark.read.csv("sales_data_large.csv", header=True, inferSchema=True)


#  Calculate revenue

In [12]:
df = df.withColumn("revenue", col("quantity") * col("price"))
df.show()

+-------+--------+----------+--------+-----+-----+-------+----------+-------+
|sale_id|store_id|product_id|quantity|price| cost|returns| sale_date|revenue|
+-------+--------+----------+--------+-----+-----+-------+----------+-------+
|      1|       5|         2|       3| 5000|50000|      3|2024-01-01|  15000|
|      2|       5|         5|       4|60000|70000|      1|2024-01-02| 240000|
|      3|       1|         5|       4|60000| 2000|      2|2024-01-03| 240000|
|      4|       2|         4|       5| 5000|70000|      0|2024-01-04|  25000|
|      5|       1|         3|       3| 5000| 4000|      3|2024-01-05|  15000|
|      6|       3|         1|       5|60000| 2000|      3|2024-01-06| 300000|
|      7|       2|         6|       3|60000|50000|      0|2024-01-07| 180000|
|      8|       5|         7|       1| 5000|10000|      4|2024-01-08|   5000|
|      9|       5|         7|       4| 5000| 2000|      3|2024-01-09|  20000|
|     10|       1|         2|       1| 5000| 2000|      4|2024-0


#  Filter underperforming products:
# Criteria: revenue < 20000 OR returns > 3

In [13]:
underperforming = df.filter((col("revenue") < 20000) | (col("returns") > 3))
df.show()

+-------+--------+----------+--------+-----+-----+-------+----------+-------+
|sale_id|store_id|product_id|quantity|price| cost|returns| sale_date|revenue|
+-------+--------+----------+--------+-----+-----+-------+----------+-------+
|      1|       5|         2|       3| 5000|50000|      3|2024-01-01|  15000|
|      2|       5|         5|       4|60000|70000|      1|2024-01-02| 240000|
|      3|       1|         5|       4|60000| 2000|      2|2024-01-03| 240000|
|      4|       2|         4|       5| 5000|70000|      0|2024-01-04|  25000|
|      5|       1|         3|       3| 5000| 4000|      3|2024-01-05|  15000|
|      6|       3|         1|       5|60000| 2000|      3|2024-01-06| 300000|
|      7|       2|         6|       3|60000|50000|      0|2024-01-07| 180000|
|      8|       5|         7|       1| 5000|10000|      4|2024-01-08|   5000|
|      9|       5|         7|       4| 5000| 2000|      3|2024-01-09|  20000|
|     10|       1|         2|       1| 5000| 2000|      4|2024-0

#  Group by store and calculate average monthly revenue

In [14]:
df_with_date = df.withColumn("month", month(col("sale_date"))).withColumn("year", year(col("sale_date")))
monthly_revenue = (
    df_with_date
    .groupBy("store_id", "year", "month")
    .agg(avg("revenue").alias("avg_monthly_revenue"))
    .orderBy("store_id", "year", "month")
)
df.show()

+-------+--------+----------+--------+-----+-----+-------+----------+-------+
|sale_id|store_id|product_id|quantity|price| cost|returns| sale_date|revenue|
+-------+--------+----------+--------+-----+-----+-------+----------+-------+
|      1|       5|         2|       3| 5000|50000|      3|2024-01-01|  15000|
|      2|       5|         5|       4|60000|70000|      1|2024-01-02| 240000|
|      3|       1|         5|       4|60000| 2000|      2|2024-01-03| 240000|
|      4|       2|         4|       5| 5000|70000|      0|2024-01-04|  25000|
|      5|       1|         3|       3| 5000| 4000|      3|2024-01-05|  15000|
|      6|       3|         1|       5|60000| 2000|      3|2024-01-06| 300000|
|      7|       2|         6|       3|60000|50000|      0|2024-01-07| 180000|
|      8|       5|         7|       1| 5000|10000|      4|2024-01-08|   5000|
|      9|       5|         7|       4| 5000| 2000|      3|2024-01-09|  20000|
|     10|       1|         2|       1| 5000| 2000|      4|2024-0

#  Save outputs

In [11]:
underperforming.write.csv("underperforming_products_output", header=True, mode="overwrite")
monthly_revenue.write.csv("monthly_revenue_summary_output", header=True, mode="overwrite")

In [15]:
spark.stop()