In [None]:
! pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812363 sha256=50391c96f18a0e0c59dd4488020648fa1d82dea1a47db52362f4217863eced49
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession

# Build (or fetch, if one is already active) the session used by every cell below.
# "Retail Store" is the application name attached to the session.
spark = (
    SparkSession.builder
    .appName("Retail Store")
    .getOrCreate()
)

In [None]:
#1. Calculate the Total Revenue per Category
# NOTE: importing `sum` from pyspark shadows Python's builtin `sum` for the rest of
# the notebook. Later cells call `col` and `sum` without re-importing them, so this
# import must stay.
from pyspark.sql.functions import col, sum

# Read the CSV with the first row as the header and let Spark infer column types.
csv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
df_retail = csv_reader.load("/content/sample_data/retail_data.csv")

# Per-transaction revenue is price * quantity; stored on df_retail because later
# cells aggregate the `total_revenue` column.
revenue_per_row = col("price") * col("quantity")
df_retail = df_retail.withColumn("total_revenue", revenue_per_row)

# Add up the per-transaction revenue within each category.
total_revenue_per_category = (
    df_retail
    .groupBy("category")
    .agg(sum("total_revenue").alias("total_revenue"))
)
total_revenue_per_category.show()

+-----------+------------------+
|   category|     total_revenue|
+-----------+------------------+
| Stationery|              20.0|
|  Groceries|13.399999999999999|
|Electronics|            1000.0|
|   Clothing|             155.0|
+-----------+------------------+



In [None]:
#2. Filter Transactions Where the Total Sales Amount is Greater Than $100
# The sales amount of a transaction is price * quantity.
sales_amount = col("price") * col("quantity")
df_sales_amount = df_retail.withColumn("total_sales_amount", sales_amount)

# Keep only transactions strictly above 100 (`where` is an alias of `filter`).
df_high_value_transactions = df_sales_amount.where(col("total_sales_amount") > 100)
df_high_value_transactions.show()

+--------------+------------+-----------+-----+--------+----------+-------------+------------------+
|transaction_id|product_name|   category|price|quantity|sales_date|total_revenue|total_sales_amount|
+--------------+------------+-----------+-----+--------+----------+-------------+------------------+
|             5|      Laptop|Electronics|800.0|       1|2023-09-03|        800.0|             800.0|
|             7|  Headphones|Electronics|100.0|       2|2023-09-04|        200.0|             200.0|
+--------------+------------+-----------+-----+--------+----------+-------------+------------------+



In [None]:
#3. Find the Most Sold Product
from pyspark.sql.functions import sum

# Total units sold for each product across all transactions.
quantity_per_product = df_retail.groupBy("product_name").agg(sum("quantity").alias("total_quantity"))

# Highest total first. The secondary sort on product_name makes the result
# deterministic when several products share the top quantity; without it,
# limit(1) would return an arbitrary one of the tied products.
most_sold_product = quantity_per_product.orderBy(
    col("total_quantity").desc(),
    col("product_name").asc(),
).limit(1)
most_sold_product.show()

+------------+--------------+
|product_name|total_quantity|
+------------+--------------+
|      Banana|            12|
+------------+--------------+



In [None]:
#4. Calculate the Average Price per Product Category
from pyspark.sql.functions import avg

# Mean of the `price` column over the rows of each category.
mean_price = avg(col("price")).alias("average_price")
average_price_per_category = df_retail.groupBy("category").agg(mean_price)
average_price_per_category.show()

+-----------+------------------+
|   category|     average_price|
+-----------+------------------+
| Stationery|               1.5|
|  Groceries|0.4666666666666666|
|Electronics|             450.0|
|   Clothing|              30.0|
+-----------+------------------+



In [None]:
#5. Find the Top 3 Highest Grossing Products
# `total_revenue` on df_retail is the per-transaction price * quantity column;
# summing it per product gives each product's gross revenue.
revenue_per_product = df_retail.groupBy("product_name").agg(sum("total_revenue").alias("total_revenue"))

# Highest revenue first. The secondary sort on product_name keeps the top 3
# deterministic when products tie on revenue at the cut-off; without it,
# limit(3) would keep an arbitrary subset of the tied products.
top_3_products = revenue_per_product.orderBy(
    col("total_revenue").desc(),
    col("product_name").asc(),
).limit(3)
top_3_products.show()

+------------+-------------+
|product_name|total_revenue|
+------------+-------------+
|      Laptop|        800.0|
|  Headphones|        200.0|
|       Pants|         75.0|
+------------+-------------+



In [None]:
#6. Calculate the Total Number of Items Sold per Day
# `sum` here is pyspark.sql.functions.sum (imported in an earlier cell), not the builtin.
items_sold = sum(col("quantity")).alias("total_items_sold")

# One output row per distinct sales_date.
total_items_per_day = df_retail.groupBy("sales_date").agg(items_sold)
total_items_per_day.show()

+----------+----------------+
|sales_date|total_items_sold|
+----------+----------------+
|2023-09-03|               4|
|2023-09-01|              12|
|2023-09-05|               9|
|2023-09-02|              17|
|2023-09-04|              12|
+----------+----------------+



In [None]:
#7. Identify the Product with the Lowest Price in Each Category
# NOTE: this import shadows Python's builtin `min` from here on.
from pyspark.sql.functions import min

# Lowest price observed in each category.
min_price_per_category = df_retail.groupBy("category").agg(min("price").alias("min_price"))

# Join the transactions back to the per-category minimum. The aliases are needed
# because both sides have a `category` column.
product_with_lowest_price = df_retail.alias("df1").join(
    min_price_per_category.alias("df2"),
    (col("df1.category") == col("df2.category")) & (col("df1.price") == col("df2.min_price"))
).select(
    col("df1.category"),
    col("df1.product_name"),
    col("df1.price")
).distinct()
# distinct(): the join yields one row per matching *transaction*, so a product sold
# several times at the minimum price would otherwise be listed repeatedly.
# Different products tied at the minimum price are all still reported.
product_with_lowest_price.show()

+-----------+------------+-----+
|   category|product_name|price|
+-----------+------------+-----+
|   Clothing|     T-shirt| 15.0|
|  Groceries|      Banana|  0.3|
|Electronics|  Headphones|100.0|
| Stationery|         Pen|  1.0|
+-----------+------------+-----+



In [None]:
#8. Calculate the Total Revenue for Each Product
# Sum the per-transaction `total_revenue` column over all rows of each product.
total_revenue_per_product = (
    df_retail
    .groupBy("product_name")
    .agg(sum("total_revenue").alias("total_revenue"))
)
total_revenue_per_product.show()

+------------+------------------+
|product_name|     total_revenue|
+------------+------------------+
|     T-shirt|              30.0|
|    Sneakers|              50.0|
|      Orange|               4.8|
|      Banana|3.5999999999999996|
|         Pen|              10.0|
|       Pants|              75.0|
|      Laptop|             800.0|
|    Notebook|              10.0|
|       Apple|               5.0|
|  Headphones|             200.0|
+------------+------------------+



In [None]:
#9. Find the Total Sales per Day for Each Category
# One output row per (sales_date, category) pair that occurs in the data.
daily_category_sales = sum("total_revenue").alias("total_sales")
total_sales_per_day_category = (
    df_retail
    .groupBy("sales_date", "category")
    .agg(daily_category_sales)
)
total_sales_per_day_category.show()

+----------+-----------+------------------+
|sales_date|   category|       total_sales|
+----------+-----------+------------------+
|2023-09-03|Electronics|             800.0|
|2023-09-01|  Groceries|               5.0|
|2023-09-01|   Clothing|              30.0|
|2023-09-02| Stationery|              10.0|
|2023-09-04| Stationery|              10.0|
|2023-09-02|  Groceries|3.5999999999999996|
|2023-09-05|  Groceries|               4.8|
|2023-09-05|   Clothing|              50.0|
|2023-09-03|   Clothing|              75.0|
|2023-09-04|Electronics|             200.0|
+----------+-----------+------------------+



In [None]:
#10. Create a New Column for Discounted Price
# Multiplying by 0.9 keeps 90% of the listed price, i.e. applies a 10% discount.
price_after_discount = col("price") * 0.9

# withColumn replaces a same-named column, so re-running this cell does not
# compound the discount.
df_retail = df_retail.withColumn("discounted_price", price_after_discount)
df_retail.show()

+--------------+------------+-----------+-----+--------+----------+------------------+----------------+
|transaction_id|product_name|   category|price|quantity|sales_date|     total_revenue|discounted_price|
+--------------+------------+-----------+-----+--------+----------+------------------+----------------+
|             1|       Apple|  Groceries|  0.5|      10|2023-09-01|               5.0|            0.45|
|             2|     T-shirt|   Clothing| 15.0|       2|2023-09-01|              30.0|            13.5|
|             3|    Notebook| Stationery|  2.0|       5|2023-09-02|              10.0|             1.8|
|             4|      Banana|  Groceries|  0.3|      12|2023-09-02|3.5999999999999996|            0.27|
|             5|      Laptop|Electronics|800.0|       1|2023-09-03|             800.0|           720.0|
|             6|       Pants|   Clothing| 25.0|       3|2023-09-03|              75.0|            22.5|
|             7|  Headphones|Electronics|100.0|       2|2023-09-