In [1]:
from pyspark.sql import SparkSession

# Hive-enabled session used by every query cell in this notebook.
# getOrCreate() hands back an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("Hive Query Example")
    .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# Best-selling products: total units sold per product, highest first.
#
# - The inner join keeps only fact rows whose product_id has a match in
#   products_dimension.
# - product_id is a secondary sort key so that products with equal totals
#   come back in a stable order; without it the order of ties is arbitrary and
#   the rows printed by show() (20 by default) could differ between runs.

most_selling_products_query = """
SELECT t.product_id, p.product_name, SUM(t.units) AS total_units_sold
FROM BigData_DWH.Transactions_Fact_table t
JOIN BigData_DWH.products_dimension p
ON t.product_id = p.product_id
GROUP BY t.product_id, p.product_name
ORDER BY total_units_sold DESC, product_id
"""

most_selling_products = spark.sql(most_selling_products_query)

print("Most Selling Products:")
most_selling_products.show()

Most Selling Products:
+----------+-----------------+----------------+
|product_id|     product_name|total_units_sold|
+----------+-----------------+----------------+
|        22|     Coffee Maker|          251658|
|        25|  Washing Machine|          209196|
|        16|            Skirt|          207393|
|        29|Hair Straightener|          190096|
|         7|            Dress|          187128|
|        11|               TV|          183752|
|        19|          Sandals|          182744|
|         9|            Boots|          181675|
|        24|          Blender|          179452|
|        17|           Blouse|          178602|
|        27|             Iron|          173329|
|         6|            Jeans|          172216|
|         5|          T-Shirt|          166782|
|        23|          Toaster|          166665|
|        13|          Printer|          164248|
|        28|       Hair Dryer|          162360|
|         1|           Laptop|          156927|
|         4|     

In [3]:
# Most redeemed offers: number of orders_dimension rows per offer, highest first.
#
# - Rows whose offer is NULL are excluded from the count.
# - offer is a secondary sort key so that offers with equal counts are listed
#   in a stable order on every run.
most_redeemed_offers_query = """
SELECT offer, COUNT(*) AS total_redemptions
FROM BigData_DWH.orders_dimension
WHERE offer IS NOT NULL
GROUP BY offer
ORDER BY total_redemptions DESC, offer
"""

most_redeemed_offers = spark.sql(most_redeemed_offers_query)

print("Most Redeemed Offers from Customers:")
most_redeemed_offers.show()


Most Redeemed Offers from Customers:
+-------+-----------------+
|  offer|total_redemptions|
+-------+-----------------+
|offer_4|              494|
|offer_3|              459|
|offer_2|              444|
|offer_5|              434|
|offer_1|              427|
+-------+-----------------+



In [4]:
# Most redeemed offers per product: joined-row count for every
# (product, offer) pair, highest first.
#
# - orders_dimension is joined to the fact table on transaction_id and then to
#   products_dimension on product_id; rows with a NULL offer are excluded.
# - product_id and offer are secondary sort keys. The saved run contains pairs
#   with identical counts, and without a tie-breaker their order (and which
#   rows land in the 20 printed by show()) is arbitrary.
# - NOTE(review): COUNT(*) counts joined rows, so it grows with every fact row
#   that shares an order's transaction_id. In the saved run a single
#   product/offer pair reaches 7416 while the per-offer count taken directly
#   from orders_dimension peaks at 494. Confirm whether transaction_id is
#   unique per order; if it is not, counting distinct orders may be intended.
most_redeemed_offers_per_product_query = """
SELECT p.product_id, p.product_name, o.offer, COUNT(*) AS total_redemptions
FROM BigData_DWH.orders_dimension o
JOIN BigData_DWH.Transactions_Fact_table t ON o.transaction_id = t.transaction_id
JOIN BigData_DWH.products_dimension p ON t.product_id = p.product_id
WHERE o.offer IS NOT NULL
GROUP BY p.product_id, p.product_name, o.offer
ORDER BY total_redemptions DESC, product_id, offer
"""

most_redeemed_offers_per_product = spark.sql(most_redeemed_offers_per_product_query)

print("Most Redeemed Offers per Product:")
most_redeemed_offers_per_product.show()

Most Redeemed Offers per Product:
+----------+-----------------+-------+-----------------+
|product_id|     product_name|  offer|total_redemptions|
+----------+-----------------+-------+-----------------+
|        11|               TV|offer_4|             7416|
|        24|          Blender|offer_5|             6293|
|        19|          Sandals|offer_4|             6148|
|        29|Hair Straightener|offer_4|             5886|
|         5|          T-Shirt|offer_1|             5852|
|        22|     Coffee Maker|offer_3|             5658|
|        23|          Toaster|offer_5|             5125|
|        25|  Washing Machine|offer_2|             4914|
|         6|            Jeans|offer_1|             4738|
|        25|  Washing Machine|offer_3|             4680|
|        22|     Coffee Maker|offer_1|             4674|
|        22|     Coffee Maker|offer_4|             4674|
|         2|       Smartphone|offer_4|             4650|
|        16|            Skirt|offer_4|             459

In [5]:
# Cities with the lowest online sales: the ten smallest per-city sums of
# Total_price_after_discount over orders flagged is_online = 'yes'.
#
# - city is derived from shipping_address: take the second '/'-separated
#   segment, then the first space-separated token of that segment.
# - city is a secondary sort key so that cities with equal totals are ordered
#   stably; with LIMIT 10 an arbitrary tie order could otherwise change which
#   rows are returned, not just how they are sorted.
# - NOTE(review): the first-token rule would cut a multi-word city name down
#   to its first word. Confirm the shipping_address format.
# - NOTE(review): the sum runs over joined rows, so repeated transaction_id
#   values on either side of the join would inflate the totals. Verify the
#   key's uniqueness.
online_sales_summary_query = """
SELECT city, SUM(Total_price_after_discount) AS total_online_sales
FROM (
    SELECT split(split(orders_dimension.shipping_address, '/')[1], ' ')[0] AS city, 
           t.Total_price_after_discount
    FROM BigData_DWH.orders_dimension
    JOIN BigData_DWH.Transactions_Fact_table t ON orders_dimension.transaction_id = t.transaction_id
    WHERE orders_dimension.is_online = 'yes'
) combined_df
GROUP BY city
ORDER BY total_online_sales ASC, city
LIMIT 10
"""

online_sales_summary_result = spark.sql(online_sales_summary_query)

print("Cities with the lowest online sales:")
online_sales_summary_result.show()

Cities with the lowest online sales:
+-----------+------------------+
|       city|total_online_sales|
+-----------+------------------+
|Smartsville|             19.99|
| Northfield|            19.992|
|   Franklin|            59.985|
|    Reading|            67.473|
| Marshfield| 67.47749999999999|
|   Falmouth| 75.96199999999999|
|   Redlands|            95.968|
|  Castleton| 99.94999999999999|
|    Fortuna|           101.966|
|     Dublin|           107.964|
+-----------+------------------+



In [6]:
# Stop the Spark session now that every query in the notebook has been run.
spark.stop()