In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_set, col
from dotenv import load_dotenv
import os
from pyspark.ml.fpm import FPGrowth

# Load environment variables (e.g. AZURE_STORAGE_KEY) from a local .env file, if any.
load_dotenv()

# Fail fast when the storage key is absent. os.getenv() returns None for an
# unset variable, and handing that to .config() would likely only surface
# later as an unclear error when the first wasbs:// path is read.
azure_storage_key = os.getenv("AZURE_STORAGE_KEY")
if not azure_storage_key:
    raise RuntimeError(
        "AZURE_STORAGE_KEY is not set; define it in the environment or in .env"
    )

# Initialize the Spark session: pull in the hadoop-azure connector package and
# register the account key for the dataintensiveproject blob storage account.
spark = (
    SparkSession.builder
    .appName("MarketBasketAnalysis")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-azure:3.3.1")
    .config(
        "fs.azure.account.key.dataintensiveproject.blob.core.windows.net",
        azure_storage_key,
    )
    .getOrCreate()
)


Python-dotenv could not parse statement starting at line 2
Python-dotenv could not parse statement starting at line 3
Python-dotenv could not parse statement starting at line 4


In [2]:
# 📂 Step 2: Load the cleaned retail dataset (Parquet) from Azure Blob Storage.
# Adjust the container, account, or path below if the data lives elsewhere.
data_path = (
    "wasbs://retail-data@dataintensiveproject.blob.core.windows.net"
    "/clean/retail_cleaned"
)
retail_df = spark.read.parquet(data_path)


25/04/21 21:00:10 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-azure-file-system.properties,hadoop-metrics2.properties


In [6]:
# 🧹 Step 3: Keep only rows that have an invoice number and a positive quantity.
has_invoice = col("InvoiceNo").isNotNull()
positive_quantity = col("Quantity") > 0
transactions_df = retail_df.where(has_invoice & positive_quantity)

In [7]:
# 🧺 Step 4: Build one basket per invoice: the distinct set of item descriptions
# seen on that invoice, stored in an "items" array column.
basket_items = collect_set(col("Description")).alias("items")
basket_df = transactions_df.groupBy(col("InvoiceNo")).agg(basket_items)

In [10]:
# 🧠 Step 5: Fit an FP-Growth model on the invoice baskets.
MIN_SUPPORT = 0.01     # value passed as FPGrowth's minSupport
MIN_CONFIDENCE = 0.2   # value passed as FPGrowth's minConfidence
fp_growth = FPGrowth(
    itemsCol="items",
    minSupport=MIN_SUPPORT,
    minConfidence=MIN_CONFIDENCE,
)
model = fp_growth.fit(basket_df)

                                                                                

In [11]:
# 📊 Step 6: Display the frequent itemsets, highest frequency first.
print("🔍 Frequent Itemsets:")
frequent_itemsets = model.freqItemsets
itemsets_by_freq = frequent_itemsets.orderBy(col("freq").desc())
itemsets_by_freq.show(truncate=False)


🔍 Frequent Itemsets:


[Stage 12:>                                                         (0 + 8) / 8]

+------------------------------------+----+
|items                               |freq|
+------------------------------------+----+
|[WHITE HANGING HEART T-LIGHT HOLDER]|1971|
|[REGENCY CAKESTAND 3 TIER]          |1703|
|[JUMBO BAG RED RETROSPOT]           |1600|
|[PARTY BUNTING]                     |1379|
|[ASSORTED COLOUR BIRD ORNAMENT]     |1375|
|[LUNCH BAG RED RETROSPOT]           |1288|
|[SET OF 3 CAKE TINS PANTRY DESIGN ] |1146|
|[POSTAGE]                           |1099|
|[LUNCH BAG  BLACK SKULL.]           |1052|
|[PACK OF 72 RETROSPOT CAKE CASES]   |1029|
|[SPOTTY BUNTING]                    |1003|
|[LUNCH BAG SPACEBOY DESIGN ]        |988 |
|[PAPER CHAIN KIT 50'S CHRISTMAS ]   |980 |
|[NATURAL SLATE HEART CHALKBOARD ]   |967 |
|[LUNCH BAG CARS BLUE]               |966 |
|[HEART OF WICKER SMALL]             |961 |
|[LUNCH BAG PINK POLKADOT]           |931 |
|[LUNCH BAG SUKI DESIGN ]            |899 |
|[JAM MAKING SET WITH JARS]          |878 |
|[ALARM CLOCK BAKELIKE RED ]    

                                                                                

In [12]:
# 🔁 Step 7: Display the association rules, highest confidence first.
print("🔗 Association Rules:")
association_rules = model.associationRules
rules_by_confidence = association_rules.orderBy(col("confidence").desc())
rules_by_confidence.show(truncate=False)


🔗 Association Rules:


[Stage 16:>                                                         (0 + 8) / 8]

+--------------------------------------------------------------------------------------------+------------------------------------+------------------+------------------+--------------------+
|antecedent                                                                                  |consequent                          |confidence        |lift              |support             |
+--------------------------------------------------------------------------------------------+------------------------------------+------------------+------------------+--------------------+
|[POPPY'S PLAYHOUSE LIVINGROOM , POPPY'S PLAYHOUSE BEDROOM ]                                 |[POPPY'S PLAYHOUSE KITCHEN]         |0.9073170731707317|48.596531791907516|0.010036693287286855|
|[PINK REGENCY TEACUP AND SAUCER, ROSES REGENCY TEACUP AND SAUCER , REGENCY CAKESTAND 3 TIER]|[GREEN REGENCY TEACUP AND SAUCER]   |0.9018867924528302|24.18779455533408 |0.012896611266997626|
|[REGENCY TEA PLATE PINK]                    

                                                                                

In [13]:
# 🛒 Step 8 (optional): Apply the fitted model to every basket and preview the
# first 10 rows of the resulting "prediction" column.
print("💡 Predicted Items for each basket:")
predictions = model.transform(basket_df)
prediction_preview = predictions.select(col("InvoiceNo"), col("prediction"))
prediction_preview.show(n=10, truncate=False)


💡 Predicted Items for each basket:


[Stage 23:>                                                         (0 + 1) / 1]

+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|InvoiceNo|prediction                                                                                                                                                                                                                                                                                                                                                                        

                                                                                

In [16]:
# Export the mined association rules for Power BI. The original cell wrote
# basket_df (the raw per-invoice baskets) into a file named *_rules.csv.
rules_pdf = model.associationRules.toPandas()

# antecedent/consequent are array columns; flatten each to a delimited string
# so the CSV holds plain text instead of Python list/array reprs.
for items_col in ("antecedent", "consequent"):
    rules_pdf[items_col] = rules_pdf[items_col].map(" | ".join)

# to_csv does not create missing parent folders, so make sure it exists.
os.makedirs("PowerBI", exist_ok=True)
rules_pdf.to_csv("PowerBI/market_basket_rules.csv", index=False)

                                                                                