# hello

In [1]:
# Scratch cell: prints a fixed greeting (presumably a kernel smoke test — safe to remove).
print("hello")

hello


In [2]:
import pandas as pd

# Folder that holds every CSV used below (trailing slash is required because
# file names are appended by plain string concatenation).
DATA_PATH = "e_commerce_HW3_dataset/"


def _load_table(filename):
    """Read one CSV file from the dataset folder into a DataFrame."""
    return pd.read_csv(DATA_PATH + filename)


# Lookup tables
aisles = _load_table("aisles.csv")
departments = _load_table("departments.csv")
products = _load_table("products.csv")

# Order headers and order lines
orders = _load_table("orders.csv")
order_products_prior = _load_table("order_products__prior.csv")
order_products_train = _load_table("order_products__train.csv")

# Sanity check: report (rows, columns) of every table that was read
print("Datasets loaded successfully:")
for _label, _frame in (
    ("aisles", aisles),
    ("departments", departments),
    ("products", products),
    ("orders", orders),
    ("order_products_prior", order_products_prior),
    ("order_products_train", order_products_train),
):
    print(f"{_label}: {_frame.shape}")


Datasets loaded successfully:
aisles: (134, 2)
departments: (21, 2)
products: (49688, 4)
orders: (3421083, 7)
order_products_prior: (32434489, 4)
order_products_train: (1384617, 4)


In [3]:
# Remove the days_since_prior_order column from the orders table.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
orders = orders.drop(columns=['days_since_prior_order'], errors='ignore')


In [4]:
# Count the order lines per order, then keep only orders that contain
# at least two products (single-item orders carry no co-purchase signal).
order_sizes = order_products_prior.groupby('order_id').size()
valid_orders = order_sizes.loc[order_sizes.ge(2)].index

order_products_prior = order_products_prior.loc[
    order_products_prior['order_id'].isin(valid_orders)
]


In [5]:
# Attach the human-readable product name to every prior order line.
# - Dropping any existing 'product_name' first makes the cell re-runnable;
#   without it a second execution would yield product_name_x / product_name_y.
# - validate='many_to_one' fails fast if `products` ever contains duplicate
#   product_id values, which would otherwise silently duplicate order lines.
order_products_prior = (
    order_products_prior
    .drop(columns=['product_name'], errors='ignore')
    .merge(
        products[['product_id', 'product_name']],
        on='product_id',
        how='left',
        validate='many_to_one',
    )
)


In [6]:
# Draw a reproducible random subset of 20,000 distinct orders ...
unique_order_ids = order_products_prior['order_id'].drop_duplicates()
sample_orders = unique_order_ids.sample(n=20000, random_state=42)

# ... and keep every order line that belongs to one of them.
in_sample = order_products_prior['order_id'].isin(sample_orders)
order_products_sample = order_products_prior.loc[in_sample]


In [7]:
# Inspect the sampled order lines; as the cell's last expression the frame is rendered by the notebook.
order_products_sample

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name
784,91,5825,1,1,Organic Celery
785,91,44303,2,1,Organic Shredded Unsweetened Coconut
786,91,48784,3,1,Lightly Breaded Fish Sticks
787,91,23293,4,1,Quinoa & Leeks with Chicken + Tarragon Organic...
788,91,42813,5,1,Smoked Maple Ham
...,...,...,...,...,...
32276174,3420917,49683,21,1,Cucumber Kirby
32276175,3420917,27458,22,1,Panko Japanese Style Bread Crumbs
32277304,3421032,1685,1,1,Clean Care 1-Ply Double Rolls Toilet Paper
32277305,3421032,3007,2,0,Natural Anticavity Silly Strawberry Fluoride T...


# Task II

In [8]:
# One basket per order: collect the product names of each order_id into a list
names_by_order = order_products_sample.groupby('order_id')['product_name']
baskets = names_by_order.agg(list)

print("Number of baskets:", len(baskets))
print("Example basket:", baskets.iloc[0])


Number of baskets: 20000
Example basket: ['Organic Celery', 'Organic Shredded Unsweetened Coconut', 'Lightly Breaded Fish Sticks', 'Quinoa & Leeks with Chicken + Tarragon Organic Baby Food', 'Smoked Maple Ham', 'Organic Avocados']


In [9]:
# Plain list of lists (one inner list of product names per order) for the encoder
transactions = list(baskets)


In [10]:
from mlxtend.preprocessing import TransactionEncoder

# One-hot encode the transactions: one row per basket, one boolean column per product
te = TransactionEncoder()
te_array = te.fit_transform(transactions)

basket_df = pd.DataFrame(te_array, columns=te.columns_)

basket_df.head()


Unnamed: 0,#2 Coffee Filters,#4 Natural Brown Coffee Filters,& Go! Hazelnut Spread + Pretzel Sticks,+Energy Black Cherry Vegetable & Fruit Juice,0 Calorie Fuji Apple Pear Water Beverage,0 Calorie Strawberry Dragonfruit Water Beverage,0% Fat Black Cherry Greek Yogurt y,0% Fat Blueberry Greek Yogurt,0% Fat Free Organic Milk,0% Fat Greek Yogurt Black Cherry on the Bottom,...,smartwater® Electrolyte Enhanced Water,vitaminwater® XXX Acai Blueberry Pomegranate,with Crispy Almonds Cereal,with Dawn Action Pacs Fresh Scent Dishwasher Detergent Pacs,with Olive Oil Mayonnaise,with Olive Oil Mayonnaise Dressing,with Xylitol Cinnamon 18 Sticks Sugar Free Gum,with Xylitol Island Berry Lime 18 Sticks Sugar Free Gum,with a Splash of Mango Coconut Water,with a Splash of Pineapple Coconut Water
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:
# Rows are transactions, columns are distinct products
n_transactions, n_products = basket_df.shape
print("Number of transactions:", n_transactions)
print("Number of unique products:", n_products)


Number of transactions: 20000
Number of unique products: 21948


# Task III

In [13]:
import pandas as pd
from mlxtend.frequent_patterns import apriori

# --- Keep the one-hot table boolean ---
# mlxtend's apriori expects a bool DataFrame: any non-bool dtype (including
# uint8) triggers its "DataFrames with non-bool types ..." DeprecationWarning
# and a slower validation path. uint8 is also the same width as bool, so the
# previous cast saved no memory. astype(bool) leaves the values unchanged and
# guarantees the dtype the library wants.
basket_bin = basket_df.astype(bool)

def run_apriori_and_summarize(data, min_support: float, max_len=None):
    """
    Run Apriori on a one-hot transaction table and summarize the result.

    Parameters
    ----------
    data : pandas.DataFrame
        One-hot encoded transactions (rows = baskets, columns = items);
        passed unchanged to ``mlxtend.frequent_patterns.apriori``.
    min_support : float
        Minimum support threshold forwarded to ``apriori``.
    max_len : int or None, default None
        Maximum itemset size forwarded to ``apriori``. ``None`` (the default,
        and the previous hard-coded behavior) places no limit; set e.g.
        ``max_len=3`` if runtime is too high.

    Returns
    -------
    frequent_itemsets : pandas.DataFrame
        The ``apriori`` result (item names as itemset members) with an extra
        ``itemset_size`` column holding the number of items in each itemset.
    summary : dict
        Keys ``min_support``, ``num_itemsets``, ``max_itemset_size`` and
        ``avg_itemset_size``; the two size statistics are 0 / 0.0 when no
        itemset reaches the threshold.
    """
    frequent_itemsets = apriori(
        data,
        min_support=min_support,
        use_colnames=True,
        max_len=max_len,
        low_memory=True,   # mlxtend's memory-saving (slower) search path
    )

    # Number of items in each itemset, kept as a column for later analysis
    frequent_itemsets["itemset_size"] = frequent_itemsets["itemsets"].map(len)

    # max()/mean() of an empty column would be NaN, so report zeros instead
    found_any = not frequent_itemsets.empty
    sizes = frequent_itemsets["itemset_size"]

    summary = {
        "min_support": min_support,
        "num_itemsets": int(frequent_itemsets.shape[0]),
        "max_itemset_size": int(sizes.max()) if found_any else 0,
        "avg_itemset_size": float(sizes.mean()) if found_any else 0.0,
    }
    return frequent_itemsets, summary

# --- Mine frequent itemsets at both candidate support thresholds ---
freq_001, summary_001 = run_apriori_and_summarize(basket_bin, min_support=0.01)
freq_005, summary_005 = run_apriori_and_summarize(basket_bin, min_support=0.05)

# --- Side-by-side summary, one row per threshold ---
comparison = pd.DataFrame([summary_001, summary_005])
print(comparison)

# --- (Optional) 15 highest-support itemsets for each threshold ---
for _heading, _itemsets in (
    ("\nTop itemsets (min_support=0.01):", freq_001),
    ("\nTop itemsets (min_support=0.05):", freq_005),
):
    print(_heading)
    display(_itemsets.sort_values("support", ascending=False).head(15))




   min_support  num_itemsets  max_itemset_size  avg_itemset_size
0         0.01           128                 2          1.140625
1         0.05             7                 1          1.000000

Top itemsets (min_support=0.01):


Unnamed: 0,support,itemsets,itemset_size
6,0.15665,(Banana),1
5,0.12735,(Bag of Organic Bananas),1
79,0.0855,(Organic Strawberries),1
40,0.07675,(Organic Baby Spinach),1
60,0.0685,(Organic Hass Avocado),1
36,0.0575,(Organic Avocado),1
31,0.0527,(Large Lemon),1
102,0.04695,(Strawberries),1
68,0.0456,(Organic Raspberries),1
33,0.04455,(Limes),1



Top itemsets (min_support=0.05):


Unnamed: 0,support,itemsets,itemset_size
1,0.15665,(Banana),1
0,0.12735,(Bag of Organic Bananas),1
6,0.0855,(Organic Strawberries),1
4,0.07675,(Organic Baby Spinach),1
5,0.0685,(Organic Hass Avocado),1
3,0.0575,(Organic Avocado),1
2,0.0527,(Large Lemon),1


## Task 3 – Conceptual Questions

### 1. What should be the final value of `min_support`?

Based on the experimental results, a minimum support value of **0.01** is the most appropriate choice for this dataset.

When `min_support = 0.01`, the Apriori algorithm identifies 128 frequent itemsets, 18 of which are two-item combinations (the largest itemset size found). These pairs capture meaningful co-occurrence patterns between products, which are essential for market basket analysis and recommender systems.

In contrast, using a higher support threshold such as `min_support = 0.05` results in only 7 frequent itemsets, all of them single items, and removes every multi-item pattern. This significantly reduces the analytical value of the results.

Therefore, **`min_support = 0.01`** provides a better balance between pattern richness and computational efficiency.

---

### 2. Why does increasing `min_support` reduce the number of itemsets?

Increasing the minimum support threshold imposes a stricter requirement on how frequently an itemset must appear in transactions to be considered frequent.

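Larger itemsets can never occur more often than the smaller itemsets they contain: every transaction that includes a full combination also includes each of its subsets, so the support of a combination is at most the support of its rarest item (the anti-monotone, or Apriori, property). As a result, when the support threshold increases, multi-item combinations are filtered out first. Only the most frequently occurring individual items remain.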

This leads to:
- Fewer frequent itemsets overall
- Smaller maximum itemset sizes
- Loss of potentially interesting co-purchase patterns

In summary, higher `min_support` values simplify the results but reduce the ability to discover meaningful associations between products.
