## Import the Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from mlxtend.preprocessing import TransactionEncoder
import json
import re

## Read and Display the Data

In [None]:
# Load the basket data from CSV. header=None means the first line is treated
# as data, so the columns get integer labels.
# store_data_transaction is a second name bound to the same DataFrame object.
store_data_transaction = store_data = pd.read_csv(
    "/content/sample_data/store_data.csv",
    header=None,
)

# Preview the first rows and report the (rows, columns) shape.
display(store_data.head())
print(store_data.shape)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
1,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,turkey,avocado,,,,,,,,,,,,,,,,,,
4,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


(7501, 20)


In [None]:
# Step 2: clean the data.
# Keep only rows holding at least one non-NaN cell, i.e. discard rows that are
# entirely NaN (noise rows without any product).
has_any_product = store_data.notna().any(axis=1)
store_data = store_data[has_any_product]

# Preview the result and report its shape.
display(store_data.head())
print(store_data.shape)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,shrimp,almonds,avocado,vegetables mix,green grapes,whole weat flour,yams,cottage cheese,energy drink,tomato juice,low fat yogurt,green tea,honey,salad,mineral water,salmon,antioxydant juice,frozen smoothie,spinach,olive oil
2,burgers,meatballs,eggs,,,,,,,,,,,,,,,,,
3,chutney,,,,,,,,,,,,,,,,,,,
4,turkey,avocado,,,,,,,,,,,,,,,,,,
5,mineral water,milk,energy bar,whole wheat rice,green tea,,,,,,,,,,,,,,,


(7501, 20)


In [None]:
# Normalize every basket in store_data: keep only ASCII letters and
# whitespace, collapse whitespace runs, lowercase, then drop empty and
# repeated items.

# Patterns are compiled once here instead of being looked up for every item.
non_letter = re.compile(r'[^a-zA-Z\s]')
whitespace_run = re.compile(r'\s+')

cleaned_transactions = []
for _, row in store_data.iterrows():
    normalized = []
    for item in row.dropna():
        # str() keeps the regex from raising TypeError on a non-string cell.
        text = non_letter.sub('', str(item))
        # Removing a symbol can leave a double space behind
        # ("herb & pepper" -> "herb  pepper"); collapse such runs so the
        # cleaned name carries no extra whitespace.
        text = whitespace_run.sub(' ', text).strip().lower()
        # Skip items that became empty after cleaning.
        if text:
            normalized.append(text)

    # De-duplicate within the basket while keeping first-seen order.
    # list(set(...)) ordered string items differently from one interpreter
    # run to the next (hash randomization), so results were not reproducible.
    cleaned_transactions.append(list(dict.fromkeys(normalized)))

cleaned_transactions_df = pd.DataFrame(cleaned_transactions)
display(cleaned_transactions_df.head())
print(cleaned_transactions_df.shape)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,avocado,olive oil,frozen smoothie,honey,green grapes,vegetables mix,antioxydant juice,cottage cheese,spinach,whole weat flour,yams,salmon,almonds,energy drink,tomato juice,low fat yogurt,mineral water,green tea,salad,shrimp
1,eggs,burgers,meatballs,,,,,,,,,,,,,,,,,
2,chutney,,,,,,,,,,,,,,,,,,,
3,avocado,turkey,,,,,,,,,,,,,,,,,,
4,milk,whole wheat rice,mineral water,green tea,energy bar,,,,,,,,,,,,,,,


(7501, 20)


In [None]:
# Step 3: wrap the cleaned baskets in a mapping that can be serialized to JSON.
transactions_json = {"transactions": cleaned_transactions}

# Step 4: write the mapping to disk as JSON with 4-space indentation.
with open("store_data.json", "w") as f:
    f.write(json.dumps(transactions_json, indent=4))

In [None]:
# Build one list of item names per row of store_data_transaction:
# drop NaN cells, cast to str, and trim surrounding whitespace.
stripped_rows = (
    row.dropna().astype(str).str.strip().tolist()
    for _, row in store_data_transaction.iterrows()
)
# Keep only the rows that contain at least one item.
transactions = [basket for basket in stripped_rows if basket]

# Count how many times each product appears across all baskets.
flat_items = []
for basket in transactions:
    flat_items.extend(basket)
item_counts = Counter(flat_items)

# Number of distinct products.
num_unique_products = len(item_counts)
print(f"Tổng số sản phẩm khác nhau: {num_unique_products}")

# Take the 50 most frequent products.
top_items = item_counts.most_common(50)

# Print the product names only; the counts are not shown.
print("\nTop 50 sản phẩm bán chạy nhất:")
for product, _ in top_items:
    print(product)

Tổng số sản phẩm khác nhau: 119

Top 50 sản phẩm bán chạy nhất:
mineral water
eggs
spaghetti
french fries
chocolate
green tea
milk
ground beef
frozen vegetables
pancakes
burgers
cake
cookies
escalope
low fat yogurt
shrimp
tomatoes
olive oil
frozen smoothie
turkey
chicken
whole wheat rice
grated cheese
cooking oil
soup
herb & pepper
honey
champagne
fresh bread
salmon
brownies
avocado
hot dogs
cottage cheese
tomato juice
butter
whole wheat pasta
red wine
yogurt cake
light mayo
energy bar
ham
energy drink
pepper
vegetables mix
cereals
muffins
oil
french wine
fresh tuna
