In [1]:
def fp(df, filter_value, min_support):
    """Mine frequent itemsets from comma-joined transactions using FP-Growth.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column selected by ``filter_value``. Each cell of that
        column is one transaction: a string of item names joined by ``','``.
    filter_value : int
        Selects the item column: ``1`` -> ``'Category Name'``,
        ``2`` -> ``'SubCategory Name'``.
    min_support : float
        Forwarded unchanged to ``fpgrowth`` as ``min_support``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``mlxtend.frequent_patterns.fpgrowth`` returns when called
        with ``use_colnames=True``.

    Raises
    ------
    ValueError
        If ``filter_value`` is neither 1 nor 2.
    """
    # Validate first so an unsupported value fails with a clear message
    # instead of an UnboundLocalError further down.
    columns_by_filter = {1: 'Category Name', 2: 'SubCategory Name'}
    if filter_value not in columns_by_filter:
        raise ValueError(
            f"filter_value must be 1 (Category Name) or 2 (SubCategory Name), "
            f"got {filter_value!r}"
        )
    item_column = columns_by_filter[filter_value]

    import pandas as pd
    from mlxtend.preprocessing import TransactionEncoder
    from mlxtend.frequent_patterns import fpgrowth

    # Turn every comma-joined string into a list of item names (list of lists)
    data = [cell.split(',') for cell in df[item_column].tolist()]

    # One-hot encode the transactions: one boolean column per distinct item
    te = TransactionEncoder()
    transactions = te.fit(data).transform(data)
    transactions = pd.DataFrame(transactions, columns=te.columns_)

    # Calculate frequent itemsets using FP-Growth Algorithm
    frequent_itemsets = fpgrowth(transactions, min_support=min_support, use_colnames=True)

    return frequent_itemsets

In [2]:
import configparser

# Load the run parameters from config.ini (resolved against the current working directory).
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed; an empty list therefore means nothing was loaded.
# Fail here with a clear message rather than with a KeyError on the first lookup.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini could not be read from the current working directory")

# The values are kept as the raw strings configparser returns.
filter_value = config['filter']['filter_value']
min_support = config['FPGAlgorithm']['min_support']
FPG_data_file = config['FPGAlgorithm']['FPG_data_file']

print("filter_value: ", filter_value)
print("min_support: ", min_support)
print("FPG_data_file: ", FPG_data_file)


filter_value:  1
min_support:  0.005
FPG_data_file:  C:\Users\Trieu Pham\OneDrive - BTM Global Consulting\Projects\data csv\outputFile\data.csv


In [3]:
# Read data from csv file
import pandas as pd

try:
    df = pd.read_csv(FPG_data_file)
except FileNotFoundError as err:
    # Re-raise with the offending path instead of calling exit(): exit() ends
    # the whole session and does not say which file was missing.
    raise FileNotFoundError(f"File not found: {FPG_data_file}") from err

# Drop every row that has a missing value in any column.
# Assigning the result (rather than inplace=True) keeps the cell's data flow explicit.
df = df.dropna()

In [4]:
# Display the DataFrame; the notebook renders the cell's last expression as its output
df


Unnamed: 0,StoreID,Date,InvoiceID,ItemName,Barcode,Category Name,SubCategory Name
0,VN0001,2023-01-01,VN0001010101230001,DASANI Nước khoáng 510ml x1 Chai,8935049500544,Bottled Water,Mineral water
1,VN0001,2023-01-01,VN0001010101230002,SATORI Nước tinh khiết 500ml x 1 Chai,8938512632025,Bottled Water,Purified water
2,VN0001,2023-01-01,VN0001010101230003,Celano Kem Bánh Cá Trân Châu Dừa Tắc 70g * 1 cây,8936011773416,Packaged Ice Cream/Novelties,Other Ice Cream
3,VN0001,2023-01-01,VN0001010101230003,LAYS WAVY Khoai tây bò Texas 63g x gói,8936079121761,Potato Chips,Potato Snacks
4,VN0001,2023-01-01,VN0001010101230003,MILO Kem que Magma 55g x1 Cây,8850453017528,Packaged Ice Cream/Novelties,Stick
...,...,...,...,...,...,...,...
21308937,VN0243,2023-04-30,VN0243023004230048,THTM Không đường 180ml x1 Hộp,8935217400058,Whole Milk,UHT Milk
21308939,VN0243,2023-04-30,VN0243023004230049,Cơm nắm kim chi phô mai,2270102000361,Rice Offsite,Onigiri
21308940,VN0243,2023-04-30,VN0243023004230049,FUZE TEA+ Trà Chanh với Sả 450ml x 1 Chai,8935049501190,Tea,Sugar Tea
21308941,VN0243,2023-04-30,VN0243023004230049,FUZETEA+ Trà đào hạt chia 450ml x 1 Chai,8935049500698,Tea,Sugar Tea


In [5]:
# Distinct store identifiers, in order of first appearance
stores = pd.unique(df['StoreID'])

# Number of distinct stores
len(stores)


219

In [8]:
# Build one comma-joined basket string per invoice.
# Prefer 'Category Name' and fall back to 'SubCategory Name' only when the
# category column is absent. An explicit check replaces the former bare
# `except:`, which would also have swallowed unrelated failures (for example
# running out of memory or an interrupt) and silently switched columns.
# NOTE(review): `fp` picks its column from filter_value; the column chosen here
# should agree with it - confirm for filter_value == 2.
item_column = 'Category Name' if 'Category Name' in df.columns else 'SubCategory Name'

df_filter = df[['InvoiceID', item_column]]
# Keep only invoices that have more than one line item
df_filter = df_filter[df_filter.groupby('InvoiceID')[item_column].transform('size') > 1]
# Join each invoice's items into a single 'a,b,c' string
dataset_str = df_filter.groupby('InvoiceID').agg({
    item_column: lambda x: ','.join(str(i) for i in x)
}).reset_index()

In [9]:
# Display the per-invoice basket strings (one row per InvoiceID)
dataset_str

Unnamed: 0,InvoiceID,Category Name
0,VN0001010101230001,"Bottled Water,Bottled Water"
1,VN0001010101230002,"Bottled Water,Bottled Water"
2,VN0001010101230003,"Packaged Ice Cream/Novelties,Potato Chips,Pack..."
3,VN0001010101230004,"Liquor,Cold Beverages,Add-on item Cat,Liquor,C..."
4,VN0001010101230005,"Sports Drinks,Sports Drinks,Sports Drinks,Spor..."
...,...,...
5270664,VN9996023101230486,"Sports Drinks,Sports Drinks,Energy Drinks"
5270665,VN9996023101230488,"Rice Offsite,Cold Beverages,Energy Drinks"
5270666,VN9996023101230489,"Potato Chips,Cold Beverages,Liquor"
5270667,VN9996023101230492,"Cigarettes Category,Cold Beverages"


In [11]:
# The store ID is the first six characters of the invoice ID
dataset_str['StoreID'] = dataset_str['InvoiceID'].str[:6]
# The two characters at positions 10-11 of the invoice ID are the month; store it as an integer
dataset_str['Month'] = dataset_str['InvoiceID'].str[10:12].map(int)

In [13]:
# Keep only the baskets of store 'VN0001' whose Month value is 1
test_df = dataset_str[(dataset_str['StoreID'] == 'VN0001') & (dataset_str['Month'] == 1)]
test_df

Unnamed: 0,InvoiceID,Category Name,StoreID,Month
0,VN0001010101230001,"Bottled Water,Bottled Water",VN0001,1
1,VN0001010101230002,"Bottled Water,Bottled Water",VN0001,1
2,VN0001010101230003,"Packaged Ice Cream/Novelties,Potato Chips,Pack...",VN0001,1
3,VN0001010101230004,"Liquor,Cold Beverages,Add-on item Cat,Liquor,C...",VN0001,1
4,VN0001010101230005,"Sports Drinks,Sports Drinks,Sports Drinks,Spor...",VN0001,1
...,...,...,...,...
40123,VN0001023101230414,"Energy Drinks,Energy Drinks",VN0001,1
40124,VN0001023101230416,"Bottled Water,Bottled Water",VN0001,1
40125,VN0001023101230418,"Packaged Ice Cream/Novelties,Packaged Ice Crea...",VN0001,1
40126,VN0001023101230419,"Carbonated Drinks - CSD,Bottled Water,Carbonat...",VN0001,1


In [15]:
# Keep only the invoice ID and the basket string; the helper columns are no longer needed
test_df = test_df.loc[:, ['InvoiceID', 'Category Name']]
test_df

Unnamed: 0,InvoiceID,Category Name
0,VN0001010101230001,"Bottled Water,Bottled Water"
1,VN0001010101230002,"Bottled Water,Bottled Water"
2,VN0001010101230003,"Packaged Ice Cream/Novelties,Potato Chips,Pack..."
3,VN0001010101230004,"Liquor,Cold Beverages,Add-on item Cat,Liquor,C..."
4,VN0001010101230005,"Sports Drinks,Sports Drinks,Sports Drinks,Spor..."
...,...,...
40123,VN0001023101230414,"Energy Drinks,Energy Drinks"
40124,VN0001023101230416,"Bottled Water,Bottled Water"
40125,VN0001023101230418,"Packaged Ice Cream/Novelties,Packaged Ice Crea..."
40126,VN0001023101230419,"Carbonated Drinks - CSD,Bottled Water,Carbonat..."


In [18]:
# Run FP-Growth on the selected baskets; the config values are strings, so cast them first
FPGApriori = fp(
    df=test_df,
    filter_value=int(filter_value),
    min_support=float(min_support),
)

In [19]:
# Display the frequent itemsets returned by fp (columns: support, itemsets)
FPGApriori

Unnamed: 0,support,itemsets
0,0.163037,(Bottled Water)
1,0.072138,(Whole Milk)
2,0.051480,(Potato Chips)
3,0.045338,(Packaged Ice Cream/Novelties)
4,0.008264,(Grooming Aids)
...,...,...
191,0.005807,"(Import Beer, Domestic Beer)"
192,0.006588,"(Jelly, Chocolate)"
193,0.009604,"(Hot pot, Instant Noodles)"
194,0.005918,"(Hot pot, Other Cuisine)"
