In [4]:
# !pip install nltk
# !pip install mlxtend
import pandas as pd
import numpy as np
import nltk
from mlxtend.frequent_patterns import apriori, association_rules , fpgrowth
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

## 3.1 : Bạch Thảo

In [5]:
df = pd.read_csv('./dataset/customer_transactions_dataset.csv',usecols=['TransactionID', 'Sequence'], sep='^([^,]+),',engine='python')
df

Unnamed: 0,TransactionID,Sequence
0,1,"Apple, Banana, Cereal, Donut"
1,2,"Apple, Cereal, Donut"
2,3,"Banana, Donut"
3,4,"Apple, Donut"
4,5,"Apple, Banana, Cereal"
5,6,"Cereal, Donut"
6,7,"Apple, Banana, Donut"
7,8,"Apple, Banana"
8,9,"Banana, Cereal"
9,10,"Apple, Cereal, Donut"


##### Pattern discovery using Apriori Algorithm

In [6]:
customer_dummy = df['Sequence'].str.get_dummies(sep=', ')
customer_trans = pd.concat([df, customer_dummy], axis=1)
customer_trans

Unnamed: 0,TransactionID,Sequence,Apple,Banana,Cereal,Donut
0,1,"Apple, Banana, Cereal, Donut",1,1,1,1
1,2,"Apple, Cereal, Donut",1,0,1,1
2,3,"Banana, Donut",0,1,0,1
3,4,"Apple, Donut",1,0,0,1
4,5,"Apple, Banana, Cereal",1,1,1,0
5,6,"Cereal, Donut",0,0,1,1
6,7,"Apple, Banana, Donut",1,1,0,1
7,8,"Apple, Banana",1,1,0,0
8,9,"Banana, Cereal",0,1,1,0
9,10,"Apple, Cereal, Donut",1,0,1,1


In [7]:
frequent_itemsets = apriori(customer_dummy, min_support=0.3, use_colnames=True, )
frequent_itemsets.sort_values(by='support',ascending=False)



Unnamed: 0,support,itemsets
0,0.7,(Apple)
3,0.7,(Donut)
1,0.6,(Banana)
2,0.6,(Cereal)
6,0.5,"(Apple, Donut)"
4,0.4,"(Apple, Banana)"
5,0.4,"(Cereal, Apple)"
9,0.4,"(Cereal, Donut)"
7,0.3,"(Cereal, Banana)"
8,0.3,"(Banana, Donut)"


In [8]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print(rules)

       antecedents      consequents  antecedent support  consequent support  \
0          (Apple)          (Donut)                 0.7                 0.7   
1          (Donut)          (Apple)                 0.7                 0.7   
2  (Cereal, Apple)          (Donut)                 0.4                 0.7   
3  (Cereal, Donut)          (Apple)                 0.4                 0.7   
4   (Apple, Donut)         (Cereal)                 0.5                 0.6   
5         (Cereal)   (Apple, Donut)                 0.6                 0.5   
6          (Apple)  (Cereal, Donut)                 0.7                 0.4   
7          (Donut)  (Cereal, Apple)                 0.7                 0.4   

   support  confidence      lift  leverage  conviction  zhangs_metric  
0      0.5    0.714286  1.020408      0.01        1.05       0.066667  
1      0.5    0.714286  1.020408      0.01        1.05       0.066667  
2      0.3    0.750000  1.071429      0.02        1.20       0.111111  


##### Sequential Patterns Using GSP

In [9]:
!pip install gsppy
from gsppy.gsp import GSP

sequences = df['Sequence'].apply(lambda x: x.split(', ')).tolist()
sequences

Collecting gsppy
  Downloading gsppy-1.1-py3-none-any.whl.metadata (3.1 kB)
Downloading gsppy-1.1-py3-none-any.whl (5.7 kB)
Installing collected packages: gsppy
Successfully installed gsppy-1.1


[['Apple', 'Banana', 'Cereal', 'Donut'],
 ['Apple', 'Cereal', 'Donut'],
 ['Banana', 'Donut'],
 ['Apple', 'Donut'],
 ['Apple', 'Banana', 'Cereal'],
 ['Cereal', 'Donut'],
 ['Apple', 'Banana', 'Donut'],
 ['Apple', 'Banana'],
 ['Banana', 'Cereal'],
 ['Apple', 'Cereal', 'Donut']]

In [10]:
result = GSP(sequences).search(0.2)
for i in result:
  print(i)


{('Banana',): 6, ('Apple',): 7, ('Donut',): 7, ('Cereal',): 6}
{('Apple', 'Cereal'): 2, ('Apple', 'Banana'): 4, ('Cereal', 'Donut'): 4, ('Banana', 'Cereal'): 3, ('Banana', 'Donut'): 2}
{('Apple', 'Cereal', 'Donut'): 2, ('Apple', 'Banana', 'Cereal'): 2}


## 3.2 : Quỳnh Hoa

In [11]:
file_path = './dataset/example_grocery_transaction_dataset.csv'
df2 = pd.read_csv(file_path,usecols=['TransactionID', 'Items'], sep='^([^,]+),')
df2

  df2 = pd.read_csv(file_path,usecols=['TransactionID', 'Items'], sep='^([^,]+),')


Unnamed: 0,TransactionID,Items
0,1,"Apple, Banana, Cereal"
1,2,"Apple, Banana, Milk"
2,3,"Apple, Banana, Cereal, Milk"
3,4,"Apple, Milk"
4,5,"Apple, Cereal"
5,6,"Banana, Cereal"
6,7,"Banana, Milk"
7,8,"Apple, Banana"
8,9,"Apple, Banana, Cereal, Milk"
9,10,"Apple, Banana, Cereal"


In [12]:
customer_dummy = df2['Items'].str.get_dummies(sep=', ')
customer_trans = pd.concat([df2, customer_dummy], axis=1)
customer_trans = customer_trans.drop(columns=['Items'])
customer_trans

Unnamed: 0,TransactionID,Apple,Banana,Cereal,Milk
0,1,1,1,1,0
1,2,1,1,0,1
2,3,1,1,1,1
3,4,1,0,0,1
4,5,1,0,1,0
5,6,0,1,1,0
6,7,0,1,0,1
7,8,1,1,0,0
8,9,1,1,1,1
9,10,1,1,1,0


In [13]:
frequent_itemsets = apriori(customer_dummy, min_support=0.3, use_colnames=True, )
frequent_itemsets.sort_values(by='support',ascending=False)



Unnamed: 0,support,itemsets
0,0.8,(Apple)
1,0.8,(Banana)
2,0.6,(Cereal)
4,0.6,"(Apple, Banana)"
3,0.5,(Milk)
5,0.5,"(Cereal, Apple)"
7,0.5,"(Cereal, Banana)"
6,0.4,"(Milk, Apple)"
8,0.4,"(Milk, Banana)"
9,0.4,"(Cereal, Apple, Banana)"


In [14]:
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.001)
print(rules)

       antecedents      consequents  antecedent support  consequent support  \
0         (Cereal)          (Apple)                 0.6                 0.8   
1          (Apple)         (Cereal)                 0.8                 0.6   
2         (Cereal)         (Banana)                 0.6                 0.8   
3         (Banana)         (Cereal)                 0.8                 0.6   
4  (Apple, Banana)         (Cereal)                 0.6                 0.6   
5         (Cereal)  (Apple, Banana)                 0.6                 0.6   

   support  confidence      lift  leverage  conviction  zhangs_metric  
0      0.5    0.833333  1.041667      0.02    1.200000           0.10  
1      0.5    0.625000  1.041667      0.02    1.066667           0.20  
2      0.5    0.833333  1.041667      0.02    1.200000           0.10  
3      0.5    0.625000  1.041667      0.02    1.066667           0.20  
4      0.4    0.666667  1.111111      0.04    1.200000           0.25  
5      0.4    

## 3.3 : Bá Triết

In [15]:
# !pip install pymining
import pandas as pd
import pymining
from pymining import seqmining

In [16]:
df = pd.read_csv('./dataset/example_sequenceid_dataset.csv',usecols=['SequenceID', 'Actions'], sep='^([^,]+),',engine='python')
df

Unnamed: 0,SequenceID,Actions
0,1,"A, B, C, D"
1,2,"A, C, D"
2,3,"B, D"
3,4,"A, D"
4,5,"A, B, C"
5,6,"C, D"
6,7,"A, B, D"
7,8,"A, B"
8,9,"B, C"
9,10,"A, C, D"


In [17]:
tmp = []
for i in df['Actions']:
    tmp.append(i)

data = [tuple(item.split(',')) for item in tmp]
data = list(map(lambda row: tuple(item.strip() for item in row), data))

data

[('A', 'B', 'C', 'D'),
 ('A', 'C', 'D'),
 ('B', 'D'),
 ('A', 'D'),
 ('A', 'B', 'C'),
 ('C', 'D'),
 ('A', 'B', 'D'),
 ('A', 'B'),
 ('B', 'C'),
 ('A', 'C', 'D')]

In [18]:
min_support = 2

result = seqmining.freq_seq_enum(data, min_support)

frequent_sequences = list(result)

for seq in frequent_sequences:
    print(f"Sequence: {seq[0]}, Support: {seq[1]}")

Sequence: ('B', 'C'), Support: 3
Sequence: ('A', 'B', 'C'), Support: 2
Sequence: ('A', 'C', 'D'), Support: 3
Sequence: ('A', 'D'), Support: 5
Sequence: ('D',), Support: 7
Sequence: ('C', 'D'), Support: 4
Sequence: ('A', 'B'), Support: 4
Sequence: ('A',), Support: 7
Sequence: ('A', 'C'), Support: 4
Sequence: ('C',), Support: 6
Sequence: ('B',), Support: 6
Sequence: ('B', 'D'), Support: 3
Sequence: ('A', 'B', 'D'), Support: 2


In [19]:
# !pip install prefixspan
from prefixspan import PrefixSpan

min_support = 2

ps = PrefixSpan(data)
frequent_sequences = ps.frequent(min_support)

for pattern, support in frequent_sequences:
    print(f"Pattern: {pattern}, Support: {support}")

Pattern: 7, Support: ['A']
Pattern: 4, Support: ['A', 'B']
Pattern: 2, Support: ['A', 'B', 'C']
Pattern: 2, Support: ['A', 'B', 'D']
Pattern: 4, Support: ['A', 'C']
Pattern: 3, Support: ['A', 'C', 'D']
Pattern: 5, Support: ['A', 'D']
Pattern: 6, Support: ['B']
Pattern: 3, Support: ['B', 'C']
Pattern: 3, Support: ['B', 'D']
Pattern: 6, Support: ['C']
Pattern: 4, Support: ['C', 'D']
Pattern: 7, Support: ['D']


## 3.4 : Vĩnh Lộc

In [21]:
# Điều chỉnh đường dẫn tới file trong Google Drive
file_path = './dataset/customer_transactions_dataset.csv'
df = pd.read_csv(file_path, usecols=['TransactionID', 'Sequence'], sep=',')
print(df)


   TransactionID Sequence
0              1    Apple
1              2    Apple
2              3   Banana
3              4    Apple
4              5    Apple
5              6   Cereal
6              7    Apple
7              8    Apple
8              9   Banana
9             10    Apple


In [23]:
# Download NLTK data
# nltk.download('punkt')

In [24]:
# Tải tập dữ liệu vào DataFrame
data = {
    'TransactionID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Sequence': [
        'Apple Banana Cereal Donut',
        'Apple Cereal Donut',
        'Banana Donut',
        'Apple Donut',
        'Apple Banana Cereal',
        'Cereal Donut',
        'Apple Banana Donut',
        'Apple Banana',
        'Banana Cereal',
        'Apple Cereal Donut'
    ]
}
df = pd.DataFrame(data)
# Token hóa các chuỗi
df['Tokens'] = df['Sequence'].apply(word_tokenize)

# Làm phẳng danh sách các chuỗi được mã hóa để phân tích bigram
tokens = [token for sublist in df['Tokens'] for token in sublist]

# Tạo công cụ tìm kiếm sắp xếp thứ tự bigram
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)

# Áp dụng bộ lọc tần số (tùy chọn, ở đây chỉ xem xét các bigram xảy ra nhiều lần)
finder.apply_freq_filter(2)

# Chấm điểm bigram bằng PMI
scored = finder.score_ngrams(bigram_measures.pmi)

In [25]:
# Đạt top 10 bigram có điểm PMI cao nhất
top_bigrams = sorted(scored, key=lambda x: -x[1])[:10]

# In các bigram hàng đầu
print("Top bigrams by PMI:")
for bigram, score in top_bigrams:
    print(f"{bigram}: {score}")

# Giải thích các bigram hàng đầu
print("Top bigrams with PMI scores:")
for bigram, score in top_bigrams:
    print(f"Phrase: {' '.join(bigram)}, PMI Score: {score}")

Top bigrams by PMI:
('Donut', 'Apple'): 1.407657968913246
('Apple', 'Banana'): 1.308122295362332
('Cereal', 'Donut'): 1.308122295362332
('Banana', 'Cereal'): 1.1154772174199366
('Apple', 'Cereal'): 0.30812229536233193
('Banana', 'Donut'): 0.30812229536233193
Top bigrams with PMI scores:
Phrase: Donut Apple, PMI Score: 1.407657968913246
Phrase: Apple Banana, PMI Score: 1.308122295362332
Phrase: Cereal Donut, PMI Score: 1.308122295362332
Phrase: Banana Cereal, PMI Score: 1.1154772174199366
Phrase: Apple Cereal, PMI Score: 0.30812229536233193
Phrase: Banana Donut, PMI Score: 0.30812229536233193
