In [1]:
import pandas as pd

ratings = pd.read_csv(
    "ratings.csv")

movies = pd.read_csv(
    "movies.csv")

In [2]:
# Chỉ giữ phim được đánh giá cao (rating ≥ 4)
ratings = ratings[ratings["rating"] >= 4]

In [3]:
# Gộp tên phim
data = ratings.merge(movies, on="movieId")

In [4]:
# Tạo danh sách
transactions = (
    data
    .groupby("userId")["title"]
    .apply(list)
    .tolist()
)

print(transactions[0])

['Toy Story (1995)', 'Grumpier Old Men (1995)', 'Heat (1995)', 'Seven (a.k.a. Se7en) (1995)', 'Usual Suspects, The (1995)', 'Bottle Rocket (1996)', 'Braveheart (1995)', 'Rob Roy (1995)', 'Canadian Bacon (1995)', 'Desperado (1995)', 'Billy Madison (1995)', 'Dumb & Dumber (Dumb and Dumber) (1994)', 'Ed Wood (1994)', 'Star Wars: Episode IV - A New Hope (1977)', 'Tommy Boy (1995)', 'Clear and Present Danger (1994)', 'Forrest Gump (1994)', 'Jungle Book, The (1994)', 'Mask, The (1994)', 'Dazed and Confused (1993)', 'Fugitive, The (1993)', 'Jurassic Park (1993)', "Schindler's List (1993)", 'So I Married an Axe Murderer (1993)', 'Three Musketeers, The (1993)', 'Tombstone (1993)', 'Dances with Wolves (1990)', 'Batman (1989)', 'Silence of the Lambs, The (1991)', 'Pinocchio (1940)', 'Fargo (1996)', 'James and the Giant Peach (1996)', 'Rock, The (1996)', "She's the One (1996)", 'Wizard of Oz, The (1939)', 'Citizen Kane (1941)', 'Adventures of Robin Hood, The (1938)', 'Ghost and Mrs. Muir, The (194

In [5]:
transactions_sets = [set(t) for t in transactions]

In [6]:
from collections import Counter

item_counts = Counter()

for t in transactions_sets:
    for item in t:
        item_counts[item] += 1

N = len(transactions_sets)

# chỉ giữ phim xuất hiện ≥ 5% users
frequent_items = {
    item for item, cnt in item_counts.items()
    if cnt / N >= 0.05
}

In [8]:
from itertools import permutations

candidate_rules = [
    (frozenset([a]), b)
    for a, b in permutations(frequent_items, 2)
]

In [9]:
from collections import defaultdict

correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)

for reviews in transactions_sets:        # mỗi user
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        
        if premise.issubset(reviews):     # user thỏa X
            if conclusion in reviews:     # user có Y
                correct_counts[candidate_rule] += 1
            else:                          # user không có Y
                incorrect_counts[candidate_rule] += 1


In [13]:
rule_confidence = {
    rule: correct_counts[rule] /
          float(correct_counts[rule] + incorrect_counts[rule])
    for rule in candidate_rules
    if correct_counts[rule] + incorrect_counts[rule] > 0
}

In [14]:
strong_rules = {
    rule: conf
    for rule, conf in rule_confidence.items()
    if conf >= 0.9
}


In [15]:
for i, ((premise, conclusion), conf) in enumerate(strong_rules.items(), 1):
    premise_str = ', '.join(premise)
    print(f"Luật {i}: {{{premise_str}}} → {{{conclusion}}}")
    print(f"  Confidence: {conf:.3f}")
    print("-" * 60)


Luật 1: {Get Shorty (1995)} → {Pulp Fiction (1994)}
  Confidence: 0.911
------------------------------------------------------------
Luật 2: {Natural Born Killers (1994)} → {Pulp Fiction (1994)}
  Confidence: 0.914
------------------------------------------------------------
Luật 3: {In the Line of Fire (1993)} → {Fugitive, The (1993)}
  Confidence: 0.946
------------------------------------------------------------
Luật 4: {Shrek 2 (2004)} → {Shrek (2001)}
  Confidence: 0.909
------------------------------------------------------------
Luật 5: {Godfather: Part II, The (1974)} → {Godfather, The (1972)}
  Confidence: 0.953
------------------------------------------------------------
Luật 6: {Training Day (2001)} → {Matrix, The (1999)}
  Confidence: 0.903
------------------------------------------------------------
Luật 7: {Predator (1987)} → {Star Wars: Episode V - The Empire Strikes Back (1980)}
  Confidence: 0.909
------------------------------------------------------------
Luật 8: {Ro