In [1]:
import os
import sys
from typing import List, Dict
from pathlib import Path

sys.path.append(str(Path(__name__).resolve().parents[1]))

import pandas as pd
import numpy as np


from src.loader.movielens import MovieLensLoader
from src.utils.metrics import RecSysMetrics
from collections import defaultdict
import warnings
warnings.filterwarnings("ignore")



In [2]:
movielens_loader = MovieLensLoader(
    num_users=100,
    num_test_items=5,
)

In [3]:
movielens_dataset = movielens_loader.load()

In [4]:
train = movielens_dataset.train
test = movielens_dataset.test
rank_test = movielens_dataset.test_user2item

In [5]:
user_movie_matrix = train.pivot(index="user_id", columns="movie_id", values="rating")
user_movie_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1225,1226,1227,1228,1229,1231,1233,1234,1235,1238
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,5.0,,,,,,,5.0,,,...,,,,,,,,,,
97,4.0,,,,,,5.0,,,,...,,,,,,,,,,
98,,,,,,,,,,,...,,,,,,,,,,
99,4.0,,3.0,5.0,,,4.0,,,,...,,,,,,,,,,


In [6]:
user_movie_matrix[user_movie_matrix < 4] = 0
user_movie_matrix[user_movie_matrix.isnull()] = 0
user_movie_matrix[user_movie_matrix >= 4] = 1

In [7]:
user_movie_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1225,1226,1227,1228,1229,1231,1233,1234,1235,1238
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from mlxtend.frequent_patterns import apriori

freq_movies = apriori(
    user_movie_matrix, min_support=0.1, use_colnames=True)
freq_movies




Unnamed: 0,support,itemsets
0,0.36,(1)
1,0.13,(4)
2,0.41,(7)
3,0.24,(8)
4,0.31,(9)
...,...,...
133954,0.10,"(64, 98, 357, 134, 135, 9, 174, 180, 56, 127, ..."
133955,0.10,"(64, 98, 100, 135, 168, 12, 174, 50, 182, 56, ..."
133956,0.10,"(64, 98, 100, 135, 168, 12, 174, 238, 50, 182,..."
133957,0.10,"(64, 98, 100, 199, 168, 135, 12, 174, 182, 56,..."


In [10]:
freq_movies.sort_values("support", ascending=False).head()

Unnamed: 0,support,itemsets
19,0.56,(50)
41,0.5,(100)
70,0.45,(174)
2,0.41,(7)
77,0.41,(181)


In [37]:
from mlxtend.frequent_patterns import association_rules


rules = association_rules(freq_movies, metric="lift", 
                          min_threshold=1)
rules.sort_values("lift", ascending=False).head()[["antecedents", "consequents", "lift"]]

Unnamed: 0,antecedents,consequents,lift
6493878,"(98, 100, 357, 134, 174)","(191, 180, 12, 127)",10.0
4802322,"(474, 98, 435, 174)","(180, 211, 132)",10.0
6614790,"(100, 135, 12, 238, 174)","(98, 168, 9, 50, 182)",10.0
5465488,"(496, 197, 134)","(480, 357, 9, 174, 479)",10.0
5424973,"(483, 357, 134, 56, 474)","(9, 180, 132)",10.0


In [38]:
pred_user2items = defaultdict(list)

user_evaluated_movies = train.groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()

In [39]:
movielens_train_high_rating = train[train.rating >= 4]

In [41]:
from collections import defaultdict, Counter

for user_id, data in movielens_train_high_rating.groupby("user_id"):
    
    input_data = data.sort_values("timestamp")["movie_id"].tolist()[-5:]
    print(f"user_id - {user_id}, latest rated movies {input_data}")
    # 조건부에 하나라도 포함되는 연관 규칙 검출
    matched_flags = rules.antecedents.apply(lambda x: len(set(input_data) & x)) >= 1
    # 귀결부의 영화를 리스트에 저장, 등록 빈도 수로 정렬하ㅐ서 사용자가 아직 평가하지 않은 경우 추천 목록에 추가
    consequent_movies = []
    print(rules.shape)
    for i, row in rules[matched_flags].sort_values("lift", ascending=False).iterrows():
        consequent_movies.extend(row["consequents"])
        counter = Counter(consequent_movies)
        
        for movie_id, movie_cnt in counter.most_common():
            if movie_id not in user_evaluated_movies[user_id]:
                pred_user2items[user_id].append(movie_id)
            
            if len(pred_user2items[user_id]) == 10:
                break

user_id - 1, latest rated movies [270, 209, 32, 242, 111]
(6849374, 10)


KeyboardInterrupt: 

In [11]:


class AssociationModel(object):
    def __init__(self, movielens_train: pd.DataFrame, **kwargs) -> None:
        self.min_support = kwargs.get("min_support", 0.1)
        self.min_threshold = kwargs.get("min_threshold", 0.1)
        
        self.user_evaluated_movies = None
        self.movielens_train_high_rating = None
        self.user_movie_matrix = None
        self._initialize(movielens_train)
        
    def _initialize(self, movielens_train:pd.DataFrame):
        self.user_evaluated_movies = movielens_train.groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()
        self.movielens_train_high_rating = movielens_train[movielens_train.rating >= 4]
        self.user_movie_matrix = movielens_train.pivot(index="user_id", columns = "movie_id", values="rating")
        self.user_movie_matrix[self.user_movie_matrix < 4] = 0
        self.user_movie_matrix[self.user_movie_matrix.isnull()] = 0
        self.user_movie_matrix[self.user_movie_matrix >= 4] = 1
        
    def _get_association_rules(self) -> pd.DataFrame:
        freq_movies = apriori(self.user_movie_matrix, min_support=self.min_support, use_colnames=True)
        rules = association_rules(freq_movies, metric="lift", min_threshold=self.min_threshold)
        return rules
        
    def predict(self, top_k:int = 10):
        pred_user2items = defaultdict(list)
        rules = self._get_association_rules()
        for user_id, data in self.movielens_train_high_rating.groupby("user_id"):
            print(user_id)
            input_data = data.sort_values("timestamp")["movie_id"].tolist()[-5:]
            
            matched_flags = rules.antecedents.apply(lambda x: len(set(input_data) & x)) >= 1
            
            consequent_movies = []
            for i, row in rules[matched_flags].sort_values("lift", ascending=False).iterrows():
                consequent_movies.extend(row["consequents"])
            
            counter = Counter(consequent_movies)
            
            for movie_id, movie_cnt in counter.most_common():
                if movie_id not in self.user_evaluated_movies[user_id]:
                    pred_user2items[user_id].append(movie_id)
                
                if len(pred_user2items[user_id]) == 10:
                    break
                
        return pred_user2items


In [12]:
association_model = AssociationModel(movielens_train=train, min_support=0.1, min_threshold=0.1)

In [None]:
pred_ranking = association_model.predict(top_k= 10)

In [None]:
# print("Test MAE rating", RecSysMetrics().mae(test["rating"], pred_ratings))
# print("Test MSE rating", RecSysMetrics().mse(test["rating"], pred_ratings))
# print("Test RMSE rating", RecSysMetrics().rmse(test["rating"], pred_ratings))


print(
    "Test Precision@k",
    RecSysMetrics().calc_precision_at_k(
        moivelens_dataset.test_user2item, pred_ranking, 10
    ),
)

print(
    "Test Recall@k",
    RecSysMetrics().calc_recall_at_k(
        moivelens_dataset.test_user2item, pred_ranking, 10
    ),
)