# chapter5

In [1]:
# Download the MovieLens 10M dataset into the ../data directory and extract it there.
# `wget -nc` skips the download when the archive already exists, and
# `unzip -n` never overwrites files that were already extracted.
# NOTE(review): --no-check-certificate disables TLS certificate verification — confirm it is still required.
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
!unzip -n ../data/ml-10m.zip -d ../data/

--2024-11-27 10:48:30--  https://files.grouplens.org/datasets/movielens/ml-10m.zip
Resolving files.grouplens.org (files.grouplens.org)... failed: Temporary failure in name resolution.
wget: unable to resolve host address ‘files.grouplens.org’
unzip:  cannot find or open ../data/ml-10m.zip, ../data/ml-10m.zip.zip or ../data/ml-10m.zip.ZIP.


In [2]:
import pandas as pd
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Column names assigned to the movies file in the second read below.
m_cols = ['movie_id', 'title', 'genre']
# Raw preview without `names=`: pandas takes the first line of the file as the header row.
# NOTE(review): presumably movies.dat has no header line of its own, so this preview
# shows the first movie as column labels — verify against the file.
display(pd.read_csv('../data/ml-10M100K/movies.dat', sep='::', encoding='latin-1', engine='python'))
# Actual load: same file and parser options, now with explicit column names.
movies = pd.read_csv('../data/ml-10M100K/movies.dat', names=m_cols, sep='::', encoding='latin-1', engine='python')
movies

FileNotFoundError: [Errno 2] No such file or directory: '../data/ml-10M100K/movies.dat'

In [None]:
# Preview only: break each pipe-delimited genre string into a list of genre names.
movies['genre'].apply(lambda genre_text: genre_text.split('|'))

In [None]:
# Replace the pipe-delimited genre string with a list of genre names.
# Only str values are split, so re-running this cell after the column already
# holds lists is a no-op instead of raising AttributeError on a list.
movies['genre'] = movies['genre'].apply(
    lambda genre: genre.split('|') if isinstance(genre, str) else genre
)
movies.head()

In [None]:
# Load the user-assigned tags; records are '::'-separated and get the column names below.
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
# Read from the same relative ../data directory as the movies and ratings loads,
# rather than a hard-coded absolute path that only exists in one environment.
user_tagged_movies = pd.read_csv('../data/ml-10M100K/tags.dat', names=t_cols, sep='::', engine='python')
user_tagged_movies

In [None]:
# Lower-case every tag so that spellings differing only in case collapse to one tag.
user_tagged_movies['tag'] = user_tagged_movies.tag.str.lower()
user_tagged_movies

In [None]:
# Summary counts: distinct tags, tag records, and distinct movies that carry a tag.
# dropna=False keeps a missing value counted as one distinct entry.
print(f'タグ種類={user_tagged_movies["tag"].nunique(dropna=False)}')
print(f'タグレコード数={user_tagged_movies.shape[0]}')
print(f'タグがついている映画数={user_tagged_movies["movie_id"].nunique(dropna=False)}')

In [None]:
# Collect every tag attached to a movie into one list per movie_id.
movie_tags = user_tagged_movies.groupby('movie_id').agg(tag=('tag', list))
movie_tags

In [None]:
# Attach each movie's tag list; the left join keeps movies that have no tags.
# Any 'tag' column left by a previous run is dropped first, so re-running the
# cell does not produce duplicated tag_x / tag_y columns.
movies = (
    movies
    .drop(columns=['tag'], errors='ignore')
    .merge(movie_tags, on='movie_id', how='left')
)
movies

In [None]:
# Load the ratings; records are '::'-separated and get the column names below.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    '../data/ml-10M100K/ratings.dat',
    sep='::',
    names=r_cols,
    engine='python',
)
ratings.head()

In [None]:
# Number of rating records loaded.
ratings.shape[0]

In [None]:
# Restrict the ratings to the 1000 smallest user ids.
valid_user_ids = sorted(ratings['user_id'].unique())[:1000]
ratings = ratings.loc[ratings['user_id'].isin(valid_user_ids)]
ratings

In [None]:
# Join every rating with the metadata of the movie it refers to.
movielens = pd.merge(ratings, movies, on='movie_id')
movielens.head()

In [None]:
import numpy as np
# Count the ratings of each user, then summarise those counts (min / max / mean / number of users).
# String aliases replace the builtins min/max and np.mean, which pandas >= 2.1 deprecates
# as agg arguments; `len` is kept as a callable so the last row label stays 'len'.
movielens.groupby('user_id').agg({'movie_id': len}).agg({'movie_id': ['min', 'max', 'mean', len]})

In [None]:
# Number of rating rows in the merged frame.
print(f'評価値数={movielens.shape[0]}')

In [None]:
# Number of rating rows for each rating value.
movielens.groupby('rating').agg(movie_id=('movie_id', len))

In [None]:
# Preview only: rank each user's ratings by timestamp in descending order (rank 1 = latest);
# method='first' gives tied timestamps distinct ranks in order of appearance.
(
    movielens
    .groupby('user_id')['timestamp']
    .rank(method='first', ascending=False)
)

In [None]:
# Store the per-user recency rank of every rating (rank 1 = that user's latest rating).
movielens['timestamp_rank'] = (
    movielens
    .groupby('user_id')['timestamp']
    .rank(method='first', ascending=False)
)
movielens

In [None]:
# Take a look at the data ordered by user.
movielens.sort_values(by='user_id')

In [None]:
# Hold out each user's 5 most recent ratings (rank <= 5) as the test set;
# every older rating goes to the training set.
movielens_train = movielens.loc[movielens['timestamp_rank'].gt(5)]
movielens_test = movielens.loc[movielens['timestamp_rank'].le(5)]

In [None]:
from typing import List, Dict
from sklearn.metrics import mean_squared_error

class MetricCalculator:
    """Evaluation metrics for rating prediction (RMSE) and top-k item lists (Recall@k, Precision@k)."""

    def calc_rmse(self, true_rating: List[float], pred_rating: List[float]) -> float:
        """Return the root mean squared error between true and predicted ratings.

        Args:
            true_rating: Observed rating values.
            pred_rating: Predicted rating values, compared with ``true_rating``
                by ``mean_squared_error``.
        """
        return np.sqrt(mean_squared_error(true_rating, pred_rating))

    def calc_recall_at_k(self, 
                         true_user2items: Dict[int, List[int]],
                         pred_user2items: Dict[int, List[int]],
                         k: int)-> float:
        """Return Recall@k averaged over every user in ``true_user2items``.

        Args:
            true_user2items: Maps a user id to the items that are relevant for that user.
            pred_user2items: Maps a user id to the ranked list of recommended items.
                A user missing from this mapping is scored as having no recommendations.
            k: Number of top recommendations taken into account.

        Returns:
            The mean per-user recall, or 0.0 when ``true_user2items`` is empty.
        """
        return self._mean_over_users(self._recall_at_k, true_user2items, pred_user2items, k)

    def _recall_at_k(self, true_items: List[int], pred_items: List[int], k: int) -> float:
        """Return the share of ``true_items`` found in the first ``k`` entries of ``pred_items``.

        Returns 0.0 when there are no true items or when ``k`` is not positive.
        """
        # k <= 0 (not just k == 0): a negative k would slice from the end of the list.
        if len(true_items) == 0 or k <= 0:
            return 0.0
        hits = set(true_items) & set(pred_items[:k])
        return len(hits) / len(true_items)

    def calc_precision_at_k(self,
                            true_user2items: Dict[int, List[int]],
                            pred_user2items: Dict[int, List[int]],
                            k: int
                            ) -> float:
        """Return Precision@k averaged over every user in ``true_user2items``.

        Args:
            true_user2items: Maps a user id to the items that are relevant for that user.
            pred_user2items: Maps a user id to the ranked list of recommended items.
                A user missing from this mapping is scored as having no recommendations.
            k: Number of top recommendations taken into account.

        Returns:
            The mean per-user precision, or 0.0 when ``true_user2items`` is empty.
        """
        return self._mean_over_users(self._precision_at_k, true_user2items, pred_user2items, k)

    def _precision_at_k(self,
                        true_items: List[int],
                        pred_items: List[int],
                        k: int
                        ) -> float:
        """Return the share of the first ``k`` entries of ``pred_items`` that are in ``true_items``.

        The denominator is always ``k``, even if fewer than ``k`` items were predicted.
        Returns 0.0 when ``k`` is not positive.
        """
        # k <= 0 (not just k == 0): a negative k would yield a negative precision.
        if k <= 0:
            return 0.0
        hits = set(true_items) & set(pred_items[:k])
        return len(hits) / k

    def _mean_over_users(self,
                         per_user_metric,
                         true_user2items: Dict[int, List[int]],
                         pred_user2items: Dict[int, List[int]],
                         k: int
                         ) -> float:
        """Average ``per_user_metric(true_items, pred_items, k)`` over the users in ``true_user2items``."""
        scores = [
            # .get(..., []) scores a user without predictions as zero hits instead of raising KeyError.
            per_user_metric(true_items, pred_user2items.get(user_id, []), k)
            for user_id, true_items in true_user2items.items()
        ]
        # np.mean([]) would return nan and emit a RuntimeWarning.
        if not scores:
            return 0.0
        return np.mean(scores)
        