<a href="https://colab.research.google.com/github/SeongBeomLEE/RecsysTutorial/blob/main/Pixie/Pixie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- board를 user로 표현
- pin을 movie로 표현
- user features의 경우 user의 시청한 전체 영화의 genre TF로 표현
- user features와 유사하다는 board와 pin은 user features의 cosine similarity로 표현
- 최근 유저 영화 시청 10개를 Q로 표현
- Q를 시간을 기준으로 1 / log2(rank + 1)를 계산해 wq를 표현
- 최근 유저 영화 시청 10개를 이용해 제일 최신에 볼 영화를 예측하는 방향으로 모델의 성능을 평가함

In [1]:
!pip install python-box

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
from box import Box
from tqdm import tqdm
from collections import defaultdict

import random
import numpy as np
import pandas as pd

import warnings

warnings.filterwarnings(action='ignore')

# 1. 학습 설정

In [3]:
config = {
    'data_path' : "/content/drive/MyDrive/RecsysTutorial/Data/MovieLens" , # 데이터 경로
}

config = Box(config)

# 2. 데이터 전처리

In [4]:
ratings = pd.read_csv(os.path.join(config.data_path, 'ratings.csv'))
ratings = ratings.sort_values(['userId', 'timestamp']).reset_index(drop = True)

movies = pd.read_csv(os.path.join(config.data_path, 'movies.csv'))

In [5]:
userId_to_idx = {}
idx_to_userId = {}

userId_list = ratings['userId'].unique()

for idx, userId in enumerate(userId_list):
    userId_to_idx[userId] = idx
    idx_to_userId[idx] = userId

movieId_to_idx = {}
idx_to_movieId = {}

movieId_list = movies['movieId'].unique()

for idx, movieId in enumerate(movieId_list):
    movieId_to_idx[movieId] = idx
    idx_to_movieId[idx] = movieId

In [6]:
ratings['userId_to_idx'] = ratings['userId'].apply(lambda x : userId_to_idx[x])
ratings['movieId_to_idx'] = ratings['movieId'].apply(lambda x : movieId_to_idx[x])

movies['movieId_to_idx'] = movies['movieId'].apply(lambda x : movieId_to_idx[x])

In [7]:
group_df = ratings.groupby('userId_to_idx')
train_set = {}
test_set = {}

for idx, g_df in group_df:
    total_list = g_df['movieId_to_idx'].tolist()

    train_set[idx] = total_list[:-1]
    test_set[idx] = [total_list[-1]]

In [8]:
movies['genres_list'] = movies['genres'].apply(lambda  x : x.split('|'))

In [9]:
total_genres = []

genres_list = movies['genres_list'].tolist()
for genres in genres_list:
    total_genres += genres

total_genres = list(set(total_genres))

genre_to_idx = {}

for idx, genre in enumerate(total_genres):
    genre_to_idx[genre] = idx

movies['pre_genres_list'] = movies['genres_list'].apply(lambda  x : [genre_to_idx[i] for i in x])

In [None]:
'''
board -> user
pin -> movie
'''

pins = [i for i in range(len(movieId_to_idx))]
boards = [i for i in range(len(userId_to_idx))]

In [None]:
pins_feature = np.zeros((len(movieId_to_idx), len(total_genres)))
boards_feature = np.zeros((len(userId_to_idx), len(total_genres)))

movies_index_df = movies.set_index('movieId_to_idx')

for pin in pins:
    pins_feature[pin, movies_index_df.loc[pin, 'pre_genres_list']] = 1

for board in boards:
    interaction = train_set[board]
    boards_feature[board, :] = pins_feature[interaction, :].sum(axis = 0)

# 3. Pixie

In [75]:
G = np.zeros((len(movieId_to_idx), len(userId_to_idx)))

for board in boards:
    interaction = train_set[board]
    G[interaction, board] = 1

C = max(G.sum(axis = 1))
N = 1000
a = 100

In [76]:
from sklearn.metrics.pairwise import cosine_similarity

def PersonalizedRandomSampling(E, U, Board = True):
    if Board:
        sim = cosine_similarity(U, boards_feature[E, :])
    else:
        sim = cosine_similarity(U, pins_feature[E, :])

    sim = sim.argsort()[::-1][:50][0]
    return np.random.choice(E[sim])

def RandomSampling(E):
    return np.random.choice(E)

def PixieRandomWalk(q : int, U = None, G : np.array = G, a : int = a, N : int = N):
    totSteps = 0
    V = np.zeros((G.shape[0]))

    while True:
        currPin = q
        currSteps = random.randrange(a + 1)
        for _ in range(currSteps):
            currBoard = np.where(G[currPin, :] == 1)[0]
            if isinstance(U, np.ndarray):
                currBoard = PersonalizedRandomSampling(currBoard, U, Board = True)
            else:
                currBoard = RandomSampling(currBoard)

            currPin = np.where(G[:, currBoard] == 1)[0]

            if isinstance(U, np.ndarray):
                currPin = PersonalizedRandomSampling(currPin, U, Board = False)
            else:
                currPin = RandomSampling(currPin)

            V[currPin] += 1

        totSteps += currSteps
        if totSteps >= N : break

    return V

def CarculatorS(q : int, C : int = C, G : np.array = G):
    s = G[q, :].sum() * (C - np.log2(G[q, :].sum()))
    return s

def PixieRandomWalkMultiple(Q : list, U = None, G : np.array = G, a : int = a, N : int = N):
    sum_s = sum([CarculatorS(q) for q in Q])
    V = np.zeros((G.shape[0]))
    Vq_list = []

    for idx, q in enumerate(Q):
        sq = CarculatorS(q)
        wq = 1 / np.log2(idx + 2)
        Nq = (wq * N * sq) / sum_s

        if isinstance(U, np.ndarray): Vq = PixieRandomWalk(q = q, U = U, G = G, a = a, N = Nq)
        else: Vq = PixieRandomWalk(q = q, G = G, a = a, N = Nq)
        V += np.sqrt(Vq)
    
    V = V**2

    return V

# 4. 성능 평가

In [21]:
'''
N = 1000
a = 100
'''

hit = 0
for board in tqdm(boards):
    interaction = train_set[board]
    Q = interaction[::-1][:10]
    V = PixieRandomWalkMultiple(Q = Q, U = None, G = G, a = a, N = N)

    rec = V.argsort()[::-1][:10].tolist()

    hit += len(set(rec) & set(test_set[board])) / len(test_set[board])

hit /= len(boards)

print(f"개인화 X : {hit}")

100%|██████████| 671/671 [03:09<00:00,  3.54it/s]

개인화 X : 0.013412816691505217





In [None]:
'''
N = 5000
a = 50
'''

hit = 0
for board in tqdm(boards):
    interaction = train_set[board]
    Q = interaction[::-1][:10]
    V = PixieRandomWalkMultiple(Q = Q, U = None, G = G, a = a, N = N)

    rec = V.argsort()[::-1][:10].tolist()

    hit += len(set(rec) & set(test_set[board])) / len(test_set[board])

hit /= len(boards)

print(f"개인화 X : {hit}")

100%|██████████| 671/671 [05:42<00:00,  1.96it/s]

개인화 X : 0.01788375558867362





In [49]:
'''
N = 1000
a = 100
'''

hit = 0
for board in tqdm(boards):
    interaction = train_set[board]
    Q = interaction[::-1][:10]
    U = boards_feature[[board], :]
    V = PixieRandomWalkMultiple(Q = Q, U = U, G = G, a = a, N = N)

    rec = V.argsort()[::-1][:10].tolist()

    hit += len(set(rec) & set(test_set[board])) / len(test_set[board])

hit /= len(boards)

print(f"개인화 O : {hit}")

100%|██████████| 671/671 [14:46<00:00,  1.32s/it]

개인화 O : 0.020864381520119227



