# BertRec
data/ml-1m/ml-1m.txt 기준으로 작성

In [1]:
import pandas as pd
import random

In [2]:
df = pd.read_csv("../data/ml-1m/ml-1m.txt", sep=" ")
df.columns = ["userId", "movieId"]
df

Unnamed: 0,userId,movieId
0,1,2
1,1,3
2,1,4
3,1,5
4,1,6
...,...,...
999605,6040,11
999606,6040,225
999607,6040,20
999608,6040,241


# 학습을 위해 유저별 영화별 ID 의 인덱스를 다시 수정

In [3]:
# i+2를 해주는 이유는 PAD, MASK 토큰의 자리
movieIdMapping = {k:i+2 for i, k in enumerate(df.movieId.unique())}
inverseMovieIdMapping = {v:k for k, v in movieIdMapping.items()}

df.movieId = df.movieId.map(movieIdMapping)

## 각 유저 별로 시청한 영화 목록을 그룹화

In [4]:
# 시각화용
grouped_df = df.groupby(by="userId").agg(list)
grouped_df

Unnamed: 0_level_0,movieId
userId,Unnamed: 1_level_1
1,"[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
2,"[80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 9..."
3,"[139, 140, 3, 141, 142, 143, 144, 145, 146, 14..."
4,"[251, 13, 252, 70, 253, 81, 238, 150, 254, 27,..."
5,"[262, 240, 263, 264, 150, 265, 39, 266, 267, 1..."
...,...
6036,"[168, 670, 146, 441, 230, 843, 184, 425, 280, ..."
6037,"[2426, 168, 479, 30, 45, 10, 41, 2270, 40, 153..."
6038,"[1564, 324, 645, 949, 251, 1098, 455, 1102, 16..."
6039,"[1388, 226, 1419, 1809, 143, 1633, 710, 151, 8..."


# 학습 데이터의 입력 예시

In [5]:
grouped_df = df.groupby(by="userId")
test_user_1 = grouped_df.get_group(5)
test_user_1

Unnamed: 0,userId,movieId
310,5,262
311,5,240
312,5,263
313,5,264
314,5,150
...,...,...
517,5,413
518,5,414
519,5,101
520,5,415


# 유저의 시퀀스를 랜덤하게 추출

In [6]:
PAD = 0
MASK = 1
# 시퀀스 길이
HISTORY = 120
VAL_HISTORY = 5

test_user_1.shape
end_index = random.randint(10, test_user_1.shape[0] - VAL_HISTORY)
start_index = max(0, end_index - HISTORY)

print(f"start_index = {start_index}")
print(f"end_index = {end_index}")

context = test_user_1[start_index:end_index]
context

start_index = 0
end_index = 41


Unnamed: 0,userId,movieId
310,5,262
311,5,240
312,5,263
313,5,264
314,5,150
315,5,265
316,5,39
317,5,266
318,5,267
319,5,147


# Bert를 학습하는 방법 중 하나인 masked language modeling 위해 마스킹 작업 및 패딩

In [7]:
trg = context.movieId.tolist()
masked_list = [movieId if random.random() < 0.8 else MASK for movieId in trg]
print(f"Before padding length : {len(masked_list)}")

if len(masked_list) < HISTORY:
    masked_integers = masked_list + [PAD] * (HISTORY - len(masked_list))
    
print(f"After padding length : {len(masked_integers)}")
print(f"input : {masked_integers}")

label = test_user_1.movieId[end_index:end_index + VAL_HISTORY].tolist()
print(f"label : {label}")

Before padding length : 41
After padding length : 120
input : [262, 240, 263, 264, 150, 1, 39, 266, 1, 147, 268, 1, 1, 1, 1, 271, 227, 94, 154, 1, 273, 1, 275, 276, 242, 277, 278, 189, 279, 280, 281, 282, 1, 284, 285, 286, 287, 152, 288, 289, 290, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
label : [291, 292, 40, 293, 294]
