# [ EXPLORATION ] 17. 다음에 볼 영화 예측하기

✅고객이 바로 지금 원하는 것이 무엇인지를 예측하여 추천하는 Session-bsed Recommendation 개념을 익히고 실제 모델을 구축해 본다.

---
### - 📖목차
* ✔️17-6. 프로젝트 - Movielens 영화 SBR
* ✔️회고 및 결론
* ✔️Reference(참고자료)

<br>



## ✔️17-6. 프로젝트 - Movielens 영화 SBR
---
### Step 1. 데이터의 전처리

In [1]:
# 모듈 로드
import tensorflow as tf
import os
from pathlib import Path
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 데이터 불러오기
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
!ls '/content/gdrive/MyDrive/aiffel/ex_17/yoochoose/data'

dataset-README.txt  ratings.dat  yoochoose-buys.dat
movies.dat	    README	 yoochoose-clicks.dat
precessed	    users.dat	 yoochoose-test.dat


In [7]:
data_path = '/content/gdrive/MyDrive/aiffel/ex_17/yoochoose/data'
train_path = '/content/gdrive/MyDrive/aiffel/ex_17/yoochoose/data/ratings.dat'

def load_data(data_path: Path, nrows=None):
    data = pd.read_csv(data_path, sep='::', header=None, usecols=[0, 1, 2, 3], dtype={0: np.int32, 1: np.int32, 2: np.int32}, nrows=nrows)
    data.columns = ['UserId', 'ItemId', 'Rating', 'Time']
    return data

data = load_data(train_path, None)
data.sort_values(['UserId', 'Time'], inplace=True)  # data를 id와 시간 순서로 정렬해줍니다.
data

Unnamed: 0,UserId,ItemId,Rating,Time
31,1,3186,4,978300019
22,1,1270,5,978300055
27,1,1721,4,978300055
37,1,1022,5,978300055
24,1,2340,3,978300103
...,...,...,...,...
1000019,6040,2917,4,997454429
999988,6040,1921,4,997454464
1000172,6040,1784,3,997454464
1000167,6040,161,3,997454486


In [8]:
#Time 데이터(Second)를 Datetime으로 바꾸기
import datetime as dt
from datetime import date
from datetime import timedelta

start = '1970-01-01 00:00:00.000000'
start = dt.datetime.strptime(start, '%Y-%m-%d %H:%M:%S.%f') #start:1970-01-01 00:00:00

date = []
for delta in data['Time'] :
    date.append(start + timedelta(seconds = delta))
    
data['Time'] = date
data.head()

Unnamed: 0,UserId,ItemId,Rating,Time
31,1,3186,4,2000-12-31 22:00:19
22,1,1270,5,2000-12-31 22:00:55
27,1,1721,4,2000-12-31 22:00:55
37,1,1022,5,2000-12-31 22:00:55
24,1,2340,3,2000-12-31 22:01:43


In [9]:
#데이터의 전체적인 통계확인 
data.iloc[:, :-1].describe().T.sort_values(by='std' , ascending = False)\
                     .style.background_gradient(cmap='GnBu')\
                     .bar(subset=["max"], color='#F8766D')\
                     .bar(subset=["mean",], color='#00BFC4')

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
UserId,1000209.0,3024.512348,1728.412695,1.0,1506.0,3070.0,4476.0,6040.0
ItemId,1000209.0,1865.539898,1096.040689,1.0,1030.0,1835.0,2770.0,3952.0
Rating,1000209.0,3.581564,1.117102,1.0,3.0,4.0,4.0,5.0


In [10]:
data['UserId'].nunique(), data['ItemId'].nunique() #유저수와 아이템수 확인 

(6040, 3706)

In [11]:
user_length = data.groupby('UserId').size() #유저 Id의 session length 확인
user_length #동일한 userId를 공유하는 데이터 row 개수 

UserId
1        53
2       129
3        51
4        21
5       198
       ... 
6036    888
6037    202
6038     20
6039    123
6040    341
Length: 6040, dtype: int64

In [12]:
print(user_length.describe().T)
print("=======================================================")
print("user_length median: {}, 99.9%: {}".format(user_length.median(), user_length.quantile(0.999)))   

count    6040.000000
mean      165.597517
std       192.747029
min        20.000000
25%        44.000000
50%        96.000000
75%       208.000000
max      2314.000000
dtype: float64
user_length median: 96.0, 99.9%: 1343.181000000005


In [13]:
#groupby를 이용하여 UserId와 Time을 기준으로 새로운 데이터 프레임을 생성
user_time = data.groupby(['UserId', 'Time'])['ItemId'].count().reset_index()
user_time.reset_index(inplace = True)
user_time.head()

Unnamed: 0,index,UserId,Time,ItemId
0,0,1,2000-12-31 22:00:19,1
1,1,1,2000-12-31 22:00:55,3
2,2,1,2000-12-31 22:01:43,1
3,3,1,2000-12-31 22:02:52,1
4,4,1,2000-12-31 22:04:35,1


In [14]:
#UserId와 Time을 기준으로 merge
new_data = pd.merge(data, user_time, on = ['UserId', 'Time'])
new_data

Unnamed: 0,UserId,ItemId_x,Rating,Time,index,ItemId_y
0,1,3186,4,2000-12-31 22:00:19,0,1
1,1,1270,5,2000-12-31 22:00:55,1,3
2,1,1721,4,2000-12-31 22:00:55,1,3
3,1,1022,5,2000-12-31 22:00:55,1,3
4,1,2340,3,2000-12-31 22:01:43,2,1
...,...,...,...,...,...,...
1000204,6040,2917,4,2001-08-10 14:40:29,471159,1
1000205,6040,1921,4,2001-08-10 14:41:04,471160,2
1000206,6040,1784,3,2001-08-10 14:41:04,471160,2
1000207,6040,161,3,2001-08-10 14:41:26,471161,1


In [15]:
#불필요한 칼럼 삭제
new_data.drop(columns = 'ItemId_y', inplace = True)

#칼럼 명 수정
new_data.rename(columns = {'ItemId_x' : 'ItemId'}, inplace = True)
new_data.rename(columns = {'index' : 'SessionId'}, inplace = True)

new_data

Unnamed: 0,UserId,ItemId,Rating,Time,SessionId
0,1,3186,4,2000-12-31 22:00:19,0
1,1,1270,5,2000-12-31 22:00:55,1
2,1,1721,4,2000-12-31 22:00:55,1
3,1,1022,5,2000-12-31 22:00:55,1
4,1,2340,3,2000-12-31 22:01:43,2
...,...,...,...,...,...
1000204,6040,2917,4,2001-08-10 14:40:29,471159
1000205,6040,1921,4,2001-08-10 14:41:04,471160
1000206,6040,1784,3,2001-08-10 14:41:04,471160
1000207,6040,161,3,2001-08-10 14:41:26,471161


In [16]:
#session length 확인
session_length = new_data.groupby('SessionId').size()
print(len(session_length))

471163


In [17]:
print(session_length.describe().T)
print("=======================================================")
print("session_length median: {}, 99.9%: {}".format(session_length.median(), session_length.quantile(0.999)))   

count    471163.000000
mean          2.122851
std           1.546899
min           1.000000
25%           1.000000
50%           2.000000
75%           3.000000
max          30.000000
dtype: float64
session_length median: 2.0, 99.9%: 10.0


In [18]:
#30개 영화 평가한 세션 확인 
long_session = session_length[session_length==30].index[0]
display(new_data[new_data['SessionId']==long_session])
new_data[new_data['SessionId']==long_session].shape

Unnamed: 0,UserId,ItemId,Rating,Time,SessionId
112347,731,3044,4,2000-11-29 20:06:42,55117
112348,731,1455,3,2000-11-29 20:06:42,55117
112349,731,1639,5,2000-11-29 20:06:42,55117
112350,731,3244,4,2000-11-29 20:06:42,55117
112351,731,1656,2,2000-11-29 20:06:42,55117
112352,731,3426,4,2000-11-29 20:06:42,55117
112353,731,1829,2,2000-11-29 20:06:42,55117
112354,731,2675,4,2000-11-29 20:06:42,55117
112355,731,802,3,2000-11-29 20:06:42,55117
112356,731,803,5,2000-11-29 20:06:42,55117


(30, 5)

In [19]:
#이상치 제거
new_data = new_data.loc[new_data['SessionId'] != long_session]
new_data

Unnamed: 0,UserId,ItemId,Rating,Time,SessionId
0,1,3186,4,2000-12-31 22:00:19,0
1,1,1270,5,2000-12-31 22:00:55,1
2,1,1721,4,2000-12-31 22:00:55,1
3,1,1022,5,2000-12-31 22:00:55,1
4,1,2340,3,2000-12-31 22:01:43,2
...,...,...,...,...,...
1000204,6040,2917,4,2001-08-10 14:40:29,471159
1000205,6040,1921,4,2001-08-10 14:41:04,471160
1000206,6040,1784,3,2001-08-10 14:41:04,471160
1000207,6040,161,3,2001-08-10 14:41:26,471161


In [20]:
new_data['Time'].min(), new_data['Time'].max()

(Timestamp('2000-04-25 23:05:32'), Timestamp('2003-02-28 17:49:50'))

In [21]:
time2000 = new_data[new_data['Time'] < dt.datetime(2001,1,1)]#2000년 데이터
time2001 = new_data[(new_data['Time'] > dt.datetime(2000,12,31)) & (new_data['Time'] < dt.datetime(2002,1,1))]#2001년 데이터 
time2002 = new_data[(new_data['Time'] >= dt.datetime(2002,1,1)) & (new_data['Time'] < dt.datetime(2003,1,1))]#2002년 데이터 
time2003 = new_data[new_data['Time'] > dt.datetime(2002,12,31)] #2003년 데이터 
print("2000년 데이터 개수: {}".format(time2000.shape[0]))
print("2001년 데이터 개수: {}".format(time2001.shape[0]))
print("2002년 데이터 개수: {}".format(time2002.shape[0]))
print("2003년 데이터 개수: {}".format(time2003.shape[0]))

2000년 데이터 개수: 904727
2001년 데이터 개수: 70230
2002년 데이터 개수: 24046
2003년 데이터 개수: 3369


In [22]:
#평점 3점 이상인 영화만 남기고 제거 
new_data = new_data[new_data['Rating'] >= 3]

In [23]:
def split_by_date(data: pd.DataFrame, n_days: int):
    final_time = data['Time'].max()
    session_last_time = data.groupby('SessionId')['Time'].max()
    session_in_train = session_last_time[session_last_time < final_time - dt.timedelta(n_days)].index
    session_in_test = session_last_time[session_last_time >= final_time - dt.timedelta(n_days)].index

    before_date = data[data['SessionId'].isin(session_in_train)]
    after_date = data[data['SessionId'].isin(session_in_test)]
    after_date = after_date[after_date['ItemId'].isin(before_date['ItemId'])]
    return before_date, after_date

In [24]:
#test dataset 분리
train, test = split_by_date(new_data, n_days = 100)
#validation dataset 분리
train, val = split_by_date(train, n_days = 365)

In [25]:
#new_data에 대한 정보.
def stats_info(data: pd.DataFrame, status: str):
    print(f'* {status} Set Stats Info\n'
          f'\t Events: {len(data)}\n'
          f'\t Sessions: {data["SessionId"].nunique()}\n'
          f'\t Items: {data["ItemId"].nunique()}\n'
          f'\t First Time : {data["Time"].min()}\n'
          f'\t Last Time : {data["Time"].max()}\n')

In [26]:
stats_info(train, 'train')
stats_info(val, 'valid')
stats_info(test, 'test')

* train Set Stats Info
	 Events: 810327
	 Sessions: 404871
	 Items: 3612
	 First Time : 2000-04-25 23:05:32
	 Last Time : 2001-11-20 05:13:09

* valid Set Stats Info
	 Events: 21991
	 Sessions: 15450
	 Items: 2820
	 First Time : 2001-11-20 19:04:49
	 Last Time : 2002-11-20 16:38:40

* test Set Stats Info
	 Events: 4118
	 Sessions: 3071
	 Items: 1625
	 First Time : 2002-11-20 20:30:02
	 Last Time : 2003-02-28 17:49:50



In [27]:
#train data를 기준으로 인덱싱.
id2idx = {item_id : index for index, item_id in enumerate(train['ItemId'].unique())}

def indexing(df, id2idx):
    df['item_idx'] = df['ItemId'].map(lambda x: id2idx.get(x, -1))  #id2idx에 없는 아이템은 모르는 값(-1) 처리.
    return df

train = indexing(train, id2idx)
val = indexing(val, id2idx)
test = indexing(test, id2idx)

In [30]:
#전처리 완료 된 데이터 저장.
#save_path = '/content/gdrive/MyDrive/aiffel/ex_17/yoochoose/data/precessed'

train.to_pickle('/content/gdrive/MyDrive/aiffel/ex_17/yoochoose/data/precessed/train.pkl')
val.to_pickle('/content/gdrive/MyDrive/aiffel/ex_17/yoochoose/data/precessed/valid.pkl')
test.to_pickle('/content/gdrive/MyDrive/aiffel/ex_17/yoochoose/data/precessed/test.pkl')

<br>

### Step 2. 미니 배치의 구성

In [31]:
#데이터가 주어지면 세션이 시작되는 인덱스를 담는 값과 세션을 새로 인덱싱한 값을 갖는 클래스
class SessionDataset:
    """Credit to yhs-968/pyGRU4REC."""

    def __init__(self, data):
        self.df = data
        self.click_offsets = self.get_click_offsets()
        self.session_idx = np.arange(self.df['SessionId'].nunique())  # indexing to SessionId

    def get_click_offsets(self):
        """
        Return the indexes of the first click of each session IDs,
        """
        offsets = np.zeros(self.df['SessionId'].nunique() + 1, dtype=np.int32)
        offsets[1:] = self.df.groupby('SessionId').size().cumsum()
        return offsets

In [32]:
#train데이터로 SessionDataset 객체를 만들기.
train_dataset = SessionDataset(train)
train_dataset.df.head(2)

Unnamed: 0,UserId,ItemId,Rating,Time,SessionId,item_idx
0,1,3186,4,2000-12-31 22:00:19,0,0
1,1,1270,5,2000-12-31 22:00:55,1,1


In [33]:
train_dataset.click_offsets #click_offsets : 각 세션이 시작된 인덱스 담고 있음.
train_dataset.session_idx #각 세션을 인덱싱한 np.array

array([     0,      1,      2, ..., 404868, 404869, 404870])

In [34]:
class SessionDataLoader:
    """Credit to yhs-968/pyGRU4REC."""

    def __init__(self, dataset: SessionDataset, batch_size=50):
        self.dataset = dataset
        self.batch_size = batch_size

    def __iter__(self):
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,):  Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """

        start, end, mask, last_session, finished = self.initialize()  # initialize 메소드에서 확인
        """
        start : Index Where Session Start
        end : Index Where Session End
        mask : indicator for the sessions to be terminated
        """

        while not finished:
            min_len = (end - start).min() - 1  # Shortest Length Among Sessions
            for i in range(min_len):
                # Build inputs & targets
                inp = self.dataset.df['item_idx'].values[start + i]
                target = self.dataset.df['item_idx'].values[start + i + 1]
                yield inp, target, mask

            start, end, mask, last_session, finished = self.update_status(start, end, min_len, last_session, finished)

    def initialize(self):
        first_iters = np.arange(self.batch_size)    # 첫 배치에 사용할 세션 Index를 가져옴
        last_session = self.batch_size - 1    # 마지막으로 다루고 있는 세션 Index를 저장함
        start = self.dataset.click_offsets[self.dataset.session_idx[first_iters]]       # data 상에서 session이 시작된 위치를 가져오기
        end = self.dataset.click_offsets[self.dataset.session_idx[first_iters] + 1]  # session이 끝난 위치 바로 다음 위치를 가져오기
        mask = np.array([])   # session의 모든 아이템을 다 돌은 경우 mask에 추가해줄 것임
        finished = False         # data를 전부 돌았는지 기록하기 위한 변수
        return start, end, mask, last_session, finished

    def update_status(self, start: np.ndarray, end: np.ndarray, min_len: int, last_session: int, finished: bool):  
        # 다음 배치 데이터를 생성하기 위해 상태를 update함
        
        start += min_len   # __iter__에서 min_len 만큼 for문을 돌았으므로 start를 min_len 만큼 더해줌
        mask = np.arange(self.batch_size)[(end - start) == 1]  
        # end는 다음 세션이 시작되는 위치인데 start와 한 칸 차이난다는 것은 session이 끝났다는 뜻 mask에 기록

        for i, idx in enumerate(mask, start=1):  # mask에 추가된 세션 개수만큼 새로운 세션을 돌것
            new_session = last_session + i  
            if new_session > self.dataset.session_idx[-1]:  # 만약 새로운 세션이 마지막 세션 index보다 크다면 모든 학습데이터를 돈 것
                finished = True
                break
            # update the next starting/ending point
            start[idx] = self.dataset.click_offsets[self.dataset.session_idx[new_session]]     # 종료된 세션 대신 새로운 세션의 시작점을 기록
            end[idx] = self.dataset.click_offsets[self.dataset.session_idx[new_session] + 1]

        last_session += len(mask)  # 마지막 세션의 위치를 기록
        return start, end, mask, last_session, finished

In [35]:
train_data_loader = SessionDataLoader(train_dataset, batch_size=4)
iter_ex = iter(train_data_loader)
inputs, labels, mask =  next(iter_ex)
print(f'Model Input Item Idx are : {inputs}')
print(f'Label Item Idx are : {"":5} {labels}')
print(f'Previous Masked Input Idx are {mask}')

Model Input Item Idx are : [19  1  7  9]
Label Item Idx are :       [20  2  8 10]
Previous Masked Input Idx are [0]


<br>

### Step 3. 모델 구성

In [36]:
#Evaluation Metric

def mrr_k(pred, truth: int, k: int):
    indexing = np.where(pred[:k] == truth)[0]
    if len(indexing) > 0:
        return 1 / (indexing[0] + 1)
    else:
        return 0


def recall_k(pred, truth: int, k: int) -> int:
    answer = truth in pred[:k]
    return int(answer)

In [37]:
from tensorflow.keras.layers import Input, Dense, Dropout, GRU
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

In [38]:
def create_model(args):
    inputs = Input(batch_shape=(args.batch_size, 1, args.num_items))
    gru, _ = GRU(args.hsz, stateful=True, return_state=True, name='GRU')(inputs)
    dropout = Dropout(args.drop_rate)(gru)
    predictions = Dense(args.num_items, activation='softmax')(dropout)
    model = Model(inputs=inputs, outputs=[predictions])
    model.compile(loss=categorical_crossentropy, optimizer=RMSprop(args.lr), metrics=['accuracy'])
    model.summary()
    return model

In [39]:
#모델에 사용할 hyper-parameter를 class형식으로 관리

class Args:
    def __init__(self, train, val, test, batch_size, hsz, drop_rate, lr, epochs, k):
        self.train = train
        self.val = val
        self.test = test
        self.num_items = train['ItemId'].nunique()
        self.num_sessions = train['SessionId'].nunique()
        self.batch_size = batch_size
        self.hsz = hsz
        self.drop_rate = drop_rate
        self.lr = lr
        self.epochs = epochs
        self.k = k

#args = Args(train, val, test, batch_size=128, hsz=50, drop_rate=0.1, lr=0.001, epochs=15, k=20)
args = Args(train, val, test, batch_size=256, hsz=50, drop_rate=0.1, lr=0.0001, epochs=20, k=20)

In [40]:
model = create_model(args)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(256, 1, 3612)]          0         
                                                                 
 GRU (GRU)                   [(256, 50),               549600    
                              (256, 50)]                         
                                                                 
 dropout (Dropout)           (256, 50)                 0         
                                                                 
 dense (Dense)               (256, 3612)               184212    
                                                                 
Total params: 733,812
Trainable params: 733,812
Non-trainable params: 0
_________________________________________________________________


<br>

### Step 4. 모델 학습

In [41]:
#train 셋으로 학습하면서 valid 셋으로 검증.
def train_model(model, args):
    train_dataset = SessionDataset(args.train)
    train_loader = SessionDataLoader(train_dataset, batch_size=args.batch_size)

    for epoch in range(1, args.epochs + 1):
        total_step = len(args.train) - args.train['SessionId'].nunique()
        tr_loader = tqdm(train_loader, total=total_step // args.batch_size, desc='Train', mininterval=1)
        for feat, target, mask in tr_loader:
            reset_hidden_states(model, mask)  #종료된 session은 hidden_state를 초기화. 아래 메서드에서 확인할 수 있음.

            input_ohe = to_categorical(feat, num_classes=args.num_items)
            input_ohe = np.expand_dims(input_ohe, axis=1)
            target_ohe = to_categorical(target, num_classes=args.num_items)

            result = model.train_on_batch(input_ohe, target_ohe)
            tr_loader.set_postfix(train_loss=result[0], accuracy = result[1])

        val_recall, val_mrr = get_metrics(args.val, model, args, args.k)  #valid set에 대해 검증.

        print(f"\t - Recall@{args.k} epoch {epoch}: {val_recall:3f}")
        print(f"\t - MRR@{args.k}    epoch {epoch}: {val_mrr:3f}\n")

def reset_hidden_states(model, mask):
    gru_layer = model.get_layer(name='GRU')  #model에서 gru layer를 가져옴.
    hidden_states = gru_layer.states[0].numpy()  #gru_layer의 parameter를 가져옴.
    for elt in mask:  #mask된 인덱스 즉, 종료된 세션의 인덱스를 돌면서
        hidden_states[elt, :] = 0  #parameter를 초기화 함.
    gru_layer.reset_states(states=hidden_states)

#valid셋과 test셋을 평가하는 코드
def get_metrics(data, model, args, k: int): 
    
    #train과 거의 같지만 mrr, recall을 구하는 라인이 있음.
    dataset = SessionDataset(data)
    loader = SessionDataLoader(dataset, batch_size=args.batch_size)
    recall_list, mrr_list = [], []

    total_step = len(data) - data['SessionId'].nunique()
    for inputs, label, mask in tqdm(loader, total=total_step // args.batch_size, desc='Evaluation', mininterval=1):
        reset_hidden_states(model, mask)
        input_ohe = to_categorical(inputs, num_classes=args.num_items)
        input_ohe = np.expand_dims(input_ohe, axis=1)

        pred = model.predict(input_ohe, batch_size=args.batch_size)
        pred_arg = tf.argsort(pred, direction='DESCENDING')  #softmax 값이 큰 순서대로 sorting.

        length = len(inputs)
        recall_list.extend([recall_k(pred_arg[i], label[i], k) for i in range(length)])
        mrr_list.extend([mrr_k(pred_arg[i], label[i], k) for i in range(length)])

    recall, mrr = np.mean(recall_list), np.mean(mrr_list)
    return recall, mrr

In [42]:
train_model(model, args)

Train: 100%|█████████▉| 1582/1583 [00:44<00:00, 35.54it/s, accuracy=0, train_loss=7.44]
Evaluation:  96%|█████████▌| 24/25 [00:45<00:01,  1.89s/it]


	 - Recall@20 epoch 1: 0.047363
	 - MRR@20    epoch 1: 0.010224



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 35.15it/s, accuracy=0.00391, train_loss=7.41]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.84s/it]


	 - Recall@20 epoch 2: 0.048665
	 - MRR@20    epoch 2: 0.010061



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 34.68it/s, accuracy=0.00391, train_loss=7.39]
Evaluation:  96%|█████████▌| 24/25 [00:45<00:01,  1.88s/it]


	 - Recall@20 epoch 3: 0.051432
	 - MRR@20    epoch 3: 0.011651



Train: 100%|█████████▉| 1582/1583 [00:46<00:00, 34.30it/s, accuracy=0.00781, train_loss=7.32]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.86s/it]


	 - Recall@20 epoch 4: 0.052246
	 - MRR@20    epoch 4: 0.012865



Train: 100%|█████████▉| 1582/1583 [00:46<00:00, 34.11it/s, accuracy=0.00781, train_loss=7.26]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.86s/it]


	 - Recall@20 epoch 5: 0.055339
	 - MRR@20    epoch 5: 0.014217



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 34.63it/s, accuracy=0.00781, train_loss=7.24]
Evaluation:  96%|█████████▌| 24/25 [00:43<00:01,  1.81s/it]


	 - Recall@20 epoch 6: 0.060872
	 - MRR@20    epoch 6: 0.014538



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 35.01it/s, accuracy=0.00781, train_loss=7.21]
Evaluation:  96%|█████████▌| 24/25 [00:43<00:01,  1.81s/it]


	 - Recall@20 epoch 7: 0.066895
	 - MRR@20    epoch 7: 0.016191



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 34.86it/s, accuracy=0.0117, train_loss=7.18]
Evaluation:  96%|█████████▌| 24/25 [00:43<00:01,  1.81s/it]


	 - Recall@20 epoch 8: 0.071777
	 - MRR@20    epoch 8: 0.017551



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 34.79it/s, accuracy=0.0117, train_loss=7.13]
Evaluation:  96%|█████████▌| 24/25 [00:43<00:01,  1.79s/it]


	 - Recall@20 epoch 9: 0.078613
	 - MRR@20    epoch 9: 0.020194



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 34.76it/s, accuracy=0.0195, train_loss=7.08]
Evaluation:  96%|█████████▌| 24/25 [00:42<00:01,  1.79s/it]


	 - Recall@20 epoch 10: 0.084798
	 - MRR@20    epoch 10: 0.022841



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 34.98it/s, accuracy=0.0195, train_loss=7.04]
Evaluation:  96%|█████████▌| 24/25 [00:42<00:01,  1.76s/it]


	 - Recall@20 epoch 11: 0.090983
	 - MRR@20    epoch 11: 0.024002



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 34.75it/s, accuracy=0.0234, train_loss=7]
Evaluation:  96%|█████████▌| 24/25 [00:41<00:01,  1.74s/it]


	 - Recall@20 epoch 12: 0.095052
	 - MRR@20    epoch 12: 0.025220



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 34.98it/s, accuracy=0.0273, train_loss=6.96]
Evaluation:  96%|█████████▌| 24/25 [00:41<00:01,  1.74s/it]


	 - Recall@20 epoch 13: 0.101400
	 - MRR@20    epoch 13: 0.026976



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 35.05it/s, accuracy=0.0312, train_loss=6.93]
Evaluation:  96%|█████████▌| 24/25 [00:41<00:01,  1.75s/it]


	 - Recall@20 epoch 14: 0.107422
	 - MRR@20    epoch 14: 0.029049



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 34.86it/s, accuracy=0.0391, train_loss=6.88]
Evaluation:  96%|█████████▌| 24/25 [00:42<00:01,  1.75s/it]


	 - Recall@20 epoch 15: 0.113607
	 - MRR@20    epoch 15: 0.031074



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 34.71it/s, accuracy=0.043, train_loss=6.88]
Evaluation:  96%|█████████▌| 24/25 [00:41<00:01,  1.74s/it]


	 - Recall@20 epoch 16: 0.120443
	 - MRR@20    epoch 16: 0.033212



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 35.06it/s, accuracy=0.0352, train_loss=6.84]
Evaluation:  96%|█████████▌| 24/25 [00:41<00:01,  1.72s/it]


	 - Recall@20 epoch 17: 0.128092
	 - MRR@20    epoch 17: 0.036188



Train: 100%|█████████▉| 1582/1583 [00:44<00:00, 35.27it/s, accuracy=0.043, train_loss=6.81]
Evaluation:  96%|█████████▌| 24/25 [00:41<00:01,  1.72s/it]


	 - Recall@20 epoch 18: 0.136068
	 - MRR@20    epoch 18: 0.038852



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 35.02it/s, accuracy=0.0469, train_loss=6.79]
Evaluation:  96%|█████████▌| 24/25 [00:40<00:01,  1.70s/it]


	 - Recall@20 epoch 19: 0.142741
	 - MRR@20    epoch 19: 0.041397



Train: 100%|█████████▉| 1582/1583 [00:45<00:00, 35.06it/s, accuracy=0.0547, train_loss=6.78]
Evaluation:  96%|█████████▌| 24/25 [00:40<00:01,  1.69s/it]

	 - Recall@20 epoch 20: 0.145996
	 - MRR@20    epoch 20: 0.043744






<br>

### Step 5. 모델 테스트

In [43]:
def test_model(model, args, test):
    test_recall, test_mrr = get_metrics(test, model, args, 20)
    print(f"\t - Recall@{args.k}: {test_recall:3f}")
    print(f"\t - MRR@{args.k}: {test_mrr:3f}\n")

test_model(model, args, test)

Evaluation:  75%|███████▌  | 3/4 [00:05<00:01,  1.75s/it]

	 - Recall@20: 0.148438
	 - MRR@20: 0.054509






<br>

#### 5-1. 모델 구조 변경

In [44]:
# droupout, gru 층 추가

def reset_hidden_states(model, mask): 
    gru_layer = model.get_layer(name='GRU1')  # model에서 gru layer를 가져옵니다.
    hidden_states = gru_layer.states[0].numpy()  # gru_layer의 parameter를 가져옵니다.
    for elt in mask:  # mask된 인덱스 즉, 종료된 세션의 인덱스를 돌면서
        hidden_states[elt, :] = 0  # parameter를 초기화 합니다.
    gru_layer.reset_states(states=hidden_states)
    
    gru_layer = model.get_layer(name='GRU2')  # model에서 gru layer를 가져옵니다.
    hidden_states = gru_layer.states[0].numpy()  # gru_layer의 parameter를 가져옵니다.
    for elt in mask:  # mask된 인덱스 즉, 종료된 세션의 인덱스를 돌면서
        hidden_states[elt, :] = 0  # parameter를 초기화 합니다.
    gru_layer.reset_states(states=hidden_states)

In [45]:
def create_model2(args):
    inputs = Input(batch_shape=(args.batch_size, 1, args.num_items))
    gru, _ = GRU(args.hsz, stateful=True, return_state=True,return_sequences=True, name='GRU1')(inputs)
    dropout = Dropout(args.drop_rate)(gru)
    gru, _ = GRU(args.hsz, stateful=True, return_state=True, name='GRU2')(dropout)
    dropout = Dropout(args.drop_rate)(gru)    
    predictions = Dense(args.num_items, activation='softmax')(dropout)
    
    model = Model(inputs=inputs, outputs=[predictions])
    model.compile(loss=categorical_crossentropy, optimizer=Adam(args.lr), metrics=['accuracy'])
    model.summary()
    return model

In [46]:
model = create_model2(args)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(256, 1, 3612)]          0         
                                                                 
 GRU1 (GRU)                  [(256, 1, 50),            549600    
                              (256, 50)]                         
                                                                 
 dropout_1 (Dropout)         (256, 1, 50)              0         
                                                                 
 GRU2 (GRU)                  [(256, 50),               15300     
                              (256, 50)]                         
                                                                 
 dropout_2 (Dropout)         (256, 50)                 0         
                                                                 
 dense_1 (Dense)             (256, 3612)               1842

In [47]:
train_model(model, args)

Train: 100%|█████████▉| 1582/1583 [00:52<00:00, 30.38it/s, accuracy=0.00781, train_loss=7.41]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.85s/it]


	 - Recall@20 epoch 1: 0.046875
	 - MRR@20    epoch 1: 0.010178



Train: 100%|█████████▉| 1582/1583 [00:49<00:00, 32.12it/s, accuracy=0.00391, train_loss=7.39]
Evaluation:  96%|█████████▌| 24/25 [00:43<00:01,  1.81s/it]


	 - Recall@20 epoch 2: 0.046224
	 - MRR@20    epoch 2: 0.010275



Train: 100%|█████████▉| 1582/1583 [00:49<00:00, 32.02it/s, accuracy=0.00781, train_loss=7.39]
Evaluation:  96%|█████████▌| 24/25 [00:43<00:01,  1.82s/it]


	 - Recall@20 epoch 3: 0.047852
	 - MRR@20    epoch 3: 0.010356



Train: 100%|█████████▉| 1582/1583 [00:49<00:00, 32.15it/s, accuracy=0, train_loss=7.38]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.86s/it]


	 - Recall@20 epoch 4: 0.046224
	 - MRR@20    epoch 4: 0.010255



Train: 100%|█████████▉| 1582/1583 [00:49<00:00, 32.27it/s, accuracy=0.00391, train_loss=7.38]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.85s/it]


	 - Recall@20 epoch 5: 0.046875
	 - MRR@20    epoch 5: 0.010199



Train: 100%|█████████▉| 1582/1583 [00:49<00:00, 32.28it/s, accuracy=0.00391, train_loss=7.37]
Evaluation:  96%|█████████▌| 24/25 [00:43<00:01,  1.83s/it]


	 - Recall@20 epoch 6: 0.046875
	 - MRR@20    epoch 6: 0.010311



Train: 100%|█████████▉| 1582/1583 [00:48<00:00, 32.30it/s, accuracy=0.00391, train_loss=7.38]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.84s/it]


	 - Recall@20 epoch 7: 0.046061
	 - MRR@20    epoch 7: 0.010282



Train: 100%|█████████▉| 1582/1583 [00:48<00:00, 32.33it/s, accuracy=0.00391, train_loss=7.38]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.84s/it]


	 - Recall@20 epoch 8: 0.047852
	 - MRR@20    epoch 8: 0.010356



Train: 100%|█████████▉| 1582/1583 [00:48<00:00, 32.55it/s, accuracy=0.00391, train_loss=7.38]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.86s/it]


	 - Recall@20 epoch 9: 0.046061
	 - MRR@20    epoch 9: 0.010135



Train: 100%|█████████▉| 1582/1583 [00:49<00:00, 32.17it/s, accuracy=0.00391, train_loss=7.37]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.85s/it]


	 - Recall@20 epoch 10: 0.047852
	 - MRR@20    epoch 10: 0.010367



Train: 100%|█████████▉| 1582/1583 [00:48<00:00, 32.32it/s, accuracy=0.00391, train_loss=7.35]
Evaluation:  96%|█████████▌| 24/25 [00:43<00:01,  1.83s/it]


	 - Recall@20 epoch 11: 0.048014
	 - MRR@20    epoch 11: 0.010361



Train: 100%|█████████▉| 1582/1583 [00:48<00:00, 32.48it/s, accuracy=0.00391, train_loss=7.31]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.84s/it]


	 - Recall@20 epoch 12: 0.046875
	 - MRR@20    epoch 12: 0.010286



Train: 100%|█████████▉| 1582/1583 [00:48<00:00, 32.40it/s, accuracy=0.00781, train_loss=7.27]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.85s/it]


	 - Recall@20 epoch 13: 0.048828
	 - MRR@20    epoch 13: 0.011159



Train: 100%|█████████▉| 1582/1583 [00:49<00:00, 32.16it/s, accuracy=0.00781, train_loss=7.22]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.85s/it]


	 - Recall@20 epoch 14: 0.054525
	 - MRR@20    epoch 14: 0.012586



Train: 100%|█████████▉| 1582/1583 [00:49<00:00, 32.14it/s, accuracy=0.0117, train_loss=7.18]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.85s/it]


	 - Recall@20 epoch 15: 0.059570
	 - MRR@20    epoch 15: 0.014076



Train: 100%|█████████▉| 1582/1583 [00:48<00:00, 32.38it/s, accuracy=0.00781, train_loss=7.13]
Evaluation:  96%|█████████▌| 24/25 [00:43<00:01,  1.83s/it]


	 - Recall@20 epoch 16: 0.062663
	 - MRR@20    epoch 16: 0.014634



Train: 100%|█████████▉| 1582/1583 [00:48<00:00, 32.34it/s, accuracy=0.00781, train_loss=7.09]
Evaluation:  96%|█████████▌| 24/25 [00:44<00:01,  1.84s/it]


	 - Recall@20 epoch 17: 0.065755
	 - MRR@20    epoch 17: 0.015734



Train: 100%|█████████▉| 1582/1583 [00:48<00:00, 32.45it/s, accuracy=0.0117, train_loss=7.05]
Evaluation:  96%|█████████▌| 24/25 [00:43<00:01,  1.82s/it]


	 - Recall@20 epoch 18: 0.068685
	 - MRR@20    epoch 18: 0.016067



Train: 100%|█████████▉| 1582/1583 [00:48<00:00, 32.53it/s, accuracy=0.0117, train_loss=6.99]
Evaluation:  96%|█████████▌| 24/25 [00:43<00:01,  1.80s/it]


	 - Recall@20 epoch 19: 0.070801
	 - MRR@20    epoch 19: 0.016542



Train: 100%|█████████▉| 1582/1583 [00:48<00:00, 32.54it/s, accuracy=0.0156, train_loss=6.98]
Evaluation:  96%|█████████▌| 24/25 [00:43<00:01,  1.80s/it]

	 - Recall@20 epoch 20: 0.074219
	 - MRR@20    epoch 20: 0.017004






In [48]:
test_model(model, args, test)

Evaluation:  75%|███████▌  | 3/4 [00:05<00:01,  1.84s/it]

	 - Recall@20: 0.085938
	 - MRR@20: 0.021718






In [49]:
def reset_hidden_states(model, mask):
    gru_layer = model.get_layer(name='GRU')  # model에서 gru layer를 가져옵니다.
    hidden_states = gru_layer.states[0].numpy()  # gru_layer의 parameter를 가져옵니다.
    for elt in mask:  # mask된 인덱스 즉, 종료된 세션의 인덱스를 돌면서
        hidden_states[elt, :] = 0  # parameter를 초기화 합니다.
    gru_layer.reset_states(states=hidden_states)

In [53]:
def load_data(data_path: Path, nrows=None):
    data = pd.read_csv(data_path, sep='::', header=None, usecols=[0, 1, 2, 3], dtype={0: np.int32, 1: np.int32, 2: np.int32}, nrows=nrows)
    data.columns = ['UserId', 'ItemId', 'Rating', 'Time']
    return data

data = load_data(train_path, None)
data.sort_values(['UserId', 'Time'], inplace=True)  # data를 id와 시간 순서로 정렬해줍니다.
data

Unnamed: 0,UserId,ItemId,Rating,Time
31,1,3186,4,978300019
22,1,1270,5,978300055
27,1,1721,4,978300055
37,1,1022,5,978300055
24,1,2340,3,978300103
...,...,...,...,...
1000019,6040,2917,4,997454429
999988,6040,1921,4,997454464
1000172,6040,1784,3,997454464
1000167,6040,161,3,997454486


In [67]:
from datetime import datetime

times = data["Time"]
time_lst = []

for time in times: 
    temp_date = datetime.fromtimestamp(time) 
    time_lst.append(temp_date)
    
data["Time"] = time_lst
data

Unnamed: 0,SessionId,ItemId,Rating,Time
31,1,3186,4,2000-12-31 22:00:19
22,1,1270,5,2000-12-31 22:00:55
27,1,1721,4,2000-12-31 22:00:55
37,1,1022,5,2000-12-31 22:00:55
24,1,2340,3,2000-12-31 22:01:43
...,...,...,...,...
1000019,6040,2917,4,2001-08-10 14:40:29
999988,6040,1921,4,2001-08-10 14:41:04
1000172,6040,1784,3,2001-08-10 14:41:04
1000167,6040,161,3,2001-08-10 14:41:26


In [68]:
oldest, latest = data['Time'].min(), data['Time'].max()
print(oldest) 
print(latest)

2000-04-25 23:05:32
2003-02-28 17:49:50


In [69]:
# short_session을 제거한 다음 unpopular item을 제거하면 다시 길이가 1인 session이 생길 수 있습니다.
# 이를 위해 반복문을 통해 지속적으로 제거 합니다.
def cleanse_recursive(data: pd.DataFrame, shortest, least_click) -> pd.DataFrame:
    while True:
        before_len = len(data)
        data = cleanse_short_session(data, shortest)
        data = cleanse_unpopular_item(data, least_click)
        after_len = len(data)
        if before_len == after_len:
            break
    return data


def cleanse_short_session(data: pd.DataFrame, shortest):
    session_len = data.groupby('SessionId').size()
    session_use = session_len[session_len >= shortest].index
    data = data[data['SessionId'].isin(session_use)]
    return data


def cleanse_unpopular_item(data: pd.DataFrame, least_click):
    item_popular = data.groupby('ItemId').size()
    item_use = item_popular[item_popular >= least_click].index
    data = data[data['ItemId'].isin(item_use)]
    return data

In [70]:
data_user = cleanse_recursive(data, shortest=2, least_click=5)
data_user

Unnamed: 0,SessionId,ItemId,Rating,Time
31,1,3186,4,2000-12-31 22:00:19
22,1,1270,5,2000-12-31 22:00:55
27,1,1721,4,2000-12-31 22:00:55
37,1,1022,5,2000-12-31 22:00:55
24,1,2340,3,2000-12-31 22:01:43
...,...,...,...,...
1000019,6040,2917,4,2001-08-10 14:40:29
999988,6040,1921,4,2001-08-10 14:41:04
1000172,6040,1784,3,2001-08-10 14:41:04
1000167,6040,161,3,2001-08-10 14:41:26


In [71]:
data.columns = ['SessionId', 'ItemId', 'Rating', 'Time']
data

Unnamed: 0,SessionId,ItemId,Rating,Time
31,1,3186,4,2000-12-31 22:00:19
22,1,1270,5,2000-12-31 22:00:55
27,1,1721,4,2000-12-31 22:00:55
37,1,1022,5,2000-12-31 22:00:55
24,1,2340,3,2000-12-31 22:01:43
...,...,...,...,...
1000019,6040,2917,4,2001-08-10 14:40:29
999988,6040,1921,4,2001-08-10 14:41:04
1000172,6040,1784,3,2001-08-10 14:41:04
1000167,6040,161,3,2001-08-10 14:41:26


In [72]:
session_length = data.groupby('SessionId').size()
session_length

SessionId
1        53
2       129
3        51
4        21
5       198
       ... 
6036    888
6037    202
6038     20
6039    123
6040    341
Length: 6040, dtype: int64

In [73]:
oldest, latest = data['Time'].min(), data['Time'].max()
print(oldest) 
print(latest)

2000-04-25 23:05:32
2003-02-28 17:49:50


In [74]:
data_user = data_user.dropna(axis=0)
data_user

Unnamed: 0,SessionId,ItemId,Rating,Time
31,1,3186,4,2000-12-31 22:00:19
22,1,1270,5,2000-12-31 22:00:55
27,1,1721,4,2000-12-31 22:00:55
37,1,1022,5,2000-12-31 22:00:55
24,1,2340,3,2000-12-31 22:01:43
...,...,...,...,...
1000019,6040,2917,4,2001-08-10 14:40:29
999988,6040,1921,4,2001-08-10 14:41:04
1000172,6040,1784,3,2001-08-10 14:41:04
1000167,6040,161,3,2001-08-10 14:41:26


In [75]:
def split_by_date(data: pd.DataFrame, n_days: int):
    final_time = data['Time'].max()
    session_last_time = data.groupby('SessionId')['Time'].max()
    session_in_train = session_last_time[session_last_time < final_time - dt.timedelta(n_days)].index
    session_in_test = session_last_time[session_last_time >= final_time - dt.timedelta(n_days)].index

    before_date = data[data['SessionId'].isin(session_in_train)]
    after_date = data[data['SessionId'].isin(session_in_test)]
    after_date = after_date[after_date['ItemId'].isin(before_date['ItemId'])]
    return before_date, after_date

In [76]:
tr, test = split_by_date(data_user, n_days=30)
tr, val = split_by_date(tr, n_days=30)

In [77]:
# data에 대한 정보를 살펴봅니다.
def stats_info(data: pd.DataFrame, status: str):
    print(f'* {status} Set Stats Info\n'
          f'\t Events: {len(data)}\n'
          f'\t Sessions: {data["SessionId"].nunique()}\n'
          f'\t Items: {data["ItemId"].nunique()}\n'
          f'\t First Time : {data["Time"].min()}\n'
          f'\t Last Time : {data["Time"].max()}\n')

In [78]:
stats_info(tr, 'train')
stats_info(val, 'valid')
stats_info(test, 'test')

* train Set Stats Info
	 Events: 919209
	 Sessions: 5858
	 Items: 3416
	 First Time : 2000-04-25 23:05:32
	 Last Time : 2002-12-30 02:26:14

* valid Set Stats Info
	 Events: 29477
	 Sessions: 79
	 Items: 2960
	 First Time : 2000-05-05 17:20:21
	 Last Time : 2003-01-29 03:00:40

* test Set Stats Info
	 Events: 50925
	 Sessions: 103
	 Items: 3172
	 First Time : 2000-05-01 11:15:13
	 Last Time : 2003-02-28 17:49:50



In [None]:
def reset_hidden_states(model, mask): # hsz 값과 drop_rate 값을 변경
    gru_layer = model.get_layer(name='GRU')  # model에서 gru layer를 가져옵니다.
    hidden_states = gru_layer.states[0].numpy()  # gru_layer의 parameter를 가져옵니다.
    for elt in mask:  # mask된 인덱스 즉, 종료된 세션의 인덱스를 돌면서
        hidden_states[elt, :] = 0  # parameter를 초기화 합니다.
    gru_layer.reset_states(states=hidden_states)

In [None]:
args = Args(tr, val, test, batch_size=64, hsz=30, drop_rate=0.2, lr=0.001, epochs=3, k=20)

In [None]:
model = create_model(args)

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(64, 1, 3405)]           0         
_________________________________________________________________
GRU (GRU)                    [(64, 30), (64, 30)]      309330    
_________________________________________________________________
dropout_8 (Dropout)          (64, 30)                  0         
_________________________________________________________________
dense_5 (Dense)              (64, 3405)                105555    
Total params: 414,885
Trainable params: 414,885
Non-trainable params: 0
_________________________________________________________________


In [None]:
# 학습
train_model(model, args)

Train:  99%|█████████▉| 15068/15172 [01:58<00:00, 127.61it/s, accuracy=0.0156, train_loss=6.15]
Evaluation:  15%|█▌        | 5/33 [00:01<00:06,  4.02it/s]
Train:   0%|          | 0/15172 [00:00<?, ?it/s, accuracy=0.0156, train_loss=6.31]

	 - Recall@20 epoch 1: 0.096875
	 - MRR@20    epoch 1: 0.028244



Train:  99%|█████████▉| 15068/15172 [02:10<00:00, 115.40it/s, accuracy=0.0156, train_loss=5.76]
Evaluation:  15%|█▌        | 5/33 [00:01<00:05,  4.85it/s]
Train:   0%|          | 0/15172 [00:00<?, ?it/s, accuracy=0.0469, train_loss=5.81]

	 - Recall@20 epoch 2: 0.118750
	 - MRR@20    epoch 2: 0.041895



Train:  99%|█████████▉| 15068/15172 [01:52<00:00, 134.44it/s, accuracy=0.0469, train_loss=5.77]
Evaluation:  15%|█▌        | 5/33 [00:01<00:05,  4.97it/s]

	 - Recall@20 epoch 3: 0.131250
	 - MRR@20    epoch 3: 0.051495






In [None]:
# 테스트
test_model(model, args, test)

Evaluation:   6%|▌         | 2/33 [00:00<00:07,  4.41it/s]

	 - Recall@20: 0.062500
	 - MRR@20: 0.005799






## ✔️회고 및 결론
---

#### ▶️ 이번 프로젝트에서 어려웠던 점

에포크나 드롭아웃, 레이어가 추가될때마다 Recall@k 지표와 MRR@k 값이 어떻게 변화하는지를 제대로 확인하고 싶었는데, 시간상 몇번만으로 끝낼 수 밖에 없었다는 점이 개인적으로는 아쉬웠다. 심지어 코랩 TPU에서 한번 학습하는데에 기본 30분이 걸리는데, 이걸 여덜,ㅁ 아홉번 돌린다면 기다리는데만 해도 한세월이겠구나 싶었다.

<br>

#### ▶️ 프로젝트를 진행하면서 알아낸 점 혹은 아직 모호한 점

recall@k는 전체 relevant한 아이템 중 추천된 아이템이 속한 비율인데, 전체 relevant한 아이템의 개수에서 분자로 k개의 추천 중 relevant한 아이템의 개수를 쓴다고 나와있어서 이렇게만 보기에는 개념을 확실히 인지하기 힘들었던 것 같다. 

이번에 새롭게 알게 된 MRR은 Mean Reciprocal Rank로, Reciprocal rank는 첫 번째로 등장하는 relevant한 아이템이 우리의 추천상 몇 번째에 위치하는지를 나타내는 지표라고 되어있었는데 그대로 쓰는게 아니라 역수를 취한다는게 특이했다.  @ 뒤에 붙은 k가 추천된 개수라는 건 이번에 처음 알았다......



<br>


#### ▶️ 루브릭 평가 지표를 맞추기 위해 시도한 것들

이번 프로젝트에서 루브릭의 기준은 아래와 같았다.

|       |                                    **< 평가문항 >**                                   |                                                              **< 상세기준 >**                                                             |
|-------|:---------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------|
| **1** | Movielens 데이터셋을 session based recommendation 관점으로 전처리하는 과정이 체계적으로 진행되었다.                                 | 데이터셋의 면밀한 분석을 토대로 세션단위 정의 과정(길이분석, 시간분석)을 합리적으로 수행한 과정이 기술되었다.<br>                                                        |
| **2** | RNN 기반의 예측 모델이 정상적으로 구성되어 안정적으로 훈련이 진행되었다. | 적절한 epoch만큼의 학습이 진행되는 과정에서 train loss가 안정적으로 감소하고, validation 단계에서의 Recall, MRR이 개선되는 것이 확인된다.                               |
| **3** | 세션정의, 모델구조, 하이퍼파라미터 등을 변경해서 실험하여 Recall, MRR 등의 변화추이를 관찰하였다.                                    | 3가지 이상의 변화를 시도하고 그 실험결과를 체계적으로 분석하였다. |


1. 기본으로 쓰였던 모델

![20220703221835](https://user-images.githubusercontent.com/100528803/177041771-bf4bb65b-a02b-45b3-a1cf-2aac02043d26.png)

2.  드롭아웃과 gru를 하나씩 더 추가한 모델

![20220703221845](https://user-images.githubusercontent.com/100528803/177041772-bc13f984-d996-41a6-8a0b-16c0bee65a80.png)

3. 하이퍼 파라미터(hsz, drop_rate)가 변경된 모델

![20220703221857](https://user-images.githubusercontent.com/100528803/177041773-46e397ec-ce65-450f-80d9-c586f9151e14.png)

제일 신경쓰였던건 훈련 게이지... 마지막에는 거의 차오르지 않는다는 점에서 계속 신경이 쓰였는데, 반대로 리콜이나 MRR 값들은 꾸준하게 낮아졌다는 점에서 작은 의의를 갖기로 했다. 게이지 같은 경우는 배치 사이즈를 바꾸면 된다는 조언도 들었는데, 거기까지 테스트하지못한 건 조금 아쉽다. 드롭아웃을 최근에 안쓰는 경향이 됬다는 말을 한번 들었었는데, 오히려 지금 보니까 의외로 성능이 괜찮아서 나라면 좀 더 써도 될 것 같다는 생각이 꾸준하게 들게 만들었다.

<br>

#### ▶️ 자기 다짐

session baed recommendation이라는 걸 이번에 처음 써보긴했는데, 제대로 개념을 잡으려면 아무래도 몇번은 더 돌려봐야 감을 잡을 것 같다. 지금 이대로 보기에는 앞서 행렬 인수분해로 썼던 거랑 조금 개념이 애매할 수도 있겠다는 생각이 들었다. 과정들을 겪으면서 loss값이 잘떨어지는 건 뿌듯했는데, 아무래도 학습 횟수나 배치사이즈를 좀 더 능숙하게 다룬다면 좋은 모델성능을 기대할 수도 있을 것 같다. 다만 추천 시스템은 아직 나한테는 생소하면서도 어려운 주제다. 하지만 충분히 마케팅 쪽에서는 할만한 가치가 있는 시스템 같다....

<br>

## ✔️Reference(참고자료)
---
* https://zzaebok.github.io/recommender_system/metrics/rec_metrics/
* https://abluesnake.tistory.com/98
* https://meissa.tistory.com/m/66?category=938854
* https://www.kaggle.com/datasets/chadgostopp/recsys-challenge-2015