In [1]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn.init import xavier_uniform_, xavier_normal_
from torch.nn.utils.rnn import pad_sequence

from process_data import pre_process




In [2]:
class GRU4REC(nn.Module):
    """GRU-based next-item scorer.

    Item ids are embedded, run through a GRU, projected back to the embedding
    dimension by a linear layer, and scored against every row of the item
    embedding table with a dot product. Item index 0 is the padding index.
    """

    def __init__(self, input_size, output_size, embedding_size, hidden_size, n_layers=1, dp=0.3):
        """
        Params:
            input_size: number of rows of the item embedding table (row 0 is padding)
            output_size: stored on the instance only; the original note states
                input_size = output_size = num_item
            embedding_size: width of the item embeddings
            hidden_size: width of the GRU hidden state
            n_layers: number of stacked GRU layers
            dp: dropout probability applied to the embeddings in `forward`
        """
        super(GRU4REC, self).__init__()

        # === hyper-parameters ===
        self.input_size = input_size
        self.output_size = output_size
        self.embedding_size = embedding_size
        self.dropout = dp
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # === layers ===
        # 1. embedding layer
        self.item_embedding = nn.Embedding(self.input_size, self.embedding_size, padding_idx=0)
        self.emb_dropout = nn.Dropout(self.dropout)
        # 2. GRU layer
        self.gru = nn.GRU(input_size=self.embedding_size,
                          hidden_size=self.hidden_size,
                          num_layers=self.n_layers,
                          bias=False,
                          batch_first=True)
        # 3. feed-forward layer: hidden_size -> embedding_size
        self.feedforward = nn.Linear(self.hidden_size, self.embedding_size)

        # === parameter initialisation ===
        xavier_normal_(self.item_embedding.weight)
        # xavier_normal_ fills every row, including the padding row that
        # nn.Embedding had zeroed; put the zeros back so padding stays neutral.
        with torch.no_grad():
            self.item_embedding.weight[self.item_embedding.padding_idx].fill_(0)
        xavier_uniform_(self.gru.weight_hh_l0)
        xavier_uniform_(self.gru.weight_ih_l0)

    def forward(self, item_seq, item_seq_len=None):
        """
        Params:
            item_seq, shape = (batch_size, seq_len)
            item_seq_len: accepted for interface compatibility; not used here
        Return:
            scores, shape = (batch_size*seq_len, number of embedding rows).
            Every position is scored, padded positions included.
        """

        # embedding layer
        seq_embedding = self.item_embedding(item_seq)
        gru_input = self.emb_dropout(seq_embedding)

        # GRU layer
        gru_output, _ = self.gru(gru_input)

        # feed-forward layer
        output = self.feedforward(gru_output)

        # Map back to item space: last dimension embedding_size -> number of items
        output = output @ self.item_embedding.weight.T

        return output.reshape(-1, output.shape[-1])

    def predict(self, item_seq, item_seq_len=None):
        """Score all items for the step following each sequence.

        Params:
            item_seq, shape = (batch_size, seq_len); 0 marks padding
            item_seq_len: optional per-row count of valid items, shape = (batch_size,).
                When omitted, the last non-zero position of each row is used,
                which works for both right- and left-padded batches.
        Return:
            final_score, shape = (batch_size, item_size)
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                seq_embedding = self.item_embedding(item_seq)  # [bs, seq_len, embedding_size]
                gru_output, _ = self.gru(seq_embedding)  # [bs, seq_len, hidden_size]

                if item_seq_len is None:
                    # Index of the last non-padding item in each row (0 for all-padding rows).
                    positions = torch.arange(item_seq.size(1), device=item_seq.device)
                    last_idx = ((item_seq != 0).long() * positions).max(dim=1).values
                else:
                    lengths = torch.as_tensor(item_seq_len, device=item_seq.device).long()
                    last_idx = (lengths - 1).clamp(min=0)

                # Read the hidden state at the last real item, not at position -1:
                # in a right-padded batch position -1 has already consumed padding steps.
                rows = torch.arange(item_seq.size(0), device=item_seq.device)
                last_hidden = gru_output[rows, last_idx]  # [bs, hidden_size]

                # Project and score only the selected timestep.
                last_hidden = self.feedforward(last_hidden)  # [bs, embedding_size]
                final_score = last_hidden @ self.item_embedding.weight.T  # [bs, item_size]
        finally:
            # Restore whatever mode the caller had instead of forcing train mode.
            self.train(was_training)
        return final_score


In [3]:
class GetResult:
    """Run a pretrained GRU4REC checkpoint over a dataset's test sessions.

    Construction loads the item mappings, the test sessions and the model;
    `get_result` returns one row per session with its top-k predicted items.
    """

    def __init__(self, dataset='Beauty', best_model='gru4rec', batch_size=128):
        """
        Params:
            dataset: dataset name, used in the data and checkpoint paths
            best_model: checkpoint file name (without the `.pt` suffix)
            batch_size: number of sessions per inference batch
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.dataset = dataset
        # pre_process is project-local; elements 2 and 3 of its return value are
        # used as the item->index and index->item mappings respectively.
        processed = pre_process(file=dataset)
        self.item2index = processed[2]
        self.index2item = processed[3]
        self.test_data = self.process_data()
        self.loader = self.get_batches(batch_size=batch_size)

        # Build the model and load the checkpoint.
        # map_location takes the device directly; comparing a torch.device with
        # the string "cuda" is not a reliable way to branch on it.
        state_dict = torch.load(f'pretrained_model/{dataset}/{best_model}.pt',
                                map_location=self.device)
        # Read the layer sizes from the checkpoint so it always fits the model.
        item_size, embedding_size = state_dict['item_embedding.weight'].shape
        hidden_size = state_dict['gru.weight_hh_l0'].shape[1]
        self.model = GRU4REC(input_size=item_size,
                             output_size=item_size,
                             embedding_size=embedding_size,
                             hidden_size=hidden_size,
                             dp=0).to(self.device)
        self.model.load_state_dict(state_dict)

    def process_data(self):
        """Read the test sessions CSV and encode every item through `self.item2index`.

        Return:
            DataFrame whose `session` column holds the encoded sessions.
        """
        data = pd.read_csv(f"./dataset/Amazon_{self.dataset}/test_sessions.csv")
        # NOTE(review): eval executes arbitrary expressions found in the CSV.
        # Only use with trusted files; consider ast.literal_eval if the column
        # only ever contains plain Python literals.
        data["session"] = data.session.apply(eval)

        # Encode items as integers
        item2index = self.item2index

        def encode(value):
            if isinstance(value, list):
                return [item2index[item] for item in value]
            return item2index[value]

        data["session"] = data["session"].apply(encode)
        data = data.reset_index(drop=True)
        return data

    def get_batches(self, batch_size):
        """Group the test sessions into zero-padded batches of similar length.

        Sessions are shuffled, then sorted by length (longest first) inside
        pools of 100*batch_size sessions, then cut into batches. Because of the
        shuffle, batch order does not follow the order of the CSV.

        Return:
            list of LongTensors, each of shape [<=batch_size, max_seq_len_in_batch]
        """
        sessions = list(self.test_data.session)
        # (position, length) for every session
        indices = [(i, len(s)) for i, s in enumerate(sessions)]
        random.shuffle(indices)

        # Sort by session length inside each pool of 100*batch_size sessions
        pool_size = batch_size * 100
        pooled_indices = []
        for start in range(0, len(indices), pool_size):
            pool = sorted(indices[start: start + pool_size],
                          key=lambda x: x[1],
                          reverse=True)
            pooled_indices.extend(i for i, _ in pool)

        # Each consecutive chunk of batch_size positions forms one batch
        batches = []
        for start in range(0, len(pooled_indices), batch_size):
            batch_idx = pooled_indices[start: start + batch_size]
            # Positional lookup matches the enumerate() positions above.
            rec_list = [torch.tensor(sessions[i]) for i in batch_idx]
            # Pad with 0 on the right
            X = pad_sequence(rec_list, padding_value=0, batch_first=True)  # [batch_size, max_seq_len]
            batches.append(X)
        return batches

    def get_result(self, top_k=20):
        """Predict the top-k next items for every test session.

        Params:
            top_k: number of predicted items to keep per session (item 0, the
                padding index, is never returned)
        Return:
            DataFrame with string columns `session` and `predict`, each holding
            the `str()` of a list of original item ids.
        """
        index2item = self.index2item

        sessions = []
        predictions = []
        for batch in self.loader:
            X = batch.to(self.device)  # [batch_size, seq_len]
            scores = self.model.predict(X)  # [batch_size, item_size]

            # One extra candidate so that dropping item 0 still leaves top_k items.
            n_candidates = min(top_k + 1, scores.shape[-1])
            ranked = scores.topk(n_candidates, dim=-1).indices.tolist()

            for row, candidates in zip(X.tolist(), ranked):
                # Strip zero padding and map back to the original item ids
                sessions.append([index2item[i] for i in row if i != 0])
                # Drop the padding item, keep the best top_k
                predictions.append([index2item[i] for i in candidates if i != 0][:top_k])

        # Store both columns in string form
        test_result = pd.DataFrame({'session': [str(s) for s in sessions],
                                    'predict': [str(p) for p in predictions]})
        return test_result

In [4]:
# Run test-set inference once per dataset, in order; each name receives the
# DataFrame returned by get_result().
beauty, cell = (
    GetResult(dataset=name).get_result()
    for name in ('Beauty', 'Cell')
)





In [11]:
# Create the output directory first: to_csv does not create missing folders,
# and failing here would throw away the inference work done above.
os.makedirs('results', exist_ok=True)
beauty.to_csv('results/test_result_beauty.csv', index=False)
cell.to_csv('results/test_result_cell.csv', index=False)