# Preparation

In [None]:
import torch
# Evaluate the installed PyTorch version string (a bare expression, so it is
# only visible when the environment echoes the last expression of a cell).
torch.__version__

In [None]:
import os
import pandas as pd
import numpy as np
import random
from torch import nn
import torch.optim as optim
from tqdm import tqdm

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic
    execution.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running process is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; this call is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # The cuDNN flags are plain process-wide switches, so they are set
    # unconditionally: a GPU that becomes visible later behaves the same way.
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick a possibly different
    # algorithm on each run, which defeats reproducibility.
    torch.backends.cudnn.benchmark = False
seed_everything(2024)

In [None]:
# Pick the compute backend in order of preference: CUDA GPU, then Apple's
# Metal backend (MPS), and finally the CPU as the fallback.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

# Construct network

In [None]:
class RecommenderModel(nn.Module):
    """Neural collaborative filtering scorer with GMF, MLP and NeuMF variants.

    The variant is selected by ``config['model_type']``:

    * ``"GMF"``   - element-wise product of the user and item embeddings
      followed by a bias-free linear layer.
    * ``"MLP"``   - concatenated embeddings passed through the ReLU stack
      described by ``config['layers']`` (or straight into a linear layer when
      ``config['layers_num(X)']`` is 0).
    * ``"NeuMF"`` - the GMF product vector concatenated with the output of
      the MLP stack, followed by a single bias-free linear layer.

    All variants read the same user/item embedding tables.

    Expected config keys: ``num_users``, ``num_items``, ``latent_dim``,
    ``layers_num(X)``, ``layers`` (not needed for GMF) and ``model_type``.
    ``layers[0]`` is the input width of the first hidden layer and therefore
    has to equal ``2 * latent_dim``.

    Raises:
        ValueError: if ``model_type`` is unknown, or if the MLP stack is
            requested with a ``layers`` list whose first entry does not match
            the concatenated embedding width.
    """

    _MODEL_TYPES = ("GMF", "MLP", "NeuMF")

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.X = config["layers_num(X)"]
        self.model_type = config['model_type']
        if self.model_type not in self._MODEL_TYPES:
            raise ValueError(
                f"Unknown model_type {self.model_type!r}; expected one of {self._MODEL_TYPES}"
            )
        latent_dim = config['latent_dim']

        self.embedding_user = torch.nn.Embedding(num_embeddings=config['num_users'], embedding_dim=latent_dim)
        self.embedding_item = torch.nn.Embedding(num_embeddings=config['num_items'], embedding_dim=latent_dim)

        if self.model_type == "GMF":
            self.linear_gmf = torch.nn.Linear(in_features=latent_dim, out_features=1, bias=False)
        else:
            # The hidden stack is shared by the `MLP` and `NeuMF` variants.
            if self.X != 0:
                layer_sizes = config['layers']
                # Fail early with a readable message instead of a shape error
                # deep inside the first matrix multiplication.
                if not layer_sizes or layer_sizes[0] != 2 * latent_dim:
                    raise ValueError(
                        "config['layers'][0] must equal 2 * latent_dim "
                        f"({2 * latent_dim}), got {list(layer_sizes)}"
                    )
                self.fc_layers = torch.nn.ModuleList(
                    torch.nn.Linear(in_size, out_size, bias=True)
                    for in_size, out_size in zip(layer_sizes[:-1], layer_sizes[1:])
                )

            if self.model_type == "MLP":
                # Head used after the hidden stack (X != 0) ...
                self.linear_mlp = torch.nn.Linear(in_features=config['layers'][-1], out_features=1, bias=False)
                # ... and head applied directly to the concatenation (X == 0).
                self.linear_mlp0 = torch.nn.Linear(in_features=latent_dim * 2, out_features=1, bias=False)
            else:  # NeuMF
                # Width of the MLP branch output: the last hidden size, or the
                # raw concatenation when there are no hidden layers.
                mlp_out_dim = config['layers'][-1] if self.X != 0 else 2 * latent_dim
                self.linear_neumf = torch.nn.Linear(in_features=latent_dim + mlp_out_dim, out_features=1, bias=False)
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()

    def _run_hidden_stack(self, vector):
        """Apply every hidden Linear+ReLU layer; identity when X == 0."""
        if self.X != 0:
            for fc_layer in self.fc_layers:
                vector = self.relu(fc_layer(vector))
        return vector

    def forward(self, user_indices, item_indices):
        """Score user/item pairs.

        Args:
            user_indices: integer tensor of user ids, one per pair.
            item_indices: integer tensor of item ids, same length.

        Returns:
            Tensor with one sigmoid score in (0, 1) per pair (the trailing
            singleton dimension of the linear head is squeezed away).
        """
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)

        if self.model_type == 'GMF':
            vector = self.linear_gmf(torch.mul(user_embedding, item_embedding))
        elif self.model_type == 'MLP':
            vector = torch.cat([user_embedding, item_embedding], dim=-1)
            if self.X != 0:
                vector = self.linear_mlp(self._run_hidden_stack(vector))
            else:
                vector = self.linear_mlp0(vector)
        elif self.model_type == 'NeuMF':
            gmf_vector = torch.mul(user_embedding, item_embedding)
            mlp_vector = self._run_hidden_stack(
                torch.cat([user_embedding, item_embedding], dim=-1)
            )
            # Fuse the two branches; the single output layer does the scoring,
            # so no per-branch scalar head is applied beforehand.
            vector = self.linear_neumf(torch.cat([gmf_vector, mlp_vector], dim=-1))
        else:
            raise ValueError(f"Unknown model_type {self.model_type!r}")

        output = self.sigmoid(vector)
        return output.squeeze(1)


# Load and preprocess data

In [None]:
from torch.utils.data import Dataset, DataLoader
import scipy.sparse as sp

class RatingDataset(Dataset):
    """Implicit-feedback dataset of (user, item, label) triples.

    Every non-zero entry of ``rating_mat`` becomes a positive sample with
    label 1. For each user, ``negative_num`` items are drawn without
    replacement from that user's entry in ``negative_list`` and added with
    label 0. Negatives are sampled once, at construction time.

    Args:
        rating_mat: sparse user-by-item matrix; only ``nonzero()`` is used.
        negative_list: ``negative_list[u]`` is the list of candidate negative
            item ids for user ``u``; needs at least ``negative_num`` entries.
        user_num: number of users (negatives are drawn for ``range(user_num)``).
        item_num: number of items (stored for reference only).
        negative_num: negatives sampled per user.

    Raises:
        ValueError: from ``random.sample`` when a user has fewer than
            ``negative_num`` candidates.
    """

    def __init__(self, rating_mat, negative_list, user_num, item_num, negative_num=4):
        self.user_num = user_num
        self.item_num = item_num

        # Positive samples: one per observed interaction.
        row_idx, col_idx = rating_mat.nonzero()
        self.user_ids = row_idx.tolist()
        self.item_ids = col_idx.tolist()
        self.labels = np.ones(len(row_idx)).tolist()

        # Negative samples: the user id and the 0 label are repeated exactly
        # `negative_num` times so the three parallel lists stay aligned for
        # any value of `negative_num`.
        for user in range(self.user_num):
            negative_items = random.sample(negative_list[user], negative_num)
            self.user_ids.extend([user] * negative_num)
            self.item_ids.extend(negative_items)
            self.labels.extend([0.0] * negative_num)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Plain indices are returned (no one-hot vectors): an embedding
        # lookup only needs the ids.
        return self.user_ids[idx], self.item_ids[idx], self.labels[idx]

def load_rating_file_as_sparse(filename):
    """Read a tab-separated rating file into a sparse user-by-item matrix.

    Each non-blank line must start with ``user<TAB>item<TAB>rating``; extra
    columns are ignored. Only ratings greater than 0 are stored, but every
    line contributes to the matrix dimensions. If a (user, item) pair occurs
    more than once, the last rating wins.

    Args:
        filename: path of the rating file.

    Returns:
        ``(matrix, num_users, num_items)`` where ``matrix`` is a float32 CSR
        matrix of shape ``(num_users, num_items)`` and the two counts are the
        largest user/item id seen plus one.
    """
    max_user, max_item = 0, 0
    positive_entries = []
    # Single pass: the matrix shape is only known once the whole file has
    # been seen, so the entries are buffered and inserted afterwards.
    with open(filename, "r") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            arr = line.split("\t")
            user, item, rating = int(arr[0]), int(arr[1]), float(arr[2])
            max_user = max(max_user, user)
            max_item = max(max_item, item)
            if rating > 0:
                positive_entries.append((user, item, rating))

    num_users = max_user + 1
    num_items = max_item + 1
    mat = sp.dok_matrix((num_users, num_items), dtype=np.float32)
    for user, item, rating in positive_entries:
        mat[user, item] = rating
    return mat.tocsr(), num_users, num_items

def load_negative_file(filename):
    """Read per-line lists of negative item ids from a tab-separated file.

    The first column of every line is skipped; the remaining columns are
    parsed as integers. Empty fields (for instance those produced by a
    trailing tab) are ignored. A blank line yields an empty list, so the
    position of each list always matches its line number in the file.

    Args:
        filename: path of the negatives file.

    Returns:
        A list with one list of int item ids per line.
    """
    negativeList = []
    with open(filename, "r") as f:
        for line in f:
            fields = line.split("\t")[1:]
            negativeList.append([int(field) for field in fields if field.strip()])
    return negativeList

In [None]:
# import pandas as pd
# df = pd.read_csv("Data/ml-1m.train.rating",sep='\t', header=None, names=['uid', 'mid', 'rating', 'timestamp'], engine='python')
# df

In [None]:
# df2 = pd.read_csv("Data/ml-1m.test.negative",sep = "\t",header=None)
# df2

# Train and Test

In [None]:
# Mini-batch size handed to the training DataLoader.
BATCH_SIZE = 256
# Number of passes over the training DataLoader in the training loop.
NUMEPOCHS = 50

In [None]:
# Sparse matrix of the training interactions together with its dimensions.
rating_mat, num_of_user, num_of_item = load_rating_file_as_sparse(
    "Data/ml-1m.train.rating"
)
# One list of candidate negative item ids per line of the negatives file.
negative_sample_list = load_negative_file("Data/ml-1m.test.negative")

# Training set: every observed interaction plus 9 sampled negatives per user.
train_dataset = RatingDataset(
    rating_mat=rating_mat,
    negative_list=negative_sample_list,
    user_num=num_of_user,
    item_num=num_of_item,
    negative_num=9,
)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

In [None]:
# Hyper-parameters consumed by RecommenderModel.
# NOTE(review): for the "MLP"/"NeuMF" variants the first Linear layer built
# from 'layers' receives the concatenated user+item embedding, i.e. a vector
# of width 2 * latent_dim (16 here), while 'layers'[0] is 64 -- adjust one of
# them before switching 'model_type' away from "GMF".
config = {
    'num_users': num_of_user,
    'num_items': num_of_item,
    'latent_dim': 8,
    "layers_num(X)" : 4,
    'layers': [64, 32, 16, 8],
    'model_type': 'GMF'     # other supported values: "MLP", "NeuMF"
}

# Build the model and move its parameters to the selected device.
model = RecommenderModel(config)
model = model.to(device)
print(model)

In [None]:
# Define the loss function and the optimizer.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model.
model.train()
for epoch in tqdm(range(NUMEPOCHS)):
    # Accumulate the sample-weighted loss on the device so that the reported
    # value describes the whole epoch rather than only its last mini-batch,
    # without forcing a host synchronisation on every step.
    loss_sum = torch.zeros((), device=device)
    sample_count = 0
    # The dataloader yields user ids, item ids and labels
    # (1 = positive sample, 0 = negative sample).
    for user, item, label in train_dataloader:
        optimizer.zero_grad()
        user, item, label = user.to(device), item.to(device), label.float().to(device)
        output = model(user, item)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        batch_count = label.size(0)
        loss_sum += loss.detach() * batch_count
        sample_count += batch_count
    # max(..., 1) keeps the report well-defined for an empty dataloader.
    epoch_loss = loss_sum.item() / max(sample_count, 1)
    print(f'Epoch {epoch+1}, Loss: {epoch_loss}')

## Reproduce the ablation study --- MLP with different layers

 K: 8, 16, 32, 64
 
 layers (X): 0, 1, 2, 3, 4