## macos mps

In [1]:
import sys
import surprise

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (
    rmse,
    mae,
    rsquared,
    exp_var,
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    get_top_k_items,
)
from recommenders.models.surprise.surprise_utils import (
    predict,
    compute_ranking_predictions,
)
from recommenders.utils.notebook_utils import store_metadata

In [2]:
print(f"System version: {sys.version}")
print(f"Surprise version: {surprise.__version__}")

System version: 3.9.20 (main, Oct  3 2024, 02:24:59) 
[Clang 14.0.6 ]
Surprise version: 1.1.4


## load data

In [3]:
import pandas as pd
train_df = pd.read_csv('datasets/ml-latest-small/train.csv')
test_df = pd.read_csv('datasets/ml-latest-small/test.csv')

print(train_df.head())
print(test_df.head())

   Unnamed: 0  userId  movieId  rating  timestamp
0          43       1      804     4.0  964980499
1          73       1     1210     5.0  964980499
2         171       1     2628     4.0  964980523
3         183       1     2826     4.0  964980523
4         120       1     2018     5.0  964980523
   Unnamed: 0  userId  movieId  rating  timestamp
0          76       1     1219     2.0  964983393
1         174       1     2644     4.0  964983393
2          91       1     1348     4.0  964983393
3         176       1     2654     5.0  964983393
4          83       1     1258     3.0  964983414


## training

In [4]:
from surprise import Dataset, Reader, SVD, accuracy

In [5]:
reader = Reader(rating_scale=(train_df['rating'].min(), train_df['rating'].max()))

train_set = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader).build_full_trainset()
test_set = list(zip(test_df['userId'], test_df['movieId'], test_df['rating']))

### svd

In [6]:
svd = SVD(random_state=0, n_factors=200, n_epochs=30, verbose=False)
svd.fit(train_set)

test_predictions = svd.test(test_set)

test_rmse = accuracy.rmse(test_predictions)
print(f"testset RMSE: {test_rmse}")

RMSE: 0.9038
testset RMSE: 0.9037522276885648


### knn

In [8]:
from surprise import KNNBaseline, KNNWithMeans
knn_baseline = KNNBaseline(sim_options={'name': 'pearson_baseline','user_based': False })
knn_baseline.fit(train_set)

knn_mean = KNNWithMeans(sim_options={'name':'cosine', 'user_based': True})
knn_mean.fit(train_set)

test_pred_knn_bl = knn_baseline.test(test_set)
test_rmse_knn_bl = accuracy.rmse(test_pred_knn_bl)
print(f"knn baseline RMSE: {test_rmse_knn_bl}")

test_pred_knn_mean = knn_mean.test(test_set)
test_rmse_knn_mean = accuracy.rmse(test_pred_knn_mean)
print(f"knn mean RMSE: {test_rmse_knn_mean}")



Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8894
knn baseline RMSE: 0.889403935617733
RMSE: 0.9494
knn mean RMSE: 0.9494347668407263


### transformer

In [18]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from tqdm import tqdm
import numpy as np
import pandas as pd
from transformers import BertModel, BertConfig
from transformers import get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [19]:
# Initialize encoders
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# Combine and encode data
combined_df = pd.concat([train_df, test_df], axis=0)
combined_df['user'] = user_encoder.fit_transform(combined_df['userId'])
combined_df['item'] = item_encoder.fit_transform(combined_df['movieId'])

num_users = combined_df['user'].nunique()
num_items = combined_df['item'].nunique()

# Split back into train and test
train_df = combined_df.iloc[:len(train_df)]
test_df = combined_df.iloc[len(train_df):]

In [20]:
# Sort and create sequences
train_df = train_df.sort_values(['user', 'timestamp'])
user_sequences = train_df.groupby('user')['item'].apply(list).reset_index()
print(user_sequences.head())

# Create input sequences and targets
max_seq_length = 20 
input_sequences = []
target_items = []

for seq in user_sequences['item']:
    if len(seq) < 2:
        continue
    for i in range(1, len(seq)):
        start = max(i - (max_seq_length - 1), 0)
        input_seq = seq[start:i]
        target = seq[i]
        input_sequences.append(input_seq)
        target_items.append(target)

# Split into train and validation sets
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    input_sequences, target_items, test_size=0.1, random_state=42
)
print(f"Training set size: {len(train_inputs)}")
print(f"Validation set size: {len(val_inputs)}")

   user                                               item
0     0  [632, 910, 1978, 2125, 1492, 2670, 2692, 2798,...
1     1  [277, 7355, 8810, 8532, 8045, 8448, 2670, 7750...
2     2  [973, 1566, 1053, 696, 961, 1543, 30, 585, 276...
3     3  [143, 684, 1001, 361, 1752, 2224, 2035, 135, 5...
4     4  [508, 509, 123, 257, 337, 398, 506, 302, 126, ...
Training set size: 72257
Validation set size: 8029


In [21]:
class BERT4Rec(nn.Module):
    def __init__(self, num_items, hidden_size=128, num_layers=2, num_heads=4, 
                 max_position_embeddings=20, dropout=0.1):
        super(BERT4Rec, self).__init__()
        config = BertConfig(
            vocab_size=num_items + 3,  # Including [PAD], [BOS], [EOS]
            hidden_size=hidden_size,
            num_hidden_layers=num_layers,
            num_attention_heads=num_heads,
            intermediate_size=hidden_size * 4,
            max_position_embeddings=max_position_embeddings,
            hidden_dropout_prob=dropout,
            attention_probs_dropout_prob=dropout,
            type_vocab_size=1,
            output_attentions=False,
            output_hidden_states=False,
        )
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_items + 3)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        last_hidden_state = outputs.last_hidden_state
        pooled_output = self.dropout(last_hidden_state)
        logits = self.classifier(pooled_output)
        return logits

    def predict_next_item(self, input_ids, attention_mask):
        logits = self.forward(input_ids, attention_mask)
        next_logits = logits[:, -1, :]
        return next_logits

In [22]:
class ItemTokenizer:
    def __init__(self, num_items):
        self.num_items = num_items
        self.pad_token_id = 0
        self.bos_token_id = 1
        self.eos_token_id = 2
        self.vocab_size = num_items + 3

    def encode_plus(self, sequence, add_special_tokens=True, max_length=None, 
                   padding='max_length', truncation=True, return_attention_mask=True,
                   return_tensors='pt'):
        if not sequence:
            sequence = [self.pad_token_id]
            
        if add_special_tokens:
            tokens = [self.bos_token_id] + sequence + [self.eos_token_id]
        else:
            tokens = sequence
            
        if padding == 'max_length' and max_length is not None:
            pad_length = max_length - len(tokens)
            if pad_length > 0:
                tokens = tokens + [self.pad_token_id] * pad_length
            elif truncation:
                tokens = tokens[:max_length]
                
        attention_mask = [1] * len(tokens)
        if padding == 'max_length' and max_length is not None:
            attention_mask = attention_mask + [0] * (max_length - len(attention_mask))
            
        if return_tensors == 'pt':
            tokens = torch.tensor(tokens, dtype=torch.long)
            attention_mask = torch.tensor(attention_mask, dtype=torch.long)
            
        return {
            'input_ids': tokens,
            'attention_mask': attention_mask if return_attention_mask else None
        }

class RecDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer, max_length):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        seq = self.inputs[idx]
        target = self.targets[idx]

        encoding = self.tokenizer.encode_plus(
            [item + 1 for item in seq],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(target, dtype=torch.long)
        }

In [26]:
def train_epoch(model, data_loader, criterion, optimizer, device):
    model.train()
    losses = []
    correct = 0
    total = 0

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        last_logits = logits[:, -1, :]

        loss = criterion(last_logits, labels)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        _, preds = torch.max(last_logits, dim=1)
        correct += torch.sum(preds == labels)
        total += labels.size(0)

    avg_loss = np.mean(losses)
    # accuracy = correct.double() / total
    accuracy = correct.float() / total
    return avg_loss, accuracy.item()

def eval_model(model, data_loader, criterion, device):
    model.eval()
    losses = []
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            last_logits = logits[:, -1, :]

            loss = criterion(last_logits, labels)
            losses.append(loss.item())

            _, preds = torch.max(last_logits, dim=1)
            correct += torch.sum(preds == labels)
            total += labels.size(0)

    avg_loss = np.mean(losses)
    # accuracy = correct.double() / total
    accuracy = correct.float() / total
    return avg_loss, accuracy.item()

In [24]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


# Initialize tokenizer and datasets
tokenizer = ItemTokenizer(num_items)
train_dataset = RecDataset(train_inputs, train_targets, tokenizer, max_seq_length)
val_dataset = RecDataset(val_inputs, val_targets, tokenizer, max_seq_length)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=0)

# Initialize model
model = BERT4Rec(num_items=num_items)
model = model.to(device)

# Setup training
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

EPOCHS = 5
total_steps = len(train_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

Using device mps


In [25]:
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)

    train_loss, train_acc = train_epoch(
        model,
        train_loader,
        criterion,
        optimizer,
        device
    )
    print(f"Training loss: {train_loss:.4f}, Training accuracy: {train_acc:.4f}")

    val_loss, val_acc = eval_model(
        model,
        val_loader,
        criterion,
        device
    )
    print(f"Validation loss: {val_loss:.4f}, Validation accuracy: {val_acc:.4f}")


Epoch 1/5
----------


Training: 100%|██████████| 2259/2259 [00:48<00:00, 46.76it/s]


TypeError: Cannot convert a MPS Tensor to float64 dtype as the MPS framework doesn't support float64. Please use float32 instead.

In [None]:
def get_topk_accuracy(model, data_loader, device, k=10):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Computing Top-K Accuracy"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            last_logits = logits[:, -1, :]
            topk_preds = torch.topk(last_logits, k, dim=1).indices
            
            labels = labels.view(-1, 1)
            correct += torch.sum(topk_preds == labels)
            total += labels.size(0)

    # topk_acc = correct.double() / total
    topk_acc = correct.float() / total
    return topk_acc.item()

# Compute Top-K accuracy
topk = 10
topk_acc = get_topk_accuracy(model, val_loader, device, k=topk)
print(f"Validation Top-{topk} accuracy: {topk_acc:.4f}")

### lightGCN

In [None]:
from torch.utils.data import Dataset, DataLoader

class MovieLensDataset(Dataset):
    def __init__(self, df):
        self.user_indices = torch.tensor(df['user_idx'].values, dtype=torch.long)
        self.item_indices = torch.tensor(df['item_idx'].values, dtype=torch.long)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.user_indices[idx], self.item_indices[idx], self.ratings[idx]

In [None]:
batch_size = 1024
train_dataset = MovieLensDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = MovieLensDataset(test_df)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
from recommenders.models.graph_based.lightgcn import LightGCN

# 定义模型参数
embedding_dim = 20  # 嵌入维度
num_layers = 3      # 图卷积层数

# 实例化 LightGCN 模型
model = LightGCN(
    n_users=num_users,
    n_items=num_items,
    embedding_dim=embedding_dim,
    num_layers=num_layers,
    learning_rate=0.01,
    weight_decay=1e-4,
    device=device
)

# 训练模型
model.fit(
    train=train_df,
    test=test_df,
    epochs=10,
    verbose=True
)

In [None]:
results = model.evaluate(test_df)
print(f"测试集评估结果：{results}")