In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder

# Read the four CSV inputs into DataFrames (same files, same order as listed).
(
    df_children_books_final_encoded,
    df_interactions_train,
    df_interactions_val,
    df_interactions_test,
) = map(
    pd.read_csv,
    (
        'df_children_books_final.csv',
        'df_interactions_train.csv',
        'df_interactions_val.csv',
        'test_interactions.csv',
    ),
)



In [None]:
# Step 1: Preprocess Data
# The encoders map raw user/item ids to contiguous integer codes (0 .. n_classes - 1).
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# Fit the encoders on the training interactions only.
# NOTE: the id columns are overwritten in place, so re-running this cell without
# reloading the CSVs re-fits the encoders on already-encoded ids and
# `classes_` then no longer holds the raw ids.
df_interactions_train['user_id'] = user_encoder.fit_transform(df_interactions_train['user_id'])
df_interactions_train['item_id'] = item_encoder.fit_transform(df_interactions_train['item_id'])

# LabelEncoder.transform raises ValueError on labels it was not fitted on, so
# validation rows whose user or item never occurs in the training data are
# dropped (and counted) before encoding instead of crashing the cell.
known_val_rows = (
    df_interactions_val['user_id'].isin(user_encoder.classes_)
    & df_interactions_val['item_id'].isin(item_encoder.classes_)
)
n_dropped_val_rows = int((~known_val_rows).sum())
if n_dropped_val_rows:
    print(f"Dropping {n_dropped_val_rows} validation interactions with users/items unseen in training")
df_interactions_val = df_interactions_val.loc[known_val_rows].copy()
df_interactions_val['user_id'] = user_encoder.transform(df_interactions_val['user_id'])
df_interactions_val['item_id'] = item_encoder.transform(df_interactions_val['item_id'])

# Order each user's interactions by 'review_age' so the per-user lists below are sequential.
# NOTE(review): ascending 'review_age' is treated as chronological order here. If the
# column measures time elapsed since the review, this puts the newest item first -- confirm.
df_interactions_train = df_interactions_train.sort_values(by=['user_id', 'review_age'])
df_interactions_val = df_interactions_val.sort_values(by=['user_id', 'review_age'])

# Group interactions by user to create one ordered item list per user
user_item_sequences = df_interactions_train.groupby('user_id')['item_id'].apply(list)
val_user_item_sequences = df_interactions_val.groupby('user_id')['item_id'].apply(list)



In [None]:
# Step 2: Define Dataset Class for RNN input
class InteractionDataset(Dataset):
    """Training samples for next-item prediction, one per user sequence.

    Each sample is built from the last ``sequence_length`` items of a sequence:
    the input is that window without its final item and the target is the same
    window shifted one step ahead. Sequences shorter than ``sequence_length``
    yield correspondingly shorter tensors.

    Args:
        user_sequences: Indexable collection of item-id sequences.
        sequence_length: Maximum number of trailing items kept per sequence.
    """

    def __init__(self, user_sequences, sequence_length=10):
        self.sequence_length = sequence_length
        self.user_sequences = user_sequences

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.user_sequences)

    def __getitem__(self, idx):
        """Return ``(x, y)`` long tensors for the sequence at position ``idx``."""
        # Keep only the most recent `sequence_length` items.
        window = self.user_sequences[idx][-self.sequence_length:]
        inputs, targets = window[:-1], window[1:]  # targets = inputs shifted by one step
        return (
            torch.tensor(inputs, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long),
        )

class ValidationDataset(Dataset):
    """Evaluation samples: predict the final item of each user sequence.

    Each sample is built from the last ``sequence_length`` items of a sequence:
    the input is every item of that window except the last, and the target is
    the last item as a scalar tensor.

    Args:
        user_sequences: Indexable collection of item-id sequences.
        sequence_length: Maximum number of trailing items kept per sequence.
    """

    def __init__(self, user_sequences, sequence_length=10):
        self.sequence_length = sequence_length
        self.user_sequences = user_sequences

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.user_sequences)

    def __getitem__(self, idx):
        """Return ``(x, y)``: the context items and the single item to predict."""
        # Keep only the most recent `sequence_length` items.
        window = self.user_sequences[idx][-self.sequence_length:]
        context = window[:-1]
        held_out_item = window[-1]
        return (
            torch.tensor(context, dtype=torch.long),
            torch.tensor(held_out_item, dtype=torch.long),
        )
    
# Prepare DataLoader
sequence_length = 10
PAD_ITEM_ID = 0       # filler fed to the model at padded input positions
IGNORE_TARGET = -100  # nn.CrossEntropyLoss's default ignore_index, so padded targets add no loss


def pad_collate(batch):
    """Right-pad variable-length ``(x, y)`` samples into two ``(batch, max_len)`` tensors.

    The default collate function stacks tensors and fails as soon as a batch
    mixes users with different numbers of interactions. Inputs are padded with
    ``PAD_ITEM_ID`` and targets with ``IGNORE_TARGET``. Padding is appended on
    the right, so it comes after every real item of a sequence.
    """
    inputs, targets = zip(*batch)
    padded_inputs = nn.utils.rnn.pad_sequence(
        list(inputs), batch_first=True, padding_value=PAD_ITEM_ID
    )
    padded_targets = nn.utils.rnn.pad_sequence(
        list(targets), batch_first=True, padding_value=IGNORE_TARGET
    )
    return padded_inputs, padded_targets


# A user needs at least two interactions to form one (input, target) pair;
# shorter sequences would only produce empty tensors.
train_sequences = [seq for seq in user_item_sequences if len(seq) >= 2]
dataset = InteractionDataset(train_sequences, sequence_length=sequence_length)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=pad_collate)



In [None]:
# Step 3: Define the RNN Model
class RNNRecommendationModel(nn.Module):
    """Embedding -> LSTM -> linear scorer over the item vocabulary.

    Args:
        num_items: Number of distinct item indices (embedding rows and output logits).
        embedding_dim: Size of each item embedding.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, num_items, embedding_dim=50, hidden_dim=100):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=num_items)

    def forward(self, x):
        """Score every item at every time step.

        Args:
            x: Integer tensor of item indices, batch dimension first.

        Returns:
            Logits with one trailing ``num_items`` axis per input position.
        """
        embedded = self.embedding(x)
        # The LSTM's final (h, c) state is not needed; only per-step outputs are scored.
        hidden_states, _final_state = self.rnn(embedded)
        return self.fc(hidden_states)

# Initialize Model, Loss, and Optimizer
# Seed PyTorch's global RNG so weight initialisation and the DataLoader's
# shuffling are repeatable across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

num_items = len(item_encoder.classes_)  # one embedding row / output logit per encoded item
model = RNNRecommendationModel(num_items)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Step 4: Training Loop
epochs = 5
for epoch in range(epochs):
    model.train()  # make sure training mode is active even if the model was evaluated before
    total_loss = 0.0
    for x, y in dataloader:
        optimizer.zero_grad()
        output = model(x)
        # Flatten (batch, step) into one axis so every time step is one classification example.
        # reshape (unlike view) also accepts non-contiguous tensors.
        loss = criterion(output.reshape(-1, num_items), y.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Guard against an empty dataloader, which would otherwise divide by zero.
    mean_loss = total_loss / max(len(dataloader), 1)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss}")


## Evaluation on Validation Set

In [None]:
# A user with a single validation interaction has a target but no input context, so skip them.
val_sequences = [seq for seq in val_user_item_sequences if len(seq) >= 2]
val_dataset = ValidationDataset(val_sequences, sequence_length=sequence_length)

# The default collate function stacks tensors and fails on inputs of different
# lengths. Build each batch from sequences of equal (truncated) length so no
# padding is needed and the last time step of every row is a real item.
val_batch_size = 32
indices_by_length = {}
for idx, seq in enumerate(val_sequences):
    indices_by_length.setdefault(min(len(seq), sequence_length), []).append(idx)
val_batches = [
    indices[start:start + val_batch_size]
    for indices in indices_by_length.values()
    for start in range(0, len(indices), val_batch_size)
]
val_dataloader = DataLoader(val_dataset, batch_sampler=val_batches)

# Define evaluation function
def evaluate_model(model, dataloader, top_k=10):
    """Print and return top-1 and top-k next-item accuracy.

    Args:
        model: Module whose output has time on axis 1 and item scores on axis 2;
            only the last time step is scored.
        dataloader: Iterable of ``(x, y)`` batches, where ``y`` holds one target
            item index per row.
        top_k: Number of highest-scoring items that count as a hit. Values larger
            than the number of scored items are clamped to that number.

    Returns:
        ``(top1_accuracy, topk_accuracy)`` as floats. Both are ``nan`` when the
        dataloader yields no samples.
    """
    model.eval()
    total, correct, top_k_correct = 0, 0, 0
    with torch.no_grad():
        for x, y in dataloader:
            output = model(x)
            # Scores at the last time step, i.e. the prediction for the next item.
            preds = output[:, -1, :]  # Shape: (batch_size, num_items)

            # Top-1: the single highest-scoring item must equal the target.
            _, top1_preds = torch.max(preds, 1)
            correct += (top1_preds == y).sum().item()

            # Top-k: torch.topk raises if k exceeds the axis size, so clamp it.
            k = min(top_k, preds.size(1))
            topk_preds = torch.topk(preds, k=k, dim=1)[1]
            # Row-wise membership test, vectorised instead of a Python loop.
            top_k_correct += (topk_preds == y.unsqueeze(1)).any(dim=1).sum().item()

            total += y.size(0)

    # With no samples the accuracies are undefined; report nan instead of dividing by zero.
    top1_accuracy = correct / total if total else float("nan")
    topk_accuracy = top_k_correct / total if total else float("nan")
    print(f"Top-1 Accuracy: {top1_accuracy:.4f}")
    print(f"Top-{top_k} Accuracy: {topk_accuracy:.4f}")
    return top1_accuracy, topk_accuracy

# Step 5: Report top-1 / top-k accuracy of the trained model on the validation set
evaluate_model(model=model, dataloader=val_dataloader)