# NCF

Done without book metadata, using solely the rating given by users for books read in the past

## Import Libraries

In [115]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

## Read Data

In [117]:
train_data = pd.read_csv('train_interactions.csv')
test_data = pd.read_csv('test_interactions.csv')

In [118]:
print(train_data.shape)
train_data.head(3)

(134745, 7)


Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment
0,0,157993,5,2016-07-18 19:34:48+00:00,,3004,0.574139
1,0,359079,4,2014-07-16 19:28:57+00:00,,3737,0.527973
2,0,41684,4,2014-07-16 13:45:50+00:00,,3738,0.527973


In [119]:
print(test_data.shape)
test_data.head(3)

(41596, 7)


Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment
0,0,343002,5,2012-01-25 00:26:06+00:00,,4641,0.574139
1,0,1852,4,2012-01-21 18:36:06+00:00,,4644,0.527973
2,1,1248128,4,2009-05-04 15:32:21+00:00,,5636,0.527973


In [143]:
for user_id in range(5): 
    user_train = train_data[train_data['user_id'] == user_id]
    user_test = test_data[test_data['user_id'] == user_id]
    print(f"\nUser {user_id}:")
    print(f"  Train samples: {len(user_train)}")
    print(f"  Test samples: {len(user_test)}")
    print(f"  Train ratio: {len(user_train) / (len(user_train) + len(user_test)):.2f}")


User 0:
  Train samples: 7
  Test samples: 2
  Train ratio: 0.78

User 1:
  Train samples: 10
  Test samples: 2
  Train ratio: 0.83

User 2:
  Train samples: 7
  Test samples: 2
  Train ratio: 0.78

User 3:
  Train samples: 12
  Test samples: 2
  Train ratio: 0.86

User 4:
  Train samples: 7
  Test samples: 2
  Train ratio: 0.78


In [123]:
user_encoder = LabelEncoder()
book_encoder = LabelEncoder()

test_user_encoder = LabelEncoder()
test_book_encoder = LabelEncoder()

train_data['user_id'] = user_encoder.fit_transform(train_data['user_id'])
train_data['book_id'] = book_encoder.fit_transform(train_data['book_id'])

test_data['user_id'] = test_user_encoder.fit_transform(test_data['user_id'])
test_data['book_id'] = test_book_encoder.fit_transform(test_data['book_id'])

### Dataloader

In [124]:
class BooksDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        return {
            'user_id': torch.tensor(row['user_id'], dtype=torch.long),
            'book_id': torch.tensor(row['book_id'], dtype=torch.long),
            'rating': torch.tensor(row['rating'], dtype=torch.float)
        }

# Create DataLoader
batch_size = 512

train_dataset = BooksDataset(train_data)
test_dataset = BooksDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

### NeuMF model with GMF and MLP

In [125]:
# Define the NCF model with GMF and MLP for Books Data
class NCF(nn.Module):
    def __init__(self, num_users, num_books, embedding_dim):
        super(NCF, self).__init__()
        # GMF Components
        self.user_embeddings_gmf = nn.Embedding(num_users, embedding_dim)
        self.book_embeddings_gmf = nn.Embedding(num_books, embedding_dim)

        # MLP Components
        self.user_embeddings_mlp = nn.Embedding(num_users, embedding_dim)
        self.book_embeddings_mlp = nn.Embedding(num_books, embedding_dim)

        self.fc1_mlp = nn.Linear(2 * embedding_dim, 128)
        self.fc2_mlp = nn.Linear(128, 64)

        # Final layers
        self.fc1_combined = nn.Linear(embedding_dim + 64, 128)
        self.fc2_combined = nn.Linear(128, 1)

    def forward(self, user_id, book_id):
        # GMF
        user_emb_gmf = self.user_embeddings_gmf(user_id)
        book_emb_gmf = self.book_embeddings_gmf(book_id)
        gmf_output = user_emb_gmf * book_emb_gmf

        # MLP
        user_emb_mlp = self.user_embeddings_mlp(user_id)
        book_emb_mlp = self.book_embeddings_mlp(book_id)
        mlp_input = torch.cat([user_emb_mlp, book_emb_mlp], dim=-1)
        mlp_output = torch.relu(self.fc1_mlp(mlp_input))
        mlp_output = torch.relu(self.fc2_mlp(mlp_output))

        # Combine GMF and MLP outputs
        combined_input = torch.cat([gmf_output, mlp_output], dim=-1)
        combined_output = torch.relu(self.fc1_combined(combined_input))
        combined_output = torch.relu(self.fc2_combined(combined_output)) * 4 + 1  # To match rating scale

        return combined_output.squeeze()

### Model Training

In [139]:
# Instantiate and train the model
embedding_dim = 32
num_users = len(user_encoder.classes_) 
num_books = len(book_encoder.classes_)

model = NCF(num_users, num_books, embedding_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.05)

# Training
num_epochs = 5
for epoch in range(num_epochs):
    # print('epoch:', epoch)
    model.train()
    epoch_loss = 0
    batch_no = 1
    for batch in train_loader:
        user_id = batch['user_id']
        book_id = batch['book_id']
        rating = batch['rating']

        optimizer.zero_grad()
        outputs = model(user_id, book_id)
        loss = criterion(outputs, rating)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        batch_no += 1

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(train_data)}')

# Evaluation
model.eval()
test_loss = 0
with torch.no_grad():
    total_loss = 0
    for batch in test_loader:
        user_id = batch['user_id']
        book_id = batch['book_id']
        rating = batch['rating']

        outputs = model(user_id, book_id)
        loss = criterion(outputs, rating)

        total_loss += loss.item()

    test_loss = total_loss / len(test_data)
    print(f'Test Loss: {test_loss}')

Epoch 1/5, Loss: 0.04890962634715987
Epoch 2/5, Loss: 0.006615352811094622
Epoch 3/5, Loss: 0.003467713603239439
Epoch 4/5, Loss: 0.002956988871741186
Epoch 5/5, Loss: 0.0024362993021834834
Test Loss: 0.0051066067335937835


### Generate Predictions

Use trained GMF+MLP model to generate predictions on the test set.

In [140]:
def generate_predictions(model, data_loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            user_id = batch['user_id']
            book_id = batch['book_id']
            outputs = model(user_id, book_id)
            predictions.extend(zip(user_id.numpy(), book_id.numpy(), outputs.numpy()))
    return predictions

# Generate predictions
test_predictions = generate_predictions(model, test_loader)

In [141]:
print(test_predictions)

[(0, 1220, 3.2185843), (0, 5, 4.9290094), (1, 2484, 4.4151907), (1, 234, 1.0), (2, 441, 3.8454907), (2, 3014, 5.5450864), (3, 234, 5.257161), (3, 787, 4.6210647), (4, 109, 5.0095797), (4, 787, 4.7950544), (5, 41, 4.816895), (5, 38, 4.0051475), (6, 233, 4.9465847), (6, 234, 4.418457), (7, 0, 4.205568), (7, 5, 4.74695), (8, 2042, 4.935831), (8, 0, 5.2371645), (9, 87, 5.096698), (9, 234, 4.3955536), (10, 1124, 2.7922661), (10, 255, 4.2787943), (11, 726, 1.0), (11, 0, 4.0417976), (12, 726, 4.542616), (12, 14, 4.966694), (13, 155, 1.2501192), (13, 186, 3.9554253), (14, 1278, 3.4316936), (14, 234, 3.2603307), (15, 669, 3.2123435), (15, 4368, 5.773731), (16, 175, 4.592211), (16, 326, 4.0706663), (17, 178, 5.817785), (17, 1278, 4.6261473), (18, 1317, 3.2902114), (18, 4648, 4.9105806), (19, 188, 3.7396202), (19, 101, 4.0998855), (20, 326, 1.4072778), (20, 0, 4.8105764), (21, 205, 1.0), (21, 2561, 1.0), (22, 277, 3.412629), (22, 93, 1.0), (23, 4812, 4.7688646), (23, 3250, 4.315563), (24, 162, 5.

In [129]:
test_data.head(3)

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment
0,0,1220,5,2012-01-25 00:26:06+00:00,,4641,0.574139
1,0,5,4,2012-01-21 18:36:06+00:00,,4644,0.527973
2,1,2484,4,2009-05-04 15:32:21+00:00,,5636,0.527973


In [130]:
# Convert test data to DataFrame for easier manipulation
test_data_df = pd.DataFrame(test_data, columns=['user_id', 'book_id', 'rating'])

# Map actual ratings to relevance scores
test_data_df['relevance'] = test_data_df['rating'].apply(lambda x: x if x > 0 else 0)  # Example relevance mapping

In [137]:
def dcg_at_k(relevance_scores, k):
    """
    Compute the Discounted Cumulative Gain (DCG) at rank position k.

    Parameters:
    - relevance_scores: List of relevance scores for items at the top-k positions
    - k: Rank position to compute DCG

    Returns:
    - DCG at rank k
    """
    relevance_scores = np.asarray(relevance_scores)[:k]
    if relevance_scores.size:
        return np.sum((2**relevance_scores - 1) / np.log2(np.arange(2, relevance_scores.size + 2)))
    return 0.

def ndcg_at_k(relevance_scores, ideal_relevance_scores, k):
    """
    Compute the Normalized Discounted Cumulative Gain (NDCG) at rank position k.

    Parameters:
    - relevance_scores: List of relevance scores for the predicted ranking
    - ideal_relevance_scores: List of relevance scores for the ideal ranking
    - k: Rank position to compute NDCG

    Returns:
    - NDCG at rank k
    """
    dcg = dcg_at_k(relevance_scores, k)
    idcg = dcg_at_k(ideal_relevance_scores, k)
    if idcg == 0:
        return 0.
    return dcg / idcg


In [142]:
def compute_ndcg_for_user(user_id, predictions_df, k=10):
    user_predictions = predictions_df[predictions_df['user_id'] == user_id]
    user_predictions = user_predictions.sort_values(by='predicted_rating', ascending=False)

    # Get relevance scores in the predicted order
    relevance_scores = user_predictions['relevance'].tolist()

    # Ideal relevance scores
    ideal_relevance_scores = sorted(relevance_scores, reverse=True)

    return ndcg_at_k(relevance_scores, ideal_relevance_scores, k)


def compute_overall_ndcg(test_data_df, predictions, k=10):
    predictions_df = pd.DataFrame(predictions, columns=['user_id', 'book_id', 'predicted_rating'])
    merged_df = pd.merge(test_data_df[['user_id', 'book_id', 'relevance']], predictions_df, on=['user_id', 'book_id'], how='inner')

    if len(merged_df) < len(test_data_df):
        print(f"Warning: Some test data points ({len(test_data_df) - len(merged_df)}) could not be matched with predictions.")

    user_ids = merged_df['user_id'].unique()
    ndcg_scores = [compute_ndcg_for_user(user_id, merged_df, k) for user_id in user_ids]

    return np.mean(ndcg_scores)

# Compute NDCG for test set
overall_ndcg = compute_overall_ndcg(test_data_df, test_predictions)
print(f"Overall NDCG@10: {overall_ndcg}")

Overall NDCG@10: 0.9450921705690265
