# NCF with Metadata of Books
Aiming to predicts books that a user is likely to read instead of ratings given the books that a user has read

In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from gensim.models import Word2Vec
from random import sample
from sklearn.metrics import ndcg_score

## Preparation of Books Metadata for Modeling

### Read books CSV

In [2]:
books_meta = pd.read_csv('df_books_final.csv')

In [3]:
books_meta.head(3)

Unnamed: 0,book_id,title,description,average_rating,ratings_count,text_reviews_count,top_popular_shelves,author_ids,format_Audio,format_Digital,...,lang_tha,lang_tr,lang_tur,lang_ukr,lang_vi,lang_vie,lang_zh,length_long,length_medium,length_short
0,1882090,"Behave Yourself, Bethany Brant",A preacher's daughter with lots of curiosity a...,-1.02863,-0.059069,-0.136278,"[{'count': '1', 'name': 'mrs-withdrawn'}, {'co...",['151369'],False,False,...,False,False,False,False,False,False,False,False,True,False
1,166120,Ghosthunters And The Incredibly Revolting Ghost,A $2.99 value-priced edition of one of our bes...,-0.690046,-0.059069,-0.126186,"[{'count': '9', 'name': 'owned'}, {'count': '9...",['15873'],False,False,...,False,False,False,False,False,False,False,False,True,False
2,8608741,Attack of the Chicken Nugget Man: A National T...,Third-grader Chris Robb just can't seem to do ...,-0.597705,-0.059069,-0.102639,"[{'count': '2', 'name': 'books-i-wrote'}, {'co...",['2868520'],False,False,...,False,False,False,False,False,False,False,False,True,False


### Vectorize Text Columns (title, description)

In [4]:
# # Create a TfidfVectorizer instance
# tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# # Fit and transform the titles
# title_tfidf = tfidf_vectorizer.fit_transform(books_meta['title'])

# # Convert each row of the sparse matrix to a compressed sparse row (CSR) format and store it in a new column
# books_meta['title_tfidf'] = [scipy.sparse.csr_matrix(title_tfidf[i]) for i in range(title_tfidf.shape[0])]

book_descriptions = books_meta['description']

# Tokenize book descriptions
tokenized_descriptions = [description.split() for description in book_descriptions]

# Train Word2Vec model
model = Word2Vec(tokenized_descriptions, vector_size=100, window=5, min_count=1, workers=4)

# To get a single vector for an entire description, average the word vectors
def get_vector(text):
    words = text.split()
    word_vectors = np.array([model.wv[word] for word in words if word in model.wv])
    return np.mean(word_vectors, axis=0) if word_vectors.size else np.zeros(model.vector_size)

# Vectorize descriptions
descriptions_vectors = np.array([get_vector(description) for description in book_descriptions])

# # Add Word2Vec for description back to dataframe
# books_meta['description_word2vec'] = list(descriptions_vectors)
vector_columns = pd.DataFrame(descriptions_vectors, columns=[f'vector_{i+1}' for i in range(descriptions_vectors.shape[1])])
# Concatenate the vector columns with the original df_books DataFrame
books_meta = pd.concat([books_meta, vector_columns], axis=1)

### Drop Redundant Columns

In [5]:
print(books_meta.shape)
books_meta.columns

(59828, 184)


Index(['book_id', 'title', 'description', 'average_rating', 'ratings_count',
       'text_reviews_count', 'top_popular_shelves', 'author_ids',
       'format_Audio', 'format_Digital',
       ...
       'vector_91', 'vector_92', 'vector_93', 'vector_94', 'vector_95',
       'vector_96', 'vector_97', 'vector_98', 'vector_99', 'vector_100'],
      dtype='object', length=184)

In [6]:
books_meta_clean = books_meta.drop(columns=['title', 'description', 'top_popular_shelves', 'author_ids'])

In [99]:
print(books_meta_clean.shape)
books_meta_clean.head(3)

(59828, 181)


Unnamed: 0,book_id,average_rating,ratings_count,text_reviews_count,format_Audio,format_Digital,format_Other,format_Physical,format_Specialty,lang_ar,...,vector_92,vector_93,vector_94,vector_95,vector_96,vector_97,vector_98,vector_99,vector_100,book_idx
0,1882090,-1.02863,-0.059069,-0.136278,False,False,False,True,False,False,...,-0.64785,1.280018,1.553704,-0.449838,1.632225,1.477504,-0.324744,0.15246,1.129145,24217
1,166120,-0.690046,-0.059069,-0.126186,False,False,False,True,False,False,...,-0.767153,1.153325,1.186553,-0.012707,1.047337,0.979032,-0.401169,0.630396,1.087497,3475
2,8608741,-0.597705,-0.059069,-0.102639,False,False,False,True,False,False,...,-0.654886,1.107435,1.617156,-0.08061,1.139693,0.823007,-0.49606,0.258614,1.189002,37931


## Import Train and Test Interactions CSV

In [66]:
train_data = pd.read_csv('train_interactions.csv')
test_data = pd.read_csv('test_interactions.csv')

In [67]:
train_data.head(3)

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment
0,0,157993,5,2016-07-18 19:34:48+00:00,,3004,0.574139
1,0,359079,4,2014-07-16 19:28:57+00:00,,3737,0.527973
2,0,41684,4,2014-07-16 13:45:50+00:00,,3738,0.527973


In [68]:
test_data.head(3)

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment
0,0,343002,5,2012-01-25 00:26:06+00:00,,4641,0.574139
1,0,1852,4,2012-01-21 18:36:06+00:00,,4644,0.527973
2,1,1248128,4,2009-05-04 15:32:21+00:00,,5636,0.527973


## Encoding User and Book IDs

In [69]:
user_encoder = LabelEncoder()
train_data['user_idx'] = user_encoder.fit_transform(train_data['user_id'])

# Retrieve all unique book IDs
all_book_ids = books_meta_clean['book_id'].unique()

# Fit the encoder on the combined unique book_ids
book_encoder = LabelEncoder()
book_encoder.fit(all_book_ids)

# Apply encoding to both DataFrames without refitting
books_meta_clean['book_idx'] = book_encoder.transform(books_meta_clean['book_id'])
train_data['book_idx'] = book_encoder.transform(train_data['book_id'])

In [70]:
train_data.head(1)

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment,user_idx,book_idx
0,0,157993,5,2016-07-18 19:34:48+00:00,,3004,0.574139,0,3330


In [71]:
# Test book encoding
books_meta_clean[books_meta_clean['book_id'] == 157993]

Unnamed: 0,book_id,average_rating,ratings_count,text_reviews_count,format_Audio,format_Digital,format_Other,format_Physical,format_Specialty,lang_ar,...,vector_92,vector_93,vector_94,vector_95,vector_96,vector_97,vector_98,vector_99,vector_100,book_idx
59823,157993,1.187557,54.989371,55.828967,False,False,False,True,False,False,...,-0.730774,1.358876,1.626976,-0.041203,1.063568,1.315886,-0.33967,0.649069,1.342983,3330


In [72]:
# Check for missing data
train_data[~train_data['book_id'].isin(books_meta_clean['book_id'])]

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment,user_idx,book_idx


## Define Functions for NCF Model for Book Recommender

In [73]:
class BooksDataset(Dataset):
    def __init__(self, data, book_features, is_train=True):
        self.data = data  # This is the dataframe containing user_id, book_id, rating
        self.book_features = book_features  # Metadata for books
        self.is_train = is_train  # Flag to indicate if this is training or testing 
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        user_idx = row['user_idx']
        book_idx = row['book_idx']
        
        # Extract book metadata
        book_meta = torch.tensor(self.book_features.loc[book_idx].values.astype(np.float32), dtype=torch.float)
        
        # Prepare the data dictionary
        data_dict = {
            'user_idx': torch.tensor(user_idx, dtype=torch.long),
            'book_idx': torch.tensor(book_idx, dtype=torch.long),
            'rating' : torch.tensor(row['rating'], dtype=torch.float),
            'book_meta': book_meta
        }
        
        return data_dict


In [74]:
BATCH_SIZE = 512

# Assuming `train_data` is your DataFrame containing interactions (user_id, book_id, rating)
train_dataset = BooksDataset(train_data, books_meta_clean)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [76]:
class NCF(nn.Module):
    def __init__(self, num_users, num_books, embedding_dim, metadata_dim):
        super(NCF, self).__init__()
        # GMF Components
        self.user_embeddings_gmf = nn.Embedding(num_users, embedding_dim)
        self.book_embeddings_gmf = nn.Embedding(num_books, embedding_dim)

        # MLP Components
        self.user_embeddings_mlp = nn.Embedding(num_users, embedding_dim)
        self.book_embeddings_mlp = nn.Embedding(num_books, embedding_dim)

        # Metadata layer
        self.fc_meta = nn.Linear(metadata_dim, embedding_dim)

        # MLP layers
        self.fc1_mlp = nn.Linear(2 * embedding_dim, 128)
        self.fc2_mlp = nn.Linear(128, 64)

        # Final combination layer
        self.fc1_combined = nn.Linear(embedding_dim + 64 + embedding_dim, 128)  # Metadata embedding added
        self.fc2_combined = nn.Linear(128, 1)

    def forward(self, user_id, book_id, book_meta):
        # GMF
        user_emb_gmf = self.user_embeddings_gmf(user_id)
        book_emb_gmf = self.book_embeddings_gmf(book_id)
        gmf_output = user_emb_gmf * book_emb_gmf

        # MLP
        user_emb_mlp = self.user_embeddings_mlp(user_id)
        book_emb_mlp = self.book_embeddings_mlp(book_id)
        mlp_input = torch.cat([user_emb_mlp, book_emb_mlp], dim=-1)
        mlp_output = torch.relu(self.fc1_mlp(mlp_input))
        mlp_output = torch.relu(self.fc2_mlp(mlp_output))

        # Process metadata
        meta_output = torch.relu(self.fc_meta(book_meta))  # Project metadata to same embedding dimension

        # Combine GMF, MLP, and metadata outputs
        combined_input = torch.cat([gmf_output, mlp_output, meta_output], dim=-1)
        combined_output = torch.relu(self.fc1_combined(combined_input))
        # combined_output = torch.sigmoid(self.fc2_combined(combined_output)) * 5  # Scale to rating 
        
        combined_output = self.fc2_combined(combined_output)  # No activation
        combined_output = torch.clamp(combined_output, 1, 5)  # Ensure output is in rating range


        return combined_output.squeeze()

## Model Training

In [77]:
def train_model(model, train_loader, optimizer, criterion, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        print("Epoch", epoch)
        for batch in train_loader:
            optimizer.zero_grad()
            predictions = model(batch['user_idx'], batch['book_idx'], batch['book_meta'])
            loss = criterion(predictions, batch['rating'])
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

### Initialize Model

In [None]:
EMBEDDING_DIM = 64
METADATA_DIM = books_meta_clean.shape[1]
N_EPOCHS = 10

num_users = len(user_encoder.classes_) 
num_books = len(book_encoder.classes_)

model = NCF(num_users, num_books, EMBEDDING_DIM, METADATA_DIM)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [79]:
train_model(model, train_loader, optimizer, criterion, N_EPOCHS)

Epoch 0
Epoch 1, Loss: 1.314606785774231
Epoch 1
Epoch 2, Loss: 2.932584285736084
Epoch 2
Epoch 3, Loss: 3.3932583332061768
Epoch 3
Epoch 4, Loss: 3.0
Epoch 4
Epoch 5, Loss: 3.6853933334350586
Epoch 5
Epoch 6, Loss: 2.83146071434021
Epoch 6
Epoch 7, Loss: 2.1910111904144287
Epoch 7
Epoch 8, Loss: 2.561797857284546
Epoch 8
Epoch 9, Loss: 3.1123595237731934
Epoch 9
Epoch 10, Loss: 1.8876404762268066


## Evaluation

In [80]:
test_data.head(2)

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment
0,0,343002,5,2012-01-25 00:26:06+00:00,,4641,0.574139
1,0,1852,4,2012-01-21 18:36:06+00:00,,4644,0.527973


In [81]:
test_data['user_idx'] = user_encoder.transform(test_data['user_id'])
test_data['book_idx'] = book_encoder.transform(test_data['book_id'])

test_dataset = BooksDataset(test_data, books_meta_clean) 

test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [82]:
# Evaluation
model.eval()
test_loss = 0
with torch.no_grad():
    total_loss = 0
    for batch in test_loader:
        user_idx = batch['user_idx']
        book_idx = batch['book_idx']
        rating = batch['rating']
        book_metadata = batch['book_meta']

        outputs = model(user_idx, book_idx, book_metadata)
        loss = criterion(outputs, rating)

        total_loss += loss.item()

    test_loss = total_loss / len(test_data)
    print(f'Test Loss: {test_loss}')

Test Loss: 0.0037584065164393077


### Prediction of ratings for 2 books in test set

- NDCG could be used to measure the accuracy of the model's ranking if a value (probability that user will read a book or rating) for all unread books.
- Decided not to use NDCG here as there are only 2 books in the test set and even randomly generating would give 50% accuracy score, thus, wanted to use a metric that would calculate the accuracy of the actual predicted rating value rather than the rank.

#### Generate Predictions

In [83]:
def generate_predictions(model, data_loader, books_meta_clean):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            user_idx = batch['user_idx']
            book_idx = batch['book_idx']
            meta_data = batch['book_meta']
            outputs = model(user_idx, book_idx, meta_data)
            predictions.extend(zip(user_idx.numpy(), book_idx.numpy(), outputs.numpy()))
    return predictions

test_predictions = generate_predictions(model, test_loader, books_meta_clean)

In [84]:
test_data.head(4)

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment,user_idx,book_idx
0,0,343002,5,2012-01-25 00:26:06+00:00,,4641,0.574139,0,6509
1,0,1852,4,2012-01-21 18:36:06+00:00,,4644,0.527973,0,20
2,1,1248128,4,2009-05-04 15:32:21+00:00,,5636,0.527973,1,19267
3,1,30119,4,2009-05-04 15:11:22+00:00,,5636,0.527973,1,777


In [85]:
test_predictions

[(0, 6509, 5.0),
 (0, 20, 5.0),
 (1, 19267, 5.0),
 (1, 777, 5.0),
 (2, 1627, 5.0),
 (2, 28216, 5.0),
 (3, 777, 5.0),
 (3, 3330, 5.0),
 (4, 333, 5.0),
 (4, 3330, 5.0),
 (5, 133, 5.0),
 (5, 127, 5.0),
 (6, 776, 5.0),
 (6, 777, 5.0),
 (7, 0, 5.0),
 (7, 20, 5.0),
 (8, 13785, 5.0),
 (8, 0, 5.0),
 (9, 248, 5.0),
 (9, 777, 5.0),
 (10, 5669, 5.0),
 (10, 860, 5.0),
 (11, 3034, 5.0),
 (11, 0, 5.0),
 (12, 3034, 5.0),
 (12, 45, 5.0),
 (13, 482, 5.0),
 (13, 602, 5.0),
 (14, 6931, 5.0),
 (14, 777, 5.0),
 (15, 2648, 5.0),
 (15, 46073, 5.0),
 (16, 553, 5.0),
 (16, 1172, 5.0),
 (17, 570, 5.0),
 (17, 6931, 5.0),
 (18, 7269, 5.0),
 (18, 50592, 5.0),
 (19, 614, 5.0),
 (19, 310, 5.0),
 (20, 1172, 5.0),
 (20, 0, 5.0),
 (21, 654, 5.0),
 (21, 20387, 5.0),
 (22, 952, 5.0),
 (22, 280, 5.0),
 (23, 53694, 5.0),
 (23, 32073, 5.0),
 (24, 505, 5.0),
 (24, 602, 5.0),
 (25, 37479, 5.0),
 (25, 31717, 5.0),
 (26, 62, 5.0),
 (26, 40, 5.0),
 (27, 952, 5.0),
 (27, 40, 5.0),
 (28, 6931, 5.0),
 (28, 3330, 5.0),
 (29, 33271, 

In [86]:
test_predictions_df = pd.DataFrame(test_predictions, columns=['user_idx', 'book_idx', 'rating'])
test_predictions_df.head(4)

Unnamed: 0,user_idx,book_idx,rating
0,0,6509,5.0
1,0,20,5.0
2,1,19267,5.0
3,1,777,5.0


In [87]:
test_predictions_df[test_predictions_df['rating'] != 5]

Unnamed: 0,user_idx,book_idx,rating


In [88]:
def dcg_at_k(ratings, k):
    """Calculate DCG for a list of ratings at rank k."""
    ratings = np.asfarray(ratings)[:k]
    if ratings.size:
        return np.sum(ratings / np.log2(np.arange(2, ratings.size + 2)))
    return 0.0

def ndcg_at_k(actual_ratings, predicted_ratings, k):
    """Calculate NDCG at k."""
    ideal_ratings = sorted(actual_ratings, reverse=True)
    dcg_max = dcg_at_k(ideal_ratings, k)
    dcg_pred = dcg_at_k(predicted_ratings, k)
    return dcg_pred / dcg_max if dcg_max > 0 else 0

def weighted_ndcg(actual_ratings, predicted_ratings, k, weight=0.5):
    """Calculate Weighted NDCG by adjusting with MAE or RMSE error penalty."""
    # Step 1: Calculate standard NDCG
    ndcg = ndcg_at_k(actual_ratings, predicted_ratings, k)
    
    # Step 2: Calculate error penalty (MAE in this case)
    errors = [abs(a - p) for a, p in zip(actual_ratings, predicted_ratings)]
    mae = np.mean(errors)
    
    # Step 3: Adjust NDCG by penalty
    # Reduce NDCG proportionally by the weighted error term
    weighted_ndcg = ndcg - (weight * mae)
    return max(0, weighted_ndcg)  # Ensure the score doesn't go below 0

k = 2                               # Rank level for NDCG calculation
weight = 0.5                        # Weight for the MAE penalty (adjustable)

weighted_ndcg_score = weighted_ndcg(test_data['rating'], test_predictions_df['rating'], k, weight)
print("Weighted NDCG:", weighted_ndcg_score)


Weighted NDCG: 0.5646215982305991


In [None]:
# Convert test data to DataFrame for easier manipulation
test_data_df = pd.DataFrame(test_data, columns=['user_idx', 'book_idx', 'rating'])

# Map actual ratings to relevance scores
test_data_df['relevance'] = test_data_df['rating']

In [90]:
def dcg_at_k(relevance_scores, k):
    """
    Compute the Discounted Cumulative Gain (DCG) at rank position k.

    Parameters:
    - relevance_scores: List of relevance scores for items at the top-k positions
    - k: Rank position to compute DCG

    Returns:
    - DCG at rank k
    """
    relevance_scores = np.asarray(relevance_scores)[:k]
    if relevance_scores.size:
        return np.sum((2**relevance_scores - 1) / np.log2(np.arange(2, relevance_scores.size + 2)))
    return 0.

def ndcg_at_k(relevance_scores, ideal_relevance_scores, k):
    """
    Compute the Normalized Discounted Cumulative Gain (NDCG) at rank position k.

    Parameters:
    - relevance_scores: List of relevance scores for the predicted ranking
    - ideal_relevance_scores: List of relevance scores for the ideal ranking
    - k: Rank position to compute NDCG

    Returns:
    - NDCG at rank k
    """
    dcg = dcg_at_k(relevance_scores, k)
    idcg = dcg_at_k(ideal_relevance_scores, k)
    if idcg == 0:
        return 0.
    return dcg / idcg


In [91]:
def compute_ndcg_for_user(user_idx, predictions_df, k=10):
    user_predictions = predictions_df[predictions_df['user_idx'] == user_idx]
    user_predictions = user_predictions.sort_values(by='predicted_rating', ascending=False)

    # Get relevance scores in the predicted order
    relevance_scores = user_predictions['relevance'].tolist()

    # Ideal relevance scores
    ideal_relevance_scores = sorted(relevance_scores, reverse=True)

    return ndcg_at_k(relevance_scores, ideal_relevance_scores, k)


def compute_overall_ndcg(test_data_df, predictions, k=10):
    predictions_df = pd.DataFrame(predictions, columns=['user_idx', 'book_idx', 'predicted_rating'])
    merged_df = pd.merge(test_data_df[['user_idx', 'book_idx', 'relevance']], predictions_df, on=['user_idx', 'book_idx'], how='inner')

    if len(merged_df) < len(test_data_df):
        print(f"Warning: Some test data points ({len(test_data_df) - len(merged_df)}) could not be matched with predictions.")

    user_idxs = merged_df['user_idx'].unique()
    ndcg_scores = [compute_ndcg_for_user(user_idx, merged_df, k) for user_idx in user_idxs]

    return np.mean(ndcg_scores)

# Compute NDCG for test set
overall_ndcg = compute_overall_ndcg(test_data_df, test_predictions)
print(f"Overall NDCG@10: {overall_ndcg}")

Overall NDCG@10: 0.9362536255193167


### Prediction for ALL unread books

#### If we were to predict a probability that a user will read a book / a rating for <b> ALL unread books </b>

- Did not follow through with this as our datasets are very big, with the books dataset having almost 60k unique books and almost 21k unique users which is already scaled down through our sampling efforts.
- Each sampled user has around 5-15 interactions in the training set and test sets combined, with 2 of each user's interactions being in the test set, leaving around 3-13 interactions in the training set.
- Thus, will take a long time to predict the 50k+ unread books for each of the 21k users. I tried scaling it down to predict for ~1000 users and also using a larger batch size of 1024, but 1 batch took around 0.3s and in total, the number of data from the combinations of user-book pairings for all the unread books for each user still totaled to 62,214,416 which would essentially take at least 5h to run (in addition to the memory space required), and I would still need to tune hyperparameters.

But nevertheless, the code below is the attempt to sample and run the predictions for around 1000 randomly sampled users. The activation function of the last layer of the NCF model can be altered to either predict a probability that a user will read a book or rating that the user will give. 

#### Defining Test Data
Test set is to contain all user-book pairings of books that user has not read (any user-book pairing not in the training set)

In [92]:
train_data['user_idx'].nunique()

20798

In [93]:
train_data['user_idx'].nunique() * 0.05

1039.9

There are almost 21,000 unique users in the training set. Thus, we will randomly sample 5% of users to generate predictions and evaluate the model (around 1000 people).

In [94]:
# # Create test data by excluding books that user has already rated
# def get_unread_books_for_user(user_idx, train_data, all_books):
#     # Get the books the user has already interacted with (from train_data)
#     books_read_by_user = train_data[train_data['user_idx'] == user_idx]['book_idx']

#     # Get the books in books_meta_clean that the user has not read
#     unread_books = books_meta_clean[~books_meta_clean['book_idx'].isin(books_read_by_user)]
#     return np.transpose(unread_books['book_idx'])

# # Randomly sample users (can try to increase frac if there's time to run)
# sampled_user_idx = pd.Series(train_data['user_idx'].unique()).sample(frac=0.05, random_state=42)

In [95]:
# # List to store test data
# X_test = []

# # Loop through each user and add their unread books to the test data
# for user_idx in sampled_user_idx:
#     unread_books = get_unread_books_for_user(user_idx, train_data, books_meta_clean)
#     # Create rows for each unread book
#     for book_idx in unread_books:
#         X_test.append({'user_idx': user_idx, 'book_idx': book_idx})

In [96]:
# Create a DataFrame from the list of dictionaries
# X_test = pd.DataFrame(X_test)

In [97]:
# Save X_test so that I don't have to run the above again
# X_test.to_csv('X_test.csv', index=False)

In [None]:
X_test = pd.read_csv('X_test.csv')

In [None]:
print(X_test.shape)
X_test.head(10)

(62214416, 2)


Unnamed: 0,user_idx,book_idx
0,1346,24217
1,1346,3475
2,1346,37931
3,1346,52943
4,1346,51907
5,1346,31721
6,1346,28544
7,1346,59543
8,1346,50622
9,1346,35143


The BooksDataset would have to be defined differently for this approach to not take ratings as an input for testing as the user-unread book pairings would not have an actual rating.

In [None]:
class BooksDataset(Dataset):
    def __init__(self, data, book_features, is_train=True):
        self.data = data
        self.book_features = book_features

        # Flag to indicate if this is training or testing
        self.is_train = is_train
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        user_idx = row['user_idx']
        book_idx = row['book_idx']
        
        # Extract book metadata
        book_meta = torch.tensor(self.book_features.loc[book_idx].values.astype(np.float32), dtype=torch.float)
        
        # Prepare the data dictionary
        data_dict = {
            'user_idx': torch.tensor(user_idx, dtype=torch.long),
            'book_idx': torch.tensor(book_idx, dtype=torch.long),
            'book_meta': book_meta
        }
        
        if self.is_train:
            # For testing, rating is not required
            data_dict['rating'] = torch.tensor(row['rating'], dtype=torch.float)
        
        return data_dict

In [None]:
# is_train is set to false such that ratings for each user-unread book pair does not have to be inputed during testing
test_dataset = BooksDataset(X_test, books_meta_clean, is_train=False) 

test_loader = DataLoader(test_dataset, batch_size=1024) # Setting higher batch size than train_loader to speed up predictions

In [None]:
test_dataset = BooksDataset(test_data, books_meta_clean, is_train=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
def generate_predictions(model, data_loader, books_meta_clean):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in data_loader:
            user_idx = batch['user_idx']
            book_idx = batch['book_idx']
            meta_data = batch['book_meta']
            outputs = model(user_idx, book_idx, meta_data)
            predictions.extend(zip(user_idx.numpy(), book_idx.numpy(), outputs.numpy()))
            break
    return predictions

test_predictions = generate_predictions(model, test_loader, books_meta_clean)

In [None]:
print(test_predictions)

[(0, 6509, 5.0), (0, 20, 5.0), (1, 19267, 5.0), (1, 777, 5.0), (2, 1627, 5.0), (2, 28216, 5.0), (3, 777, 5.0), (3, 3330, 5.0), (4, 333, 5.0), (4, 3330, 5.0), (5, 133, 5.0), (5, 127, 5.0), (6, 776, 5.0), (6, 777, 5.0), (7, 0, 5.0), (7, 20, 5.0), (8, 13785, 5.0), (8, 0, 5.0), (9, 248, 5.0), (9, 777, 5.0), (10, 5669, 5.0), (10, 860, 5.0), (11, 3034, 5.0), (11, 0, 5.0), (12, 3034, 5.0), (12, 45, 5.0), (13, 482, 5.0), (13, 602, 5.0), (14, 6931, 5.0), (14, 777, 5.0), (15, 2648, 5.0), (15, 46073, 5.0), (16, 553, 5.0), (16, 1172, 5.0), (17, 570, 5.0), (17, 6931, 5.0), (18, 7269, 5.0), (18, 50592, 5.0), (19, 614, 5.0), (19, 310, 5.0), (20, 1172, 5.0), (20, 0, 5.0), (21, 654, 5.0), (21, 20387, 5.0), (22, 952, 5.0), (22, 280, 5.0), (23, 53694, 5.0), (23, 32073, 5.0), (24, 505, 5.0), (24, 602, 5.0), (25, 37479, 5.0), (25, 31717, 5.0), (26, 62, 5.0), (26, 40, 5.0), (27, 952, 5.0), (27, 40, 5.0), (28, 6931, 5.0), (28, 3330, 5.0), (29, 33271, 5.0), (29, 6567, 5.0), (30, 8171, 5.0), (30, 3221, 5.0), (

In [None]:
test_data[test_data['user_idx'] == 1346]

Unnamed: 0,user_id,book_id,rating,date_added,n_votes,review_age,sentiment,user_idx,book_idx
2692,1346,5,5,2015-08-08 21:08:15+00:00,,3349,0.574139,1346,0
2693,1346,1327679,3,2015-08-08 18:40:20+00:00,,3349,0.40686,1346,20146


#### Recommendation Method

If we were to generate a probability / predicted rating for all unread books by each user, we could group by the user idx and sort based on the predicted probability that the user would read the book / predicted rating and retrieve the top k (10) books to recommend.

In [None]:
# Convert to DataFrame
df = pd.DataFrame(test_predictions, columns=['user_idx', 'book_idx', 'rating'])

# Group by user_idx and retrieve the top 10 books for each user, sorted by rating
top_10_books_per_user = (
    df.groupby('user_idx')
      .apply(lambda x: x.nlargest(10, 'rating'))
      .reset_index(drop=True)
)

top_10_books_per_user.head(20)

  .apply(lambda x: x.nlargest(10, 'rating'))


Unnamed: 0,user_idx,book_idx,rating
0,0,6509,5.0
1,0,20,5.0
2,1,19267,5.0
3,1,777,5.0
4,2,1627,5.0
5,2,28216,5.0
6,3,777,5.0
7,3,3330,5.0
8,4,333,5.0
9,4,3330,5.0


## Attempted to use multiple CPU cores to process in parallel

In [None]:
import multiprocessing 
from multiprocessing import Pool
Num_core = multiprocessing.cpu_count()
Num_core

8

In [None]:
def process_batch(model, batch):
    """Process a single batch for prediction."""
    model.eval()
    user_idx = batch['user_idx']
    book_idx = batch['book_idx']
    meta_data = batch['book_meta']
    
    # Perform the prediction for this batch
    with torch.no_grad():
        outputs = model(user_idx, book_idx, meta_data)
    
    # Return the predictions as a list of tuples
    return list(zip(user_idx.numpy(), book_idx.numpy(), outputs.numpy()))

def parallel_generate_predictions(model, data_loader, num_processes):
    """Generate predictions in parallel using multiple processes."""
    
    # Split data_loader into chunks (you can adjust chunk size depending on your system's capacity)
    data_chunks = [list(data_loader)[i::num_processes] for i in range(num_processes)]
    
    # Create a pool of processes and map the data chunks to processes
    with Pool(processes=num_processes) as pool:
        # Pass the model and data chunk to each worker process
        results = pool.starmap(process_batch, [(model, chunk) for chunk in data_chunks])
    
    # Flatten the list of results and return all predictions
    return [item for sublist in results for item in sublist]

# Set the number of CPU cores to use
num_cores = multiprocessing.cpu_count()  # Example: 8 CPUs

# Generate predictions using multiprocessing
# test_predictions = parallel_generate_predictions(model, test_loader, num_cores)
