In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import Data


In [2]:

# Load datasets
books = pd.read_csv('data/Books.csv')
ratings = pd.read_csv('data/Ratings.csv')
users = pd.read_csv('data/Users.csv')

# Encode User-ID and ISBN to numerical values
user_encoder = LabelEncoder()
book_encoder = LabelEncoder()


  books = pd.read_csv('data/Books.csv')


In [3]:

ratings['User-ID'] = user_encoder.fit_transform(ratings['User-ID'])
ratings['ISBN'] = book_encoder.fit_transform(ratings['ISBN'])

# Create node feature matrices (dummy features for now)
num_users = ratings['User-ID'].max() + 1
num_books = ratings['ISBN'].max() + 1

# Features: Initialize dummy features (1 for all nodes)
user_features = torch.ones((num_users, 1))
book_features = torch.ones((num_books, 1))


# Combine user and book features
x = torch.cat([user_features, book_features], dim=0)

# Create edge index and edge attributes (ratings as weights)
user_indices = torch.tensor(ratings['User-ID'].values, dtype=torch.long)
book_indices = torch.tensor(ratings['ISBN'].values + num_users, dtype=torch.long)  # Offset book indices
edge_index = torch.stack([user_indices, book_indices], dim=0)

# Edge weights: Ratings normalized between 0 and 1
edge_attr = torch.tensor(ratings['Book-Rating'].values / ratings['Book-Rating'].max(), dtype=torch.float)

# Build PyTorch Geometric graph
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
print(data)


Data(x=[445839, 1], edge_index=[2, 1149780], edge_attr=[1149780])


In [None]:
import torch.nn as nn
from torch_geometric.nn import GCNConv

class BookGNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BookGNN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, x, edge_index, edge_attr):
        x = self.conv1(x, edge_index, edge_weight=edge_attr)
        x = torch.relu(x)
        x = self.conv2(x, edge_index, edge_weight=edge_attr)
        return x


In [5]:
import torch

# Define the model, loss, and optimizer
input_dim = 1  # Initial feature size
hidden_dim = 64
output_dim = 16  # Embedding size

# Load the model
model = BookGNN(input_dim, hidden_dim, output_dim)
model.load_state_dict(torch.load('models/book_gnn_model-1731667954.pth'))
model.eval()

  model.load_state_dict(torch.load('models/book_gnn_model-1731667954.pth'))


BookGNN(
  (conv1): GCNConv(1, 64)
  (conv2): GCNConv(64, 16)
)

In [13]:
# Assuming you have a test dataset similar to the training dataset
test_ratings = pd.read_csv('data/TestRatings.csv')

# Convert User-ID and ISBN to strings before combining
ratings['User-ID'] = ratings['User-ID'].astype(str)
ratings['ISBN'] = ratings['ISBN'].astype(str)
test_ratings['User-ID'] = test_ratings['User-ID'].astype(str)
test_ratings['ISBN'] = test_ratings['ISBN'].astype(str)

# Combine training and test data for encoding
combined_user_ids = pd.concat([ratings['User-ID'], test_ratings['User-ID']])
combined_isbns = pd.concat([ratings['ISBN'], test_ratings['ISBN']])

# Fit the encoders on the combined data
user_encoder.fit(combined_user_ids)
book_encoder.fit(combined_isbns)

# Encode User-ID and ISBN to numerical values using the same encoders
test_ratings['User-ID'] = user_encoder.transform(test_ratings['User-ID'])
test_ratings['ISBN'] = book_encoder.transform(test_ratings['ISBN'])

# Create edge index and edge attributes for the test data
test_user_indices = torch.tensor(test_ratings['User-ID'].values, dtype=torch.long)
test_book_indices = torch.tensor(test_ratings['ISBN'].values + num_users, dtype=torch.long)  # Offset book indices
test_edge_index = torch.stack([test_user_indices, test_book_indices], dim=0)
test_edge_attr = torch.tensor(test_ratings['Book-Rating'].values / test_ratings['Book-Rating'].max(), dtype=torch.float)

In [14]:
with torch.no_grad():
    # Forward pass
    out = model(data.x, data.edge_index, data.edge_attr)
    
    # Get embeddings for test edges
    test_user_embeddings = out[test_edge_index[0]]
    test_book_embeddings = out[test_edge_index[1]]
    
    # Predict ratings
    predicted_ratings = torch.sum(test_user_embeddings * test_book_embeddings, dim=1)

In [15]:
from sklearn.metrics import mean_squared_error

# Compute the mean squared error between predicted and actual ratings
mse = mean_squared_error(test_edge_attr.numpy(), predicted_ratings.numpy())
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.405109703540802


In [24]:
# Example: Get the original user ID from the encoded user ID
encoded_user_id = 123  # This is the numerical ID used in the model
original_user_id = user_encoder.inverse_transform([encoded_user_id])[0]
print(f'Original User ID: {original_user_id}')

Original User ID: 100106


In [31]:
def recommend_books(user_id, top_n=10):
    user_id_str = str(user_id)
    if user_id_str not in user_encoder.classes_:
        print(f"User ID {user_id} not found in the encoder.")
        return []
    
    user_node = user_encoder.transform([user_id_str])[0]
    user_embedding = out[user_node].unsqueeze(0)
    
    # Compute similarity scores with all book embeddings
    scores = torch.matmul(user_embedding, out[num_users:].t()).squeeze()
    
    # Get top N book indices
    _, top_book_indices = torch.topk(scores, top_n)
    
    # Decode book indices to original ISBNs
    recommended_books = book_encoder.inverse_transform(top_book_indices.numpy())
    return recommended_books

# Example: Recommend top 10 books for user with ID 99998
recommended_books = recommend_books(99998, top_n=10)
print(recommended_books)

['142626' '175015' '1369' '250026' '229710' '12533' '32266' '108666'
 '197748' '207504']


In [29]:
print(user_encoder.classes_)

['0' '1' '10' ... '99997' '99998' '99999']


In [37]:
# Get data of books from the dataset based on the indices
books_data = books[books['ISBN'].isin(recommended_books)]

books_data

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L


In [38]:
import pandas as pd

# Load the book data
books = pd.read_csv('data/Books.csv')

  books = pd.read_csv('data/Books.csv')


In [43]:
def recommend_books(user_id, top_n=10):
    user_id_str = str(user_id)
    if user_id_str not in user_encoder.classes_:
        print(f"User ID {user_id} not found in the encoder.")
        return []
    
    user_node = user_encoder.transform([user_id_str])[0]
    user_embedding = out[user_node].unsqueeze(0)
    
    # Compute similarity scores with all book embeddings
    scores = torch.matmul(user_embedding, out[len(users):].t()).squeeze()
    
    # Get top N book indices
    _, top_book_indices = torch.topk(scores, top_n)
    
    # Decode book indices to original ISBNs
    recommended_books = book_encoder.inverse_transform(top_book_indices.numpy())
    return recommended_books

In [44]:
# Example: Recommend top 10 books for user with ID 99998
recommended_books_isbns = recommend_books(99998, top_n=10)

# Print the recommended ISBNs
print("Recommended ISBNs:", recommended_books_isbns)

# Filter the book data to include only the recommended books
recommended_books_data = books[books['ISBN'].isin(recommended_books_isbns)]

# Print the filtered book data
print(recommended_books_data)

Recommended ISBNs: ['166442' '121610' '107673' '125814' '167130' '140630' '127165' '172344'
 '132100' '107078']
Empty DataFrame
Columns: [ISBN, Book-Title, Book-Author, Year-Of-Publication, Publisher, Image-URL-S, Image-URL-M, Image-URL-L]
Index: []


In [63]:
result = []
for row_num in recommended_books_isbns:
    print(int(row_num))
    result.append(books.loc[int(row_num)].values)
    

166442
121610
107673
125814
167130
140630
127165
172344
132100
107078


In [57]:
# Find a book by its row number
book_row = 167130

# Get the ISBN of the book
book_isbn = books.loc[book_row].values
book_isbn

array(['0865475156',
       "Last Night's Fun: In and Out of Time With Irish Music",
       'Ciaran Carson', 1997, 'Farrar Straus &amp; Giroux',
       'http://images.amazon.com/images/P/0865475156.01.THUMBZZZ.jpg',
       'http://images.amazon.com/images/P/0865475156.01.MZZZZZZZ.jpg',
       'http://images.amazon.com/images/P/0865475156.01.LZZZZZZZ.jpg'],
      dtype=object)

In [64]:
result

[array(['0374503168', 'Christ Stopped at Eboli: The Story of a Year',
        'Carlo Levi', 1995, 'Farrar Straus Giroux',
        'http://images.amazon.com/images/P/0374503168.01.THUMBZZZ.jpg',
        'http://images.amazon.com/images/P/0374503168.01.MZZZZZZZ.jpg',
        'http://images.amazon.com/images/P/0374503168.01.LZZZZZZZ.jpg'],
       dtype=object),
 array(['0061063223', 'Slime Time (BC 10) (Bone Chillers)', 'Betsy Haynes',
        1996, 'HarperTorch',
        'http://images.amazon.com/images/P/0061063223.01.THUMBZZZ.jpg',
        'http://images.amazon.com/images/P/0061063223.01.MZZZZZZZ.jpg',
        'http://images.amazon.com/images/P/0061063223.01.LZZZZZZZ.jpg'],
       dtype=object),
 array(['0933635516',
        'Cthulhu Now: Modern Adventures and Background for Call of Cthulhu Roleplaying/3307',
        'William A. Barton', 1992, 'Chaosium',
        'http://images.amazon.com/images/P/0933635516.01.THUMBZZZ.jpg',
        'http://images.amazon.com/images/P/0933635516.01.MZZ

In [None]:
# Get data 