In [2]:
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import joblib
import pandas as pd
import scipy.sparse as sp
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


cleaned_df = pd.read_csv('Model Files/cleaned_df.csv')
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

In [3]:
def load_sparse_matrix_to_tensor(path):
    # Load the sparse matrix from disk
    sparse_matrix = sp.load_npz(path)
    
    # Convert the sparse matrix to a dense NumPy array
    dense_array = sparse_matrix.toarray()
    
    # Convert the dense NumPy array to a PyTorch tensor
    tensor = torch.tensor(dense_array, dtype=torch.float)
    
    return tensor


In [4]:
class CollabFiltModel(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
    
    def forward(self, user, item):
        user_emb = self.user_emb(user)
        item_emb = self.item_emb(item)
        return (user_emb * item_emb).sum(1)


In [5]:
# Initialize the model (make sure it has the same architecture)
loaded_model = CollabFiltModel(num_users=cleaned_df['User_id'].nunique(),
                               num_items=cleaned_df['Title'].nunique()).to(device)

# Load the model state dictionary
loaded_model.load_state_dict(torch.load('Model Files/collab_filt_model_state_dict.pth'))

# Ensure to switch the model to evaluation mode
loaded_model.eval()

# Load the encoders
user_encoder = joblib.load('Model Files/user_encoder.joblib')
item_encoder = joblib.load('Model Files/item_encoder.joblib')

In [16]:
loaded_model.eval()
loaded_model.to('cpu') ## Faster

# Extract item embeddings
item_embeddings = loaded_model.item_emb.weight.data.cpu().numpy()

# Filter genres to only include books with more than 20,000 examples
filtered_df = cleaned_df[cleaned_df['categories'].isin(cleaned_df['categories'].value_counts()[cleaned_df['categories'].value_counts() > 20000].index)]

unique_genres = filtered_df['categories'].unique()

# Display the genres to the user
print("Please choose a genre from the following list:")
for i, genre in enumerate(unique_genres, 1):
    print(f"{i}. {genre}")
print("20. All genres")

choice = int(input("Enter the number corresponding to your choice: ")) - 1  # Subtract 1 to match the list index

# genre_choice = unique_genres[choice]
genre_choice = cleaned_df['categories'] if choice == 19 else unique_genres[choice]

Please choose a genre from the following list:
1. ['Biography & Autobiography']
2. ['Religion']
3. ['Fiction']
4. ['Social Science']
5. ['Juvenile Nonfiction']
6. ['History']
7. ['Political Science']
8. ['Health & Fitness']
9. ['Cooking']
10. ['Philosophy']
11. ['Sports & Recreation']
12. ['Body, Mind & Spirit']
13. ['Juvenile Fiction']
14. ['Family & Relationships']
15. ['Science']
16. ['Business & Economics']
17. ['Computers']
18. ['Self-Help']
19. ['Young Adult Fiction']
20. All genres
Enter the number corresponding to your choice: 5


In [17]:
unique_df = cleaned_df[cleaned_df['categories'] == genre_choice].drop_duplicates(subset=['Title', 'authors'])

sample_data = unique_df[['Title', 'authors']].sample(5)

sample_titles = sample_data['Title'].to_numpy()
sample_authors = sample_data['authors'].to_numpy()

decoded_titles = item_encoder.inverse_transform(sample_titles)

user_ratings = {}
print('Rate these books 1-5')
for title, author in zip(decoded_titles, sample_authors):
    score = input(f'{title} by {author}: ')
    
    encoded_value = item_encoder.transform([title])[0]
    
    user_ratings[encoded_value] = float(score)


Rate these books 1-5
Artistic Drawing (Creative Kids) by ['Jeanette Nyberg']: 1
Kidding Around Kansas City by ['Lisa Harkrader', 'Suzanne Lieurance']: 4
Italian Immigrants, 1880-1920 (Blue Earth Books: Coming to America) by ['Anne M. Todd']: 1
Faith Facts for Young Catholics: Drills, Games and Activities for Middle School Students by ['Kieran Sawyer']: 4
A Change of Flag by ['Martha E. H. Rustad']: 2


In [30]:
# Prepare the data for ridge regression
rated_item_indices = list(user_ratings.keys())
X = item_embeddings[rated_item_indices]
y = np.array(list(user_ratings.values()))

# Fit the ridge regression model
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X, y)

# The user's "embedding" is approximated by the coefficients
user_preferences = ridge_model.coef_


In [31]:
# Predict ratings for all items
predicted_ratings = np.dot(item_embeddings, user_preferences)

# Rank items by predicted rating, excluding already rated items
recommended_indices = np.argsort(-predicted_ratings)
top_recommendations = [index for index in recommended_indices if index not in rated_item_indices][:5]

# Decode the top recommended item indices to original IDs
top_recommended_item_ids = item_encoder.inverse_transform(top_recommendations)

print("We recommend these 5 books based on your ratings: ")

for i, book in enumerate(top_recommended_item_ids):
    print(f"{i+1}. {book}")

We recommend these 5 books based on your ratings: 
1. Modern Bank Fishing
2. The Porsche 924/944 Book (Foulis Motoring Book)
3. Outlooks & insights on the weekly Torah portion (ArtScroll Mesorah series)
4. The Painting in the Attic
5. Baedeker's Austria


In [32]:
# Target book to find similarities
input_title = "Modern Bank Fishing"

movie_index = item_encoder.transform([input_title])[0]

target_movie_vector = item_embeddings[movie_index].reshape(1, -1)

cosine_similarities = cosine_similarity(item_embeddings, target_movie_vector).flatten()

top_indices = np.argsort(-cosine_similarities)[1:11] 


index_to_movie = {i: label for i, label in enumerate(item_encoder.classes_)}

# Get the titles of the top 10 similar movies
top_movie_titles = [index_to_movie[i] for i in top_indices]

print("Top 10 similar movies:")
for i, title in enumerate(top_movie_titles):
    print(f"{i+1}. {title}")


Top 10 similar movies:
1. Beast Rising
2. Escape from Babel: Toward a Unifying Language for Psychotherapy Practice (Norton Professional Books)
3. Horseman's Club
4. The tragedy of Hamlet (The student's Shakespeare)
5. Adolescent Development Student Sg
6. 103 Hikes in Southwestern British Columbia
7. The Spiritual Guide of Miguel Molinos
8. Microeconomics
9. Planets
10. The Privateersman


In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a ridge regression model
ridge_reg = Ridge(alpha=1.0)  # alpha is the regularization strength
ridge_reg.fit(X_train, y_train)

# Predict on the testing set
y_pred = ridge_reg.predict(X_test)

# Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"MSE: {mse}")

# Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")

# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE: {mae}")

MSE: 2.4519094298013897
RMSE: 1.5658574104309082
MAE: 1.5658574104309082
