In [1]:
# Absolute, machine-specific directories used by this notebook.
# NOTE(review): these are hard-coded local Windows paths; adjust them before running elsewhere.
# base_path is not referenced in the visible part of the notebook.
base_path = r"D:/Work/Hackathon/AI_ML/Dev/Dataset/D_new/"
# Prefix for the merged CSV and the saved model files loaded in later cells.
model_route = r"D:/Work/Hackathon/AI_ML/Dev/Models/"

In [2]:
import os
# CUDA debugging switches, set before torch is imported below.
# CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel launches synchronously so errors
# surface at the failing call.
# NOTE(review): TORCH_USE_CUDA_DSA is normally a build-time option for device-side
# assertions; confirm that setting it as an environment variable has any effect here.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from tqdm.notebook import tqdm
import re
import numpy as np
import random


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# The same `device` is used later for model loading and for input tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: displays the selected device as the cell output.
device

device(type='cpu')

In [3]:
# Read the merged review/metadata CSV stored under model_route and keep only the
# rows that have no missing value in any column.
merged_dataframe_pth = model_route + "merged_dataframe_with_sentiment_labels.csv"
# merged_df=merged_df.fillna('')
merged_df = pd.read_csv(merged_dataframe_pth).dropna()

In [4]:
# Display the column names of the loaded table.
merged_df.columns

Index(['Id', 'Title', 'Price', 'User_id', 'review/helpfulness', 'review/score',
       'review/time', 'review/summary', 'review/text', 'description',
       'authors', 'image', 'publisher', 'publishedDate', 'categories',
       'ratingsCount', 'sentiment_score', 'sentiment_label'],
      dtype='object')

In [5]:
# merged_df.shape --- 150000

In [6]:
# merged_df[['Title','review/helpfulness','publishedDate']][:50]

In [7]:
# for i in merged_df["review/helpfulness"]:
#   if "/" not in i:
#     print(i)

In [8]:
# merged_df[['sentiment_score','review/text','sentiment_label']][:50]

In [9]:
# merged_df.isnull().sum()

In [10]:
# Give every distinct user id and title a dense integer id, numbered in the order
# the values are returned by unique().
# NOTE(review): these ids are used to index the loaded model's embedding tables, so
# they presumably must match the mapping used at training time - confirm.
user_ids = merged_df['User_id'].unique()
item_ids = merged_df['Title'].unique()
user_to_index = dict(zip(user_ids, range(len(user_ids))))
item_to_index = dict(zip(item_ids, range(len(item_ids))))
# Reverse lookup: item id -> title (used when turning ranked ids back into titles)
index_to_title = dict(enumerate(item_ids))

# Encode every review row as (user id, item id, rating) tensors
user_indices = torch.tensor(list(map(user_to_index.__getitem__, merged_df['User_id'])), dtype=torch.long)
item_indices = torch.tensor(list(map(item_to_index.__getitem__, merged_df['Title'])), dtype=torch.long)
ratings = torch.tensor(merged_df['review/score'].to_numpy(), dtype=torch.float32)

In [11]:
class CollaborativeFilteringModel(nn.Module):
    """Rating predictor: user and item embeddings fed through a one-hidden-layer MLP.

    The attribute names below are kept unchanged because a later cell restores a
    whole pickled instance of this class with ``torch.load``.
    """

    def __init__(self, num_users, num_items, embedding_dim, hidden_dim):
        """Create the two embedding tables and the two linear layers."""
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        # The hidden layer consumes the user and item vectors side by side
        self.hidden_layer = nn.Linear(embedding_dim * 2, hidden_dim)
        self.relu = nn.ReLU()
        self.output_layer = nn.Linear(hidden_dim, 1)

    def forward(self, user_indices, item_indices):
        """Return one predicted score per (user, item) pair, with a trailing dim of 1."""
        pair_features = torch.cat(
            (self.user_embedding(user_indices), self.item_embedding(item_indices)),
            dim=1,
        )
        return self.output_layer(self.relu(self.hidden_layer(pair_features)))

    def get_similar_titles(self, input_title_index, top_k=100):
        """Return up to ``top_k`` titles ranked by cosine similarity of item embeddings.

        ``input_title_index`` is an integer row of the item embedding table. The
        query row is compared against every row of the table, itself included,
        so the query title is not filtered out of the result. Titles are resolved
        through the module-level ``index_to_title`` dictionary.
        """
        embedding_table = self.item_embedding.weight
        # Build the query index on the same device as the embedding weights
        query_index = torch.tensor([input_title_index], device=embedding_table.device)
        query_vector = self.item_embedding(query_index)

        # One similarity value per row of the table (the single query row broadcasts)
        scores = F.cosine_similarity(query_vector, embedding_table)

        # Highest similarity first, truncated to the requested number of results
        ranked_rows = torch.argsort(scores, descending=True)[:top_k]

        # tolist() yields plain ints, which are usable as dictionary keys
        return [index_to_title[row] for row in ranked_rows.tolist()]

In [12]:
model_path = model_route + "collaborative_filtering_model.pth"
# Load the entire pickled model object (not just a state_dict). The
# CollaborativeFilteringModel class must already be defined in this session so the
# unpickler can resolve it.
# weights_only=False is passed explicitly: PyTorch releases where weights_only
# defaults to True refuse to unpickle a full nn.Module.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
model_loaded = torch.load(model_path, map_location=device, weights_only=False)
model_loaded.to(device)

CollaborativeFilteringModel(
  (user_embedding): Embedding(118335, 100)
  (item_embedding): Embedding(36537, 100)
  (hidden_layer): Linear(in_features=200, out_features=32, bias=True)
  (relu): ReLU()
  (output_layer): Linear(in_features=32, out_features=1, bias=True)
)

In [13]:
# `parameters` is referenced without being called, so this cell displays the bound
# method's repr (which embeds the module summary) rather than the parameter tensors.
model_loaded.parameters

<bound method Module.parameters of CollaborativeFilteringModel(
  (user_embedding): Embedding(118335, 100)
  (item_embedding): Embedding(36537, 100)
  (hidden_layer): Linear(in_features=200, out_features=32, bias=True)
  (relu): ReLU()
  (output_layer): Linear(in_features=32, out_features=1, bias=True)
)>

In [14]:
# Predict the rating of one known (user, title) pair with the loaded model.
# The index tensors are created directly on the model's device.
user_index = torch.tensor([user_to_index['A30TK6U7DNS82R']], dtype=torch.long, device=device)
item_index = torch.tensor([item_to_index['The Insiders (Insiders (Bloomsbury))']], dtype=torch.long, device=device)
# Same inference setup as the recommendation helpers: eval mode and no autograd tracking
model_loaded.eval()
with torch.inference_mode():
    prediction = model_loaded(user_index, item_index).item()
print(f'Predicted Rating: {prediction:.4f}')

Predicted Rating: 4.3365


In [15]:
# Content-based filtering model
class ContentBasedFilteringModel(nn.Module):
    """Linear scorer over category, author and title embeddings plus a sentiment score.

    The attribute names are kept unchanged because a later cell restores a whole
    pickled instance of this class with ``torch.load``.
    """

    def __init__(self, num_categories, num_authors, num_titles, embedding_dim):
        """Create the three embedding tables and the final linear layer."""
        super().__init__()
        self.category_embedding = nn.Embedding(num_categories, embedding_dim)
        self.author_embedding = nn.Embedding(num_authors, embedding_dim)
        self.title_embedding = nn.Embedding(num_titles, embedding_dim)
        # Three embeddings plus the repeated sentiment block -> 4 * embedding_dim inputs
        self.sentiment_linear = nn.Linear(4 * embedding_dim, 1)

    def forward(self, category_indices, author_indices, title_indices, sentiment_scores):
        """Return one score per row, with a trailing dimension of 1.

        ``sentiment_scores`` is treated as one scalar per row: it gains a second
        dimension and is repeated to the same size as the category embeddings
        before concatenation.
        """
        category_vecs = self.category_embedding(category_indices)
        # expand(other.size()) is the same operation as expand_as(other)
        sentiment_block = sentiment_scores.unsqueeze(1).expand(category_vecs.size())

        features = torch.cat(
            (
                category_vecs,
                self.author_embedding(author_indices),
                self.title_embedding(title_indices),
                sentiment_block,
            ),
            dim=1,
        )
        return self.sentiment_linear(features)

In [16]:
model_path = model_route + "content_based_filtering_model.pth"
# Load the entire pickled model object (not just a state_dict). The
# ContentBasedFilteringModel class must already be defined in this session so the
# unpickler can resolve it.
# weights_only=False is passed explicitly: PyTorch releases where weights_only
# defaults to True refuse to unpickle a full nn.Module.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
cbf_model_loaded = torch.load(model_path, map_location=device, weights_only=False)
cbf_model_loaded.to(device)

ContentBasedFilteringModel(
  (category_embedding): Embedding(1168, 64)
  (author_embedding): Embedding(28713, 64)
  (title_embedding): Embedding(36537, 64)
  (sentiment_linear): Linear(in_features=256, out_features=1, bias=True)
)

In [17]:
def get_collaborative_recommendations(model, title, num_recommendations=100):
    """Return the titles whose item embeddings are most similar to ``title``.

    Args:
        model: object exposing ``eval()`` and ``get_similar_titles(index, top_k=...)``.
        title: exact title; it must be a key of the module-level ``item_to_index``.
        num_recommendations: passed to the model as ``top_k``.

    Returns:
        Whatever ``model.get_similar_titles`` returns for the title's index.

    Raises:
        KeyError: if ``title`` is not present in ``item_to_index``.
    """
    # item_to_index is the title -> item id mapping built in an earlier cell
    query_index = item_to_index[title]

    # Inference only: switch to eval mode and disable autograd tracking
    model.eval()
    with torch.inference_mode():
        return model.get_similar_titles(query_index, top_k=num_recommendations)


In [18]:
# Example query: ask the collaborative model for up to 1000 titles similar to one exact title.
input_title = "From Potter's Field"
collab_recommendations = get_collaborative_recommendations(model_loaded, input_title,num_recommendations=1000)

In [19]:
# Preview the ten highest-ranked titles; the recorded output starts with the query title itself.
collab_recommendations[:10]

["From Potter's Field",
 'Railroad Ferries of the Hudson and Stories of a Deck Hand',
 'Bad Dogs Need It: Good Dogs Deserve It: An Encyclopedia of Behavior Problems and Training Solutions',
 'Yamaha Outboard Shop Manual: 2-90 Hp Two-Stroke, 1999-2002 (Includes Jet Drives (Clymer Marine Repair)',
 'Hoax: The Inside Story of the Howard Hughes-Clifford Irving Affair',
 "Enough About You, Let's Talk About Me: How to Recognize and Manage the Narcissists in Your Life",
 'Dave Barry Does Japan',
 'A Search of African American Life, Achievement And Culture: First Search',
 'Lyre of Orpheus (G K Hall Large Print Book Series)',
 'The Line of Beauty: A Novel']

In [20]:
# Mean sentiment score per (Title, authors, categories) combination, as a flat table
title_sentiment_aggregated = (
    merged_df
    .groupby(['Title', 'authors', 'categories'], as_index=False)['sentiment_score']
    .mean()
)

In [21]:
# Vocabularies for the content-based model: categories and authors come from the
# full review table, titles from the aggregated per-title table.
unique_categories = merged_df['categories'].unique()
unique_authors = merged_df['authors'].unique()
unique_titles = title_sentiment_aggregated['Title'].unique()

# Dense integer ids, numbered in the order returned by unique()
category_to_index = dict(zip(unique_categories, range(len(unique_categories))))
author_to_index = dict(zip(unique_authors, range(len(unique_authors))))
title_to_index = dict(zip(unique_titles, range(len(unique_titles))))

In [22]:
# Encode every row of the aggregated table as category / author / title id tensors
category_indices = torch.tensor(
    list(map(category_to_index.__getitem__, title_sentiment_aggregated['categories'])),
    dtype=torch.long,
)
author_indices = torch.tensor(
    list(map(author_to_index.__getitem__, title_sentiment_aggregated['authors'])),
    dtype=torch.long,
)
title_indices = torch.tensor(
    list(map(title_to_index.__getitem__, title_sentiment_aggregated['Title'])),
    dtype=torch.long,
)

In [23]:
def get_content_based_recommendations(content_based_model, collaborative_recommendations):
    """Re-rank candidate titles by the content-based model's score, highest first.

    Args:
        content_based_model: module called as
            ``model(category_indices, author_indices, title_indices, sentiment_scores)``
            and returning one score per candidate.
        collaborative_recommendations: list of exact titles to re-rank.

    Returns:
        The same titles sorted by descending score; equal scores fall back to
        descending title order.

    Raises:
        KeyError: if a title (or its category/author) is missing from the
            module-level lookup tables.

    Relies on the module-level ``title_sentiment_aggregated``, ``category_to_index``,
    ``author_to_index``, ``title_to_index`` and ``device``.
    """
    # Build the title -> details dictionary only for the requested titles instead of
    # converting the whole aggregated table on every call.
    requested_rows = title_sentiment_aggregated[
        title_sentiment_aggregated['Title'].isin(list(collaborative_recommendations))
    ]
    title_details = requested_rows.set_index('Title')[
        ['categories', 'authors', 'sentiment_score']
    ].to_dict(orient='index')

    # Details in the same order as the candidate list
    details = [title_details[title] for title in collaborative_recommendations]

    # Encode the candidates as tensors created directly on the target device
    category_indices = torch.tensor(
        [category_to_index[detail['categories']] for detail in details],
        dtype=torch.long, device=device)
    author_indices = torch.tensor(
        [author_to_index[detail['authors']] for detail in details],
        dtype=torch.long, device=device)
    title_indices = torch.tensor(
        [title_to_index[title] for title in collaborative_recommendations],
        dtype=torch.long, device=device)
    sentiment_scores = torch.tensor(
        [detail['sentiment_score'] for detail in details],
        dtype=torch.float32, device=device)

    content_based_model.eval()
    with torch.inference_mode():
        predictions = content_based_model(category_indices, author_indices, title_indices, sentiment_scores)

    # Sort plain Python floats rather than 1-element tensors: one transfer to the
    # host instead of a tensor comparison and bool conversion per sort comparison.
    scores = predictions.reshape(-1).tolist()
    ranked_pairs = sorted(zip(scores, collaborative_recommendations), reverse=True)
    return [title for _, title in ranked_pairs]


In [24]:
# Re-rank the collaborative candidates from the earlier cell with the content-based model.
r = get_content_based_recommendations(cbf_model_loaded, collab_recommendations)

In [25]:
# Preview the ten highest-scoring titles after re-ranking.
r[:10]

['Encyclopedia of 7700 Illustrations',
 'North from Mexico: The Spanish-Speaking People of the United States',
 'Spontaneous Remission: An Annotated Bibliography',
 'Introduction to the Biology of Marine Life',
 'Higher Ground - A Novel in Three Parts',
 'Classic Wiley: A Lifetime of Punchers, Players, Punks and Prophets (Great American Sportswriters)',
 'The battle of Dienbienphu',
 'Morning, Noon and Night',
 'Dr. Frankenstein and World Systems',
 'Cathay: A Journey in Search of Old China (Destinations)']

In [26]:
def partial_name_matching(partial_name, all_books):
    """Return the first title that contains ``partial_name``, ignoring case.

    Args:
        partial_name: substring to look for.
        all_books: iterable of titles; entries that are not strings are skipped.

    Returns:
        The first matching title in iteration order, so the result is the same on
        every run. When nothing matches, the tuple ``("404", "Invalid !! ")`` is
        returned instead of a title.
    """
    needle = partial_name.lower()
    for title in all_books:
        # Skip missing/non-text entries (e.g. NaN) instead of failing on .lower()
        if isinstance(title, str) and needle in title.lower():
            print("You probably searched for :", title)
            return title
    return "404", "Invalid !! "


In [27]:
# Example lookup: resolve a partial, lower-case query against every title in the table.
i = partial_name_matching('harry potter',merged_df['Title'])
# Bare expression: displays the matched title as the cell output.
i

You probably searched for : An Unofficial Muggle's Guide to the Wizarding World: Exploring the Harry Potter Universe


"An Unofficial Muggle's Guide to the Wizarding World: Exploring the Harry Potter Universe"

In [28]:
def recommendation_system(title, num_recommendations=30):
    """Recommend titles for a (possibly partial) title query.

    The query is resolved to a full title with ``partial_name_matching``, candidates
    are fetched from the collaborative model, and the content-based model re-ranks
    them.

    Args:
        title: full or partial title, matched case-insensitively.
        num_recommendations: number of candidates requested from the collaborative model.

    Returns:
        The re-ranked list of titles, or an empty list when no title matches.
    """
    input_title = partial_name_matching(title, merged_df['Title'])
    # partial_name_matching signals "no match" with a tuple rather than a title;
    # stop here instead of passing that tuple on as a dictionary key.
    if not isinstance(input_title, str):
        print("No title matches :", title)
        return []
    candidates = get_collaborative_recommendations(model_loaded, input_title, num_recommendations)
    return get_content_based_recommendations(cbf_model_loaded, candidates)

# Example end-to-end call with a partial title and the default number of candidates.
recommendation_system("Superman")

You probably searched for : Superman: No Limits!


["The Actor's Book of Contemporary Stage Monologues (More Than 150 Monologues from Over 70 Playwrights)",
 "A Midsummer Night's Dream (Cliffs Complete)",
 'American Bee: The National Spelling Bee and the Culture of Word Nerds',
 'Life. Be There at Ten Til.: A Collection of Homegrown Wisdom',
 'Love Spell',
 'Your Perfect Right: Assertiveness and Equality in Your Life and Relationships',
 'Rangers and sovereignty,',
 'Superman: No Limits!',
 'Demon Box',
 "Dark Canyon (THE COLLECTOR'S EDITION)",
 'Killing Orders',
 "Caring for Your Child with Severe Food Allergies: Emotional Support and Practical Advice from a Parent Who's Been There",
 'The Franco-Prussian War 1870-1871 (Essential Histories)',
 'Teach Yourself Turbo C++ 4.5 for Windows in 21 Days (Teach Yourself in 21 Days)',
 "Queen's Gambit",
 'Kingdom Hearts Official Strategy Guide',
 'The Essential Whitewater Kayaker: A Complete Course',
 'Blame! Vol. 1',
 "You Just Don't Understand",
 'The hockey sweater',
 "Zane's Gettin' Buck Wi