In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from typing import List, Tuple

class BookRecommender:
    def __init__(self):
        self.books_df = None
        self.tfidf_matrix = None
        self.cosine_sim = None
        self.indices = None
    
    def load_and_preprocess_data(self, file_path: str) -> None:
        """
        Load and preprocess the books dataset
        """
        try:
            # Read CSV file with error handling
            self.books_df = pd.read_csv(file_path, 
                                         sep=';', 
                                         encoding='ISO-8859-1',  # Changed encoding to ISO-8859-1 to avoid decoding issues
                                         quoting=1,  # Handle quotes
                                         escapechar='\\',       # Handle escape characters
                                         on_bad_lines='skip')   # Skip bad lines
            
            # Clean column names
            self.books_df.columns = self.books_df.columns.str.strip('"')
            
            # Drop any rows with missing values
            self.books_df = self.books_df.dropna(subset=['Book-Title', 'Book-Author', 'Publisher', 'Year-Of-Publication'])
            
            # Create a combined features column for TF-IDF
            self.books_df['combined_features'] = self.books_df.apply(
                lambda x: f"{x['Book-Author']} {x['Publisher']} {x['Year-Of-Publication']}", 
                axis=1
            )
            
            # Create TF-IDF matrix as a sparse matrix to save memory
            tfidf = TfidfVectorizer(stop_words='english')
            self.tfidf_matrix = tfidf.fit_transform(self.books_df['combined_features'])
            
            # Calculate cosine similarity matrix for sparse matrix
            self.cosine_sim = cosine_similarity(self.tfidf_matrix, self.tfidf_matrix)
            
            # Create reverse mapping of book titles and indices
            self.indices = pd.Series(
                self.books_df.index, 
                index=self.books_df['Book-Title']
            ).drop_duplicates()
            
            print(f"Successfully loaded {len(self.books_df)} books.")
            
        except Exception as e:
            print(f"Error loading data: {str(e)}")
            raise
    
    def get_recommendations(self, title: str, n_recommendations: int = 5) -> List[Tuple[str, float]]:
        """
        Get book recommendations based on title similarity
        """
        try:
            # Get the index of the book
            idx = self.indices[title]
            
            # Get similarity scores for all books
            sim_scores = list(enumerate(self.cosine_sim[idx]))
            
            # Sort books based on similarity scores
            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
            
            # Get top N most similar books (excluding the input book)
            sim_scores = sim_scores[1:n_recommendations + 1]
            
            # Get book indices and similarity scores
            book_indices = [i[0] for i in sim_scores]
            similarity_scores = [i[1] for i in sim_scores]
            
            # Return recommended books with their similarity scores
            recommendations = [
                (self.books_df['Book-Title'].iloc[i], score) 
                for i, score in zip(book_indices, similarity_scores)
            ]
            
            return recommendations
            
        except KeyError:
            return [("Book not found in database", 0.0)]
    
    def get_book_details(self, title: str) -> dict:
        """
        Get detailed information about a specific book
        """
        try:
            book_info = self.books_df[self.books_df['Book-Title'] == title].iloc[0]
            return {
                'Title': book_info['Book-Title'],
                'Author': book_info['Book-Author'],
                'Year': book_info['Year-Of-Publication'],
                'Publisher': book_info['Publisher'],
                'ISBN': book_info['ISBN']
            }
        except IndexError:
            return {"Error": "Book not found in database"}
    
    def get_author_recommendations(self, author: str, n_recommendations: int = 5) -> List[str]:
        """
        Get recommendations for other books by the same author
        """
        author_books = self.books_df[
            self.books_df['Book-Author'].str.lower() == author.lower()
        ]['Book-Title'].tolist()
        
        return author_books[:n_recommendations]
    
    def get_similar_by_year(self, year: int, n_recommendations: int = 5) -> List[str]:
        """
        Get recommendations for books published in the same year
        """
        year_books = self.books_df[
            self.books_df['Year-Of-Publication'] == str(year)
        ]['Book-Title'].tolist()
        
        return year_books[:n_recommendations]


In [2]:
# Initialize and load data
recommender = BookRecommender()
recommender.load_and_preprocess_data('books.csv')

Error loading data: Unable to allocate 65.7 GiB for an array with shape (8821959009,) and data type int64


MemoryError: Unable to allocate 65.7 GiB for an array with shape (8821959009,) and data type int64