In [None]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests

In [None]:
# Read the books CSV into the DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer='data/books_1.Best_Books_Ever.csv')

In [None]:
def clean_text(text_list):
    """Turn a list of phrases into one space-separated string of tokens.

    Each phrase is lower-cased and its internal spaces are replaced with
    underscores, so a multi-word phrase stays a single whitespace-delimited
    token in the result.

    Args:
        text_list: Iterable of strings.

    Returns:
        The normalised phrases joined by single spaces.
    """
    tokens = []
    for phrase in text_list:
        tokens.append(phrase.lower().replace(' ', '_'))
    return ' '.join(tokens)

# Normalise the two text columns in place. Missing values are treated as
# empty strings so that `.split` is never called on NaN (which would raise
# AttributeError).
# NOTE: this overwrites the raw columns, so re-running this cell on an
# already-cleaned frame glues the tokens together with underscores --
# reload `df` before re-running.
df['genres'] = df['genres'].fillna('').apply(lambda x: clean_text(x.split(', ')))
df['setting'] = df['setting'].fillna('').apply(lambda x: clean_text(x.split(', ')))

# One document per book: genre tokens followed by setting tokens.
df['combined_text'] = df['genres'] + ' ' + df['setting']

In [None]:
# Fit a TF-IDF model (English stop words removed) on the per-book documents
# and keep the resulting document-term matrix.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(raw_documents=df['combined_text'])

In [None]:
# def batch_cosine_similarity(matrix, batch_size=100):
#     cosine_sim = np.zeros((matrix.shape[0], matrix.shape[0]))
#     for start_row in range(0, matrix.shape[0], batch_size):
#         end_row = min(start_row + batch_size, matrix.shape[0])
#         batch_cosine_sim = cosine_similarity(matrix[start_row:end_row], matrix)
#         cosine_sim[start_row:end_row, :] = batch_cosine_sim
#     return cosine_sim
# cosine_sim = batch_cosine_similarity(tfidf_matrix, batch_size=100)

# def save_cosine_similarity_matrix(cosine_sim, filename):
#     try:
#         directory = os.path.dirname(filename)
#         os.makedirs(directory, exist_ok=True)
#         np.save(filename, cosine_sim, allow_pickle=True)
#         print(f"Cosine similarity matrix saved to {filename}")
#     except Exception as e:
#         print(f"Error while saving cosine similarity matrix: {e}")

# save_cosine_similarity_matrix(cosine_sim, '/Users/vladimirmahlin/Documents/Dev/Python/books/cosine_similarity_matrix.npy')

In [None]:
def search_and_fetch_book_details(title):
    """Look up a title on Open Library and return its subjects and places.

    Only the first search hit is used.

    Args:
        title: Book title to search for.

    Returns:
        A ``(subjects, places)`` tuple of lists taken from the first matching
        document's ``subject`` and ``place`` fields. Both lists are empty
        when the request fails or nothing matches; a message is printed in
        either case.
    """
    search_url = "https://openlibrary.org/search.json"
    # Let requests build the query string so characters such as '&', '#' or
    # '+' inside the title are percent-encoded instead of corrupting the URL.
    # The two fields read below are requested explicitly, and a single hit
    # is enough because only the first document is used.
    params = {"title": title, "fields": "subject,place", "limit": 1}

    try:
        # Without a timeout a stalled connection would block the notebook
        # indefinitely.
        search_response = requests.get(search_url, params=params, timeout=10)
    except requests.RequestException:
        print("Failed to search for books.")
        return [], []

    if search_response.status_code != 200:
        print("Failed to search for books.")
        return [], []

    try:
        search_data = search_response.json()
    except ValueError:
        # The body was not valid JSON.
        print("Failed to search for books.")
        return [], []

    # Check the document list itself rather than 'numFound' so a missing key
    # or an empty 'docs' list cannot raise.
    docs = search_data.get('docs') or []
    if not docs:
        print("No books found with the given title.")
        return [], []

    book = docs[0]
    subjects = book.get('subject', [])
    places = book.get('place', [])
    return subjects, places

In [None]:
# Memory-map the saved similarity matrix instead of reading all of it into
# RAM; data is pulled from disk only for the rows that get indexed. The
# array is opened read-only.
cosine_sim = np.load('cosine_similarity_matrix.npy', mmap_mode='r')

In [None]:
def recommend_books_from_openlibrary(title, tfidf_vectorizer, cosine_sim, df, tfidf_matrix=None):
    """Print up to ten catalogue books similar to an Open Library title.

    The title's subjects and places are fetched from Open Library, turned
    into a query document with ``clean_text`` (the same normalisation applied
    to the ``genres``/``setting`` columns), and compared against every row of
    ``df['combined_text']`` by TF-IDF cosine similarity.

    Args:
        title: Title to look up on Open Library.
        tfidf_vectorizer: Fitted vectorizer used to embed the query (and the
            corpus, when ``tfidf_matrix`` is not supplied).
        cosine_sim: Unused by this function; kept so existing callers keep
            working.
        df: DataFrame providing the 'title', 'genres', 'setting' and
            'combined_text' columns.
        tfidf_matrix: Optional precomputed result of
            ``tfidf_vectorizer.transform(df['combined_text'])``. When omitted
            the corpus is re-vectorised on every call.

    Returns:
        None. Results (or an explanatory message) are printed.
    """
    subjects, places = search_and_fetch_book_details(title)

    if not subjects and not places:
        print("No detailed information found for this title.")
        return

    # Normalise each phrase separately (spaces -> underscores) and only then
    # join with spaces. Joining first and replacing afterwards would glue
    # every subject and place into one giant token that matches nothing in
    # the vocabulary, leaving all similarity scores at zero.
    combined_text = clean_text(subjects + places)
    query_vector = tfidf_vectorizer.transform([combined_text])

    if tfidf_matrix is None:
        tfidf_matrix = tfidf_vectorizer.transform(df['combined_text'])
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Highest scores first; rows with zero similarity share no term with the
    # query, so listing them would be arbitrary.
    top_positions = scores.argsort()[-10:][::-1]
    book_indices = [pos for pos in top_positions if scores[pos] > 0]
    if not book_indices:
        print("No similar books found for this title.")
        return

    recommended_books = df.iloc[book_indices]

    print("Recommended Books from OpenLibrary:")
    for index, row in recommended_books.iterrows():
        genres = '; '.join(row['genres'].split(' ')).capitalize()
        setting = '; '.join(row['setting'].split(' ')).capitalize()
        print(f"\n{row['title']}\n   Genres: {genres}\n   Setting: {setting}")
        print("-" * 60)

In [None]:
def recommend_books_from_db(title, cosine_sim=cosine_sim, df=df):
    """Print the ten books most similar to the first catalogue match for ``title``.

    Args:
        title: Full or partial title. It is matched case-insensitively as a
            literal substring of ``df['title']``; the first matching row is
            used as the query book.
        cosine_sim: Similarity matrix indexed by row position; row ``i`` is
            expected to describe the ``i``-th row of ``df``. Defaults to the
            module-level matrix bound when this function is defined.
        df: DataFrame providing the 'title', 'genres' and 'setting' columns.
            Defaults to the module-level frame bound at definition time.

    Returns:
        None. Results (or an explanatory message) are printed.
    """
    title = title.lower()
    # regex=False: match the query literally so titles containing regex
    # metacharacters such as '(', '+' or '?' neither fail nor mis-match.
    is_match = df['title'].str.lower().str.contains(title, na=False, regex=False)
    # Work with row positions rather than index labels: both the similarity
    # matrix and `.iloc` below are positional.
    match_positions = np.flatnonzero(is_match.to_numpy(dtype=bool))

    if match_positions.size == 0:
        print(f"No books found with the title '{title}' in your database.")
        return

    title_index = int(match_positions[0])
    if title_index >= cosine_sim.shape[0]:
        print(f"Invalid index: {title_index}")
        return

    sim_row = np.asarray(cosine_sim[title_index])
    # Stable descending sort: tied scores keep their row order.
    ranked = np.argsort(-sim_row, kind='stable')
    # Drop the query book explicitly instead of assuming it sorts first;
    # another book with an equal score could otherwise take its place.
    book_indices = ranked[ranked != title_index][:10]
    recommended_books = df[['title', 'genres', 'setting']].iloc[book_indices]

    print("Recommended Books from Your Database:")
    for idx, row in recommended_books.iterrows():
        # The cleaned columns are space-separated tokens, so split on ' '.
        genres = '; '.join(row['genres'].split(' ')).capitalize()
        setting = '; '.join(row['setting'].split(' ')).capitalize()
        print(f"\n{row['title']}\n   Genres: {genres}\n   Setting: {setting}")
        print("-" * 60)

In [None]:
# Example: recommendations looked up in the loaded similarity matrix
recommend_books_from_db("Crime and Punishment", cosine_sim=cosine_sim)

In [None]:
# Example: recommendations driven by Open Library subject/place metadata
recommend_books_from_openlibrary(
    title="Crime and Punishment",
    tfidf_vectorizer=tfidf,
    cosine_sim=cosine_sim,
    df=df,
)