In [None]:
import json
import re
import string

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.neighbors import NearestNeighbors

## Data preprocessing

In [None]:
# Read the book metadata and the user ratings; both paths are relative to the
# notebook's working directory.
books_df, user_ratings_df = (
    pd.read_csv(csv_name) for csv_name in ('books_enriched.csv', 'ratings.csv')
)

In [None]:
# Structural overview of the book metadata.
print(f"Number of books: {books_df.shape[0]}")
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
books_df.info()
display(books_df.describe())
# Per-column count of missing values.
print(books_df.isnull().sum())

In [None]:
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the extra "None" line that print(info()) would emit.
user_ratings_df.info()
print(f"\nNumber of missing values:\n{user_ratings_df.isnull().sum()}")

In [None]:
def format_author_names(authors_str):
    """Turn a stringified list of author names into a comma-separated string.

    Parameters
    ----------
    authors_str : str
        Text such as ``"['Jane Austen', 'Anna Quindlen']"``. Any other value
        (including non-strings such as ``None`` or NaN) is passed through.

    Returns
    -------
    str
        The names joined with ``', '`` when ``authors_str`` parses to a list or
        tuple; otherwise ``authors_str`` itself, unchanged.
    """
    import ast  # local import: keeps this cell self-contained

    # Missing values (None / NaN) have no .replace(); hand them back untouched.
    if not isinstance(authors_str, str):
        return authors_str

    try:
        # Parse as a Python literal first. Swapping ' for " and parsing as JSON
        # corrupts names that contain an apostrophe, e.g. "O'Connor".
        parsed = ast.literal_eval(authors_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        try:
            # Fallback kept for inputs that are JSON but not Python literals.
            parsed = json.loads(authors_str.replace("'", '"'))
        except json.JSONDecodeError:
            return authors_str

    if isinstance(parsed, (list, tuple)):
        return ', '.join(str(name) for name in parsed)
    # Parsed to something that is not a list of names: keep the raw text.
    return authors_str

In [None]:
# Remove the two leftover index columns. errors='ignore' keeps the cell
# re-runnable: once the columns are gone, a plain drop() would raise KeyError.
books_df = books_df.drop(columns=['Unnamed: 0', 'index'], errors='ignore')

# Text columns: missing values become an empty string ('Unknown' for isbn).
books_df['description'] = books_df['description'].fillna('')
books_df['isbn'] = books_df['isbn'].fillna('Unknown')
# Numeric columns: missing values become 0 so the column can be cast to int.
books_df['isbn13'] = books_df['isbn13'].fillna(0).astype(int)
books_df['original_publication_year'] = books_df['original_publication_year'].fillna(0).astype(int)
books_df['original_title'] = books_df['original_title'].fillna('')
books_df['pages'] = books_df['pages'].fillna(0).astype(int)
# Stringified author lists -> "Name A, Name B".
books_df['authors'] = books_df['authors'].apply(format_author_names)
# Collapse every run of whitespace in the title into a single space.
books_df['title'] = books_df['title'].str.replace(r"\s+", " ", regex=True)

In [None]:
def normalize_text(text):
    """Return ``text`` lower-cased with every ``string.punctuation`` character removed."""
    lowered = text.lower()
    kept_chars = [ch for ch in lowered if ch not in string.punctuation]
    return ''.join(kept_chars)

# Search key for each book: its title passed through normalize_text.
books_df['normalized_title'] = books_df['title'].map(normalize_text)

In [None]:
# TF-IDF model for title search; English stop words are excluded from the vocabulary.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# One row per book, in the same order as the rows of books_df.
tfidf_matrix_titles = tfidf_vectorizer.fit_transform(books_df['normalized_title'])

In [None]:
def search_similar_books_by_title(query, df=books_df, tfidf_matrix=tfidf_matrix_titles, top_n=12, similarity_threshold=0.1):
    """Find the books whose titles are most similar to a free-text query.

    Parameters
    ----------
    query : str
        Text typed by the user.
    df : pandas.DataFrame
        Book table whose rows line up positionally with ``tfidf_matrix``.
    tfidf_matrix : sparse matrix
        TF-IDF representation of the titles, produced by ``tfidf_vectorizer``.
    top_n : int
        Maximum number of rows to return.
    similarity_threshold : float
        Matches scoring below this value are discarded.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows, best match first, restricted to the display
        columns; an empty, column-less DataFrame when nothing reaches the
        threshold.
    """
    result_columns = ['book_id', 'title', 'authors', 'average_rating', 'ratings_count', 'image_url', 'pages']

    # Clean the query with the same function used for the indexed titles. The
    # former ASCII-only regex deleted accented letters and tabs that
    # normalize_text keeps, so such queries could never match their own title.
    processed = normalize_text(query)
    query_vec = tfidf_vectorizer.transform([processed])

    cosine_similarities = linear_kernel(query_vec, tfidf_matrix).flatten()
    # Positions ordered from highest to lowest score, then cut at the threshold.
    ranked = cosine_similarities.argsort()[::-1]
    ranked = ranked[cosine_similarities[ranked] >= similarity_threshold]
    top_indices = ranked[:top_n]
    if top_indices.size == 0:
        return pd.DataFrame()

    return df.iloc[top_indices][result_columns]

In [None]:
# Example search: books whose title resembles "1984".
similar_books = search_similar_books_by_title(query="1984")
display(similar_books)

## Collaborative filtering recommendation system using a k-NN model

In [None]:
# Reshape the long ratings table into a users x books frame; pairs without a
# rating are filled with 0.
user_item_matrix = (
    user_ratings_df
    .pivot(index='user_id', columns='book_id', values='rating')
    .fillna(0)
)

# Same values as a scipy CSR matrix (users as rows).
user_item_matrix_sparse = csr_matrix(user_item_matrix.to_numpy())

# Transposed view so that every row is one book's rating vector.
user_item_matrix_sparse_T = user_item_matrix_sparse.T

# Brute-force cosine k-NN over the book vectors; fit() returns the estimator.
model_knn = NearestNeighbors(
    metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1
).fit(user_item_matrix_sparse_T)

In [None]:
# Lookup table: book_id -> position of that book among user_item_matrix's
# columns (equivalently, its row in user_item_matrix_sparse_T).
book_id_to_idx = dict(zip(user_item_matrix.columns, range(len(user_item_matrix.columns))))

def collaborative_recommendations(book_id, top_n=10):
    """Recommend books whose rating vectors are closest to that of ``book_id``.

    Parameters
    ----------
    book_id : int
        Id of the query book.
    top_n : int
        Maximum number of recommendations.

    Returns
    -------
    pandas.DataFrame
        Rows of ``books_df`` for the nearest neighbours, closest first. An
        empty, column-less DataFrame when ``book_id`` has no ratings column.
    """
    if book_id not in book_id_to_idx:
        return pd.DataFrame()

    top_n = max(top_n, 0)
    book_idx = book_id_to_idx[book_id]

    # Request one extra neighbour because the query book is normally among the
    # results, but never more rows than the model was fitted on.
    n_neighbors = min(top_n + 1, user_item_matrix_sparse_T.shape[0])
    _, indices = model_knn.kneighbors(
        user_item_matrix_sparse_T[book_idx].reshape(1, -1), n_neighbors=n_neighbors
    )

    # Remove the query book by identity instead of assuming it is neighbour #0:
    # with tied distances it is not guaranteed to come first.
    neighbor_idxs = [i for i in indices.flatten() if i != book_idx][:top_n]
    neighbor_ids = [user_item_matrix.columns[i] for i in neighbor_idxs]

    matches = books_df[books_df['book_id'].isin(neighbor_ids)]
    # isin() returns rows in books_df order; restore the neighbour ranking so
    # that callers truncating the result keep the closest books.
    rank = {neighbor_id: position for position, neighbor_id in enumerate(neighbor_ids)}
    order = np.argsort(matches['book_id'].map(rank).to_numpy(), kind='stable')
    return matches.iloc[order]

## Content-based recommendation system using TF-IDF

In [None]:
# Document text for the content-based model: description followed by genres.
# Both parts are filled first, because NaN + str yields NaN and
# TfidfVectorizer refuses NaN documents.
books_df['combined_features'] = books_df['description'].fillna('') + " " + books_df['genres'].fillna('')

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(books_df['combined_features'])

# Pairwise similarity between all books; row i corresponds to row i of books_df.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
def content_based_recommendations(book_id, top_n=10, cosine_sim=cosine_sim, df=books_df):
    """Return the ``top_n`` rows of ``df`` most similar to ``book_id``.

    Parameters
    ----------
    book_id : int
        Id of the query book.
    top_n : int
        Maximum number of recommendations.
    cosine_sim : array-like, shape (len(df), len(df))
        Pairwise similarity matrix whose rows and columns follow the row
        order of ``df``.
    df : pandas.DataFrame
        Book table with a ``book_id`` column.

    Returns
    -------
    pandas.DataFrame
        Matching rows, most similar first; an empty, column-less DataFrame when
        ``book_id`` is not present in ``df``.
    """
    # Use the row *position*: cosine_sim and df.iloc are positional, so an
    # index label is only correct while df carries a default RangeIndex.
    positions = np.flatnonzero((df['book_id'] == book_id).to_numpy())
    if positions.size == 0:
        return pd.DataFrame()
    idx = int(positions[0])

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable sort of the negated scores: descending similarity, ties in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query book by position; it is not guaranteed to rank first
    # when another book has an identical similarity score.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    return df.iloc[ranked]


## Content-based recommendation system (revised version)

In [None]:
def content(books):
    """Build the content-similarity matrix and a book_id lookup for ``books``.

    Side effect: adds (or overwrites) a ``content`` column on ``books`` holding
    the authors, title, genres and description joined by single spaces.

    NOTE: the TF-IDF model and the full pairwise matrix are recomputed on every
    call, which is expensive for large frames.

    Parameters
    ----------
    books : pandas.DataFrame
        Needs the columns ``book_id``, ``authors``, ``title``, ``genres`` and
        ``description``.

    Returns
    -------
    cosine : ndarray, shape (len(books), len(books))
        Pairwise linear-kernel similarity of the TF-IDF vectors.
    index : pandas.Series
        Maps ``book_id`` to the row position of that book in ``books`` and in
        ``cosine``.
    """
    text_columns = ['authors', 'title', 'genres', 'description']
    # Build the text row by row and assign it positionally. Wrapping the values
    # in a fresh Series would attach a 0..n-1 index and misalign on any frame
    # that does not use the default RangeIndex.
    rows = books[text_columns].fillna('').astype(str).to_numpy()
    books['content'] = [' '.join(row) for row in rows]

    # min_df=1 keeps every term, which is what min_df=0 was meant to do; current
    # scikit-learn rejects the integer 0 during parameter validation.
    tf_content = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
    tfidf_matrix = tf_content.fit_transform(books['content'])
    cosine = linear_kernel(tfidf_matrix, tfidf_matrix)
    # Store row positions (not index labels): callers use the value to index
    # into ``cosine`` and ``books.iloc``.
    index = pd.Series(np.arange(len(books)), index=books['book_id'])

    return cosine, index

def content_recommendation(books, book_id, n=10):
    """Return the ``n`` rows of ``books`` most similar in content to ``book_id``.

    Parameters
    ----------
    books : pandas.DataFrame
        Book table handed to ``content`` (which adds a ``content`` column).
    book_id : int
        Id of the query book.
    n : int
        Maximum number of recommendations.

    Returns
    -------
    pandas.DataFrame
        Rows of ``books``, most similar first; an empty, column-less DataFrame
        when ``book_id`` is unknown, matching the other recommenders in this
        notebook.
    """
    cosine_sim, indices = content(books)
    if book_id not in indices.index:
        return pd.DataFrame()

    idx = indices[book_id]
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Skip the query book by position rather than dropping whatever sorts first:
    # a book with identical content ties at the top and may precede it.
    book_indices = [position for position, _ in ranked if position != idx][:max(n, 0)]
    return books.iloc[book_indices]

# Example: content-based recommendations for book_id 13.
content_recommendation(books=books_df, book_id=13)

## Content-based recommendations with a popularity-weighted rating filter

In [None]:
def improved_recommendation(book_id, top_n=10, candidate_pool=40):
    """Content-based recommendations re-ranked by a popularity-weighted rating.

    The ``candidate_pool`` books most similar to ``book_id`` are scored with
    ``v/(v+m) * R + m/(v+m) * C``, where ``v`` is the book's ratings count,
    ``R`` its average rating, ``m`` the 75th percentile of ``v`` among the
    candidates and ``C`` the candidates' median average rating. Only candidates
    with ``v >= m`` are kept.

    Parameters
    ----------
    book_id : int
        Id of the query book.
    top_n : int
        Maximum number of rows returned.
    candidate_pool : int
        Number of most-similar books considered before filtering.

    Returns
    -------
    pandas.DataFrame
        Rows of ``books_df`` ordered by descending score; an empty, column-less
        DataFrame when ``book_id`` is unknown.
    """
    cosine_sim, indices = content(books_df)
    if book_id not in indices.index:
        return pd.DataFrame()

    idx = indices[book_id]
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query book explicitly; it does not necessarily sort first
    # when another book has identical content.
    candidate_positions = [position for position, _ in ranked if position != idx][:candidate_pool]

    # .copy() so that adding new_score never writes into a view of books_df.
    candidates = books_df.iloc[candidate_positions][
        ['book_id', 'title', 'authors', 'average_rating', 'ratings_count']
    ].copy()

    v = candidates['ratings_count']
    m = v.quantile(0.75)
    R = candidates['average_rating']
    C = R.median()
    candidates['new_score'] = (v / (v + m) * R) + (m / (m + v) * C)

    qualified = candidates[v >= m].sort_values('new_score', ascending=False)

    # top_n used to be accepted but ignored; it now caps the result.
    return books_df.loc[qualified.head(top_n).index]


In [None]:
# Example: popularity-filtered content recommendations for book_id 13.
improved_recommendation(book_id=13)

## Hybrid model, combining both systems

In [None]:
def hybrid_recommendations(book_id, top_n=10):
    """Combine the improved content-based and the collaborative recommendations.

    The content-based rows come first, followed by the collaborative rows;
    exact duplicate rows are removed and the first ``top_n`` rows are returned.
    """
    frames = [
        improved_recommendation(book_id, top_n=top_n),
        collaborative_recommendations(book_id, top_n=top_n),
    ]
    merged = pd.concat(frames)
    unique_rows = merged.drop_duplicates()
    return unique_rows.iloc[:top_n]

In [None]:
recommended_books = hybrid_recommendations(book_id=13, top_n=20)
# Only the last expression of a cell is rendered, so the row count is printed
# explicitly; as a bare expression in mid-cell it was silently discarded.
print(f"Number of recommendations: {recommended_books.shape[0]}")
recommended_books

## Recommender query testing

In [None]:
# Manual check of the content-based model: fetch 5 recommendations for book_id 13
# and show the selected columns as a list of record dicts (round-tripped through JSON).
# To check the hybrid model instead, swap in the commented-out call below.
# recommended_books = hybrid_recommendations(book_id=6, top_n=5)
recommended_books = content_based_recommendations(book_id=13, top_n=5)
json.loads(recommended_books[['book_id', 'title', 'authors', 'average_rating', 'ratings_count', 'image_url', 'description', 'genres']].to_json(orient='records'))

In [None]:
def get_recommendations_as_json(book_id, model_type='hybrid', top_n=20):
    """Run one of the recommenders and serialise its result as JSON.

    Parameters
    ----------
    book_id : int
        Id of the query book.
    model_type : str
        One of ``'hybrid'``, ``'collaborative'``, ``'content'`` or ``'improved'``.
    top_n : int
        Number of recommendations requested from the chosen recommender.

    Returns
    -------
    str
        A JSON array of records (``'[]'`` when there are no recommendations),
        or the plain string ``"Invalid model type"`` for an unknown
        ``model_type``.
    """
    columns_to_include = ['book_id', 'title', 'authors', 'average_rating', 'ratings_count', 'image_url', 'pages']
    if model_type == 'hybrid':
        recommendations_df = hybrid_recommendations(book_id, top_n)
    elif model_type == 'collaborative':
        recommendations_df = collaborative_recommendations(book_id, top_n)
    elif model_type == 'content':
        recommendations_df = content_based_recommendations(book_id, top_n)
    elif model_type == 'improved':
        recommendations_df = improved_recommendation(book_id, top_n)
    else:
        return "Invalid model type"

    # Recommenders signal "nothing found" with a column-less pd.DataFrame();
    # selecting columns from it would raise KeyError, so return an empty array.
    if recommendations_df.empty:
        return '[]'

    return recommendations_df[columns_to_include].to_json(orient='records')

# Example usage: request 10 'improved' recommendations for book_id 13 as a JSON
# string, then parse it back into Python objects so the cell displays the records.
json_output = get_recommendations_as_json(book_id=13, model_type='improved', top_n=10)
json.loads(json_output)