In [40]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [41]:
# Load the book catalogue. Rows that cannot be parsed are skipped rather than
# aborting the whole read (on_bad_lines="skip").
df = pd.read_csv(
    r"books.csv",
    on_bad_lines="skip"
)

# The CSV header carries stray padding (the page-count column is read as
# "  num_pages"), so normalise the names to allow lookups without guessing
# at invisible whitespace.
df.columns = df.columns.str.strip()

print(df.shape)
print(df.head())
print(df.columns)

(11123, 12)
   bookID                                              title  \
0       1  Harry Potter and the Half-Blood Prince (Harry ...   
1       2  Harry Potter and the Order of the Phoenix (Har...   
2       4  Harry Potter and the Chamber of Secrets (Harry...   
3       5  Harry Potter and the Prisoner of Azkaban (Harr...   
4       8  Harry Potter Boxed Set  Books 1-5 (Harry Potte...   

                      authors  average_rating        isbn         isbn13  \
0  J.K. Rowling/Mary GrandPré            4.57  0439785960  9780439785969   
1  J.K. Rowling/Mary GrandPré            4.49  0439358078  9780439358071   
2                J.K. Rowling            4.42  0439554896  9780439554893   
3  J.K. Rowling/Mary GrandPré            4.56  043965548X  9780439655484   
4  J.K. Rowling/Mary GrandPré            4.78  0439682584  9780439682589   

  language_code    num_pages  ratings_count  text_reviews_count  \
0           eng          652        2095690               27591   
1           

In [42]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()


bookID                0
title                 0
authors               0
average_rating        0
isbn                  0
isbn13                0
language_code         0
  num_pages           0
ratings_count         0
text_reviews_count    0
publication_date      0
publisher             0
dtype: int64

In [43]:
# Keep only the columns the recommender uses. The explicit .copy() makes `df`
# an independent frame, so the assignments below (and later column additions)
# modify it directly instead of a slice of the loaded data, which is what
# triggers pandas' SettingWithCopyWarning.
text_columns = ['title', 'authors', 'publisher']
numeric_columns = ['average_rating', 'ratings_count']
df = df[text_columns + numeric_columns].copy()

# Missing text becomes an empty string and missing numbers become 0.
df[text_columns] = df[text_columns].fillna('')
df[numeric_columns] = df[numeric_columns].fillna(0)


In [44]:
# Build the text that represents each book: title, authors and publisher
# joined with single spaces.
title_and_authors = df['title'] + ' ' + df['authors']
df['content'] = title_and_authors + ' ' + df['publisher']

In [45]:
# Turn each book's `content` text into a TF-IDF vector, ignoring English
# stop words. Row i of `tfidf_matrix` corresponds to row i of `df`.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["content"])

In [46]:
# Pairwise cosine similarity between every pair of books. With a single
# argument the matrix is compared against itself, so entry [i, j] is the
# similarity of book i to book j.
cosine_sim = cosine_similarity(tfidf_matrix)

In [47]:
# Map each title to the positional row of its first occurrence in `df`.
# Positions (not index labels) are stored because the lookup result is used
# to index the similarity matrix and with `df.iloc`.
indices = pd.Series(np.arange(len(df)), index=df['title'])

# De-duplicate on the *titles* (the index). Series.drop_duplicates() compares
# the values, which are all distinct row numbers, so it would leave repeated
# titles in place and make `indices.get(title)` return a Series.
indices = indices[~indices.index.duplicated(keep='first')]


In [48]:
def recommend_books(book_title, num_recommendations=5, candidate_pool=20):
    """Recommend books similar to ``book_title``, best-rated first.

    The most similar books (by ``cosine_sim``) form a candidate pool, which
    is then re-ranked by ``average_rating``.

    Parameters
    ----------
    book_title : str
        Exact title to look up in ``indices``.
    num_recommendations : int, default 5
        Number of rows to return.
    candidate_pool : int, default 20
        How many of the most similar books are considered before re-ranking
        by rating. Raised to ``num_recommendations`` when that is larger.

    Returns
    -------
    pandas.DataFrame or str
        ``title``, ``authors`` and ``average_rating`` of the recommendations,
        or the string ``"Book not found in dataset."`` when the title is
        not present in ``indices``.
    """
    idx = indices.get(book_title)

    if idx is None:
        return "Book not found in dataset."

    # A title that appears several times in `indices` yields a Series of
    # positions; use the first occurrence so `idx` is always a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx]).ravel()

    # Most similar first; a stable sort keeps dataset order among ties.
    ranked = np.argsort(-scores, kind='stable')

    # Ignore the book itself. This must filter on `idx`: slicing off the
    # first element before sorting would discard dataset row 0 instead.
    ranked = ranked[ranked != idx]

    # Get top candidates
    pool_size = max(candidate_pool, num_recommendations)
    top_indices = ranked[:pool_size]

    # Re-rank the candidates by average rating.
    recommendations = df.iloc[top_indices].sort_values(
        by='average_rating', ascending=False
    )

    return recommendations[['title', 'authors', 'average_rating']].head(num_recommendations)


In [49]:
def precision_at_k(book_title, k=5, rating_threshold=4.0):
    """Share of the top-``k`` recommendations rated at or above a threshold.

    Calls ``recommend_books(book_title, num_recommendations=k)`` and counts
    the returned rows whose ``average_rating`` is at least
    ``rating_threshold``. The count is divided by ``k`` itself, not by the
    number of rows actually returned.

    If ``recommend_books`` returns a string (its "not found" message), that
    string is returned unchanged.
    """
    top_k = recommend_books(book_title, num_recommendations=k)

    # A string result is a message, not a table of recommendations.
    if isinstance(top_k, str):
        return top_k

    is_relevant = top_k['average_rating'] >= rating_threshold
    hits = int(is_relevant.sum())
    return hits / k


In [50]:
# Try the recommender on one title: print its precision@5, then display the
# recommended books as the cell output.
print(precision_at_k(book_title="The Hobbit", k=5))
recommend_books(book_title="The Hobbit")


1.0


Unnamed: 0,title,authors,average_rating
4272,The Lord of the Rings / The Hobbit,J.R.R. Tolkien,4.59
21,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,J.R.R. Tolkien,4.59
4255,The Return of the King (The Lord of the Rings ...,J.R.R. Tolkien,4.53
4597,Pictures by J.R.R. Tolkien,J.R.R. Tolkien/Christopher Tolkien,4.47
1699,Poems From The Hobbit,J.R.R. Tolkien,4.3
