In [2]:
import pandas as pd
import numpy as np
import warnings 
# NOTE(review): 'ignore' with no category/module filter silences every warning
# for the rest of the session, so deprecation or dtype warnings raised by later
# cells will not be shown. Consider narrowing this to specific categories.
warnings.filterwarnings('ignore')

In [7]:
# Load the books catalogue into `df`. Lines the CSV parser cannot handle are
# dropped (on_bad_lines='skip') instead of aborting the load.
# NOTE(review): the path below is absolute and specific to one machine, so the
# notebook will not run elsewhere without editing it.
# NOTE(review): the df.info() output further down shows the num_pages column
# name carrying leading whitespace; it has to be addressed with that padding.
df = pd.read_csv(
    filepath_or_buffer=r'C:\Users\siddh\OneDrive\Desktop\project\Book Recomender\dataset\books.csv',
    on_bad_lines='skip',
)


In [8]:
# Sanity check after loading: print the (rows, columns) tuple, then let the
# first five rows render as the cell's output value.
print(df.shape)
df.head()


(11123, 12)


Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [9]:
# Per-column dtype and non-null count, used to confirm which columns are text
# (object) before they are concatenated into a single feature string.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11123 entries, 0 to 11122
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              11123 non-null  int64  
 1   title               11123 non-null  object 
 2   authors             11123 non-null  object 
 3   average_rating      11123 non-null  float64
 4   isbn                11123 non-null  object 
 5   isbn13              11123 non-null  int64  
 6   language_code       11123 non-null  object 
 7     num_pages         11123 non-null  int64  
 8   ratings_count       11123 non-null  int64  
 9   text_reviews_count  11123 non-null  int64  
 10  publication_date    11123 non-null  object 
 11  publisher           11123 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.0+ MB


In [10]:
# Count missing values per column; string concatenation with a missing value
# would propagate NaN into the combined feature text, so this should be all zeros.
df.isnull().sum()

bookID                0
title                 0
authors               0
average_rating        0
isbn                  0
isbn13                0
language_code         0
  num_pages           0
ratings_count         0
text_reviews_count    0
publication_date      0
publisher             0
dtype: int64

In [11]:
# Build the single text field that the TF-IDF vectorizer is fitted on.
# Every part must be separated by a space: without a separator the last word of
# the publisher and the language code fuse into one token (for example
# "Scholastic" + "eng" -> "Scholasticeng"), which loses both the publisher word
# and the language code as features.
df['combined_features'] = (
    df['title']
    + ' ' + df['authors']
    + ' ' + df['publisher']
    + ' ' + df['language_code']
)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [16]:
# Vectorise the combined text of every book. stop_words='english' asks
# scikit-learn to drop its built-in English stop-word list before weighting.
tfidf = TfidfVectorizer(stop_words='english')
# fit_transform learns the vocabulary from the column and returns one row of
# TF-IDF weights per DataFrame row, in the same row order as `df`.
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

In [19]:
# Pairwise dot products between every pair of book vectors; row i holds the
# similarity of book i to every book (including itself).
# NOTE(review): a dot product equals cosine similarity only when the rows are
# unit-length, which relies on TfidfVectorizer's documented default norm='l2' —
# confirm if the vectorizer settings are ever changed.
# NOTE(review): the result is a dense n_books x n_books array; with the 11,123
# rows reported above that is roughly 1.2e8 entries (about 1 GB if float64).
cosine_sim = linear_kernel(tfidf_matrix,tfidf_matrix)

In [35]:
import difflib

def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Recommend the books most similar to the catalogue entry closest to ``title``.

    The query is fuzzy-matched against the titles in the global ``df``; among
    the close matches, the row with the highest ``ratings_count`` is used as
    the reference book, and the rows with the highest similarity to it are
    returned.

    Parameters
    ----------
    title : str
        Free-text title to look up. It does not need to match exactly.
    cosine_sim : array-like of shape (n_books, n_books)
        Pairwise similarity matrix. Row/column ``i`` is assumed to correspond
        to the ``i``-th row of ``df`` by position. The default is bound when
        this function is defined, so re-run the definition after recomputing
        the global matrix.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or list
        The ``title``, ``authors`` and ``publisher`` columns of the
        recommended rows, most similar first. An empty list is returned when
        no catalogue title is close enough to ``title``.

    Notes
    -----
    Prints the matched title, or a message when nothing matches.
    """
    titles = df['title'].tolist()

    # Find close matches (up to 5 similar titles)
    close_matches = difflib.get_close_matches(title, titles, n=5, cutoff=0.6)

    if not close_matches:
        print(f"No close match found for '{title}'.")
        return []

    # Work with row positions rather than index labels: both cosine_sim and
    # .iloc are positional, so this stays correct even if df's index is not
    # the default 0..n-1 range.
    candidate_positions = np.flatnonzero(df['title'].isin(close_matches).to_numpy())
    candidate_ratings = df['ratings_count'].to_numpy()[candidate_positions]

    # Use the most-rated matching row itself as the reference book. Looking the
    # title up again would pick the first row sharing that title, which may be
    # a different edition than the one that won on ratings_count.
    idx = int(candidate_positions[candidate_ratings.argmax()])
    best_match = df['title'].iloc[idx]

    print(f"Found best match: '{best_match}'")

    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending order: rows with equal scores keep their original order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the reference book explicitly. It is not guaranteed to sort first
    # when another row has the same score, so slicing off position 0 could
    # remove a genuine neighbour and keep the query book in the results.
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    return df[['title', 'authors', 'publisher']].iloc[ranked]



In [36]:
# Example query using a partial title (no series suffix), which exercises the
# fuzzy title matching inside get_recommendations.
recommended_books = get_recommendations("Harry Potter and the Half-Blood Prince")
# The result is a DataFrame of title/authors/publisher, or an empty list when
# no close title match is found.
print(recommended_books)

Found best match: 'Harry Potter and the Half-Blood Prince (Harry Potter  #6)'
                                                  title  \
615   Harry Potter and the Half-Blood Prince (Harry ...   
1     Harry Potter and the Order of the Phoenix (Har...   
3     Harry Potter and the Prisoner of Azkaban (Harr...   
4415  Harry Potter and the Chamber of Secrets (Harry...   
4     Harry Potter Boxed Set  Books 1-5 (Harry Potte...   

                         authors                                 publisher  
615                 J.K. Rowling                     Bloomsbury Publishing  
1     J.K. Rowling/Mary GrandPré                           Scholastic Inc.  
3     J.K. Rowling/Mary GrandPré                           Scholastic Inc.  
4415  J.K. Rowling/Mary GrandPré  Arthur A. Levine Books / Scholastic Inc.  
4     J.K. Rowling/Mary GrandPré                                Scholastic  
