In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import plotly.express as px
from rapidfuzz import process

# --- Data Loading and Preprocessing ---

# Load the data
data = pd.read_csv(r"C:\Users\NIRMAL\Downloads\books_data.csv")

# Show first 5 rows and info
print("First 5 rows of the dataset:")
print(data.head())
print("\nDataset Info:")
data.info()

# Drop rows where both title and authors are missing
data = data.dropna(subset=['title', 'authors'], how='all')

# Convert average_rating to numeric
data['average_rating'] = pd.to_numeric(data['average_rating'], errors='coerce')

# Combine title and authors for content
data['book_content'] = data['title'].fillna('') + ' ' + data['authors'].fillna('')

# --- Data Visualization ---

# Plot histogram of average ratings
fig = px.histogram(data, x='average_rating', nbins=30, title='Distribution of Average Ratings')
fig.update_xaxes(title_text='Average Rating')
fig.update_yaxes(title_text='Frequency')
fig.show()

# Plot top 10 authors
top_authors = data['authors'].value_counts().head(10)
fig = px.bar(top_authors, x=top_authors.values, y=top_authors.index, orientation='h',
             labels={'x': 'Number of Books', 'y': 'Author'},
             title='Number of Books per Author')
fig.show()

# --- TF-IDF Vectorization (with feature limit for speed) ---

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['book_content'])

# --- Flexible Search Functions ---

def multi_field_search(query, data):
    """Case-insensitive, partial match in title or authors."""
    mask = (
        data['title'].str.lower().str.contains(query.lower()) |
        data['authors'].str.lower().str.contains(query.lower())
    )
    return data[mask]

def fuzzy_search(query, data, limit=3):
    """Fuzzy match in titles and authors, returns best matches."""
    choices = (data['title'] + ' ' + data['authors']).tolist()
    results = process.extract(query, choices, limit=limit, score_cutoff=60)
    matched_indices = [idx for _, _, idx in results]
    return data.iloc[matched_indices]

def find_best_match(query, data):
    """Try multi-field search, fallback to fuzzy search if no results."""
    matches = multi_field_search(query, data)
    if not matches.empty:
        return matches
    # Fallback to fuzzy search
    matches = fuzzy_search(query, data)
    return matches

# --- Recommendation Function (on-demand similarity calculation) ---

def recommend_books(book_query, tfidf_matrix=tfidf_matrix, data=data, vectorizer=tfidf_vectorizer):
    matches = find_best_match(book_query, data)
    if matches.empty:
        return f"❌ No books found matching '{book_query}'."
    idx = matches.index[0]
    query_vec = tfidf_matrix[idx]
    cosine_similarities = linear_kernel(query_vec, tfidf_matrix).flatten()
    sim_scores = list(enumerate(cosine_similarities))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Exclude the book itself
    book_indices = [i[0] for i in sim_scores]
    recommendations = data.iloc[book_indices][['title', 'authors', 'average_rating']]
    return recommendations, matches.iloc[0]['title']

# --- Main Execution ---

if __name__ == "__main__":
    book_query = input("Enter a book title or author: ")
    result = recommend_books(book_query)

    if isinstance(result, str):
        print(result)
    else:
        recommended_books, matched_title = result
        print(f"\n📚 Books similar to '{matched_title}':\n")
        for i, row in recommended_books.iterrows():
            print(f"{i+1}. {row['title']} by {row['authors']} (Rating: {row['average_rating']})")


First 5 rows of the dataset:
   bookID                                              title  \
0       1  Harry Potter and the Half-Blood Prince (Harry ...   
1       2  Harry Potter and the Order of the Phoenix (Har...   
2       4  Harry Potter and the Chamber of Secrets (Harry...   
3       5  Harry Potter and the Prisoner of Azkaban (Harr...   
4       8  Harry Potter Boxed Set  Books 1-5 (Harry Potte...   

                      authors average_rating  
0  J.K. Rowling/Mary GrandPré           4.57  
1  J.K. Rowling/Mary GrandPré           4.49  
2                J.K. Rowling           4.42  
3  J.K. Rowling/Mary GrandPré           4.56  
4  J.K. Rowling/Mary GrandPré           4.78  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   bookID          11127 non-null  int64 
 1   title           11127 non-null  object
 2   


📚 Books similar to 'Harry Potter and the Half-Blood Prince (Harry Potter  #6)':

616. Harry Potter and the Half-Blood Prince (Harry Potter  #6) by J.K. Rowling (Rating: 4.57)
8877. Harry Potter and the Sorcerer's Stone (Harry Potter  #1) by J.K. Rowling/Mary GrandPré (Rating: 4.47)
4417. Harry Potter and the Chamber of Secrets (Harry Potter  #2) by J.K. Rowling/Mary GrandPré (Rating: 4.42)
4. Harry Potter and the Prisoner of Azkaban (Harry Potter  #3) by J.K. Rowling/Mary GrandPré (Rating: 4.56)
10680. Harry Potter and the Goblet of Fire (Harry Potter  #4) by J.K. Rowling (Rating: 4.56)
2. Harry Potter and the Order of the Phoenix (Harry Potter  #5) by J.K. Rowling/Mary GrandPré (Rating: 4.49)
5. Harry Potter Boxed Set  Books 1-5 (Harry Potter  #1-5) by J.K. Rowling/Mary GrandPré (Rating: 4.78)
989. Harry Potter Y La Piedra Filosofal (Harry Potter  #1) by J.K. Rowling (Rating: 4.47)
10679. Harry Potter and the Philosopher's Stone (Harry Potter  #1) by J.K. Rowling (Rating: 4.47)
3. Ha