In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Read the cleaned book catalogue into a DataFrame and preview the first rows.
# NOTE: this is an absolute, machine-specific Windows path; the notebook will
# only run as-is on a machine where this exact file location exists.
books_csv_path = r"C:\Users\sheri\Desktop\Ironhack\book_recommender_project\data\clean\books.csv"
df = pd.read_csv(books_csv_path)
df.head()


Unnamed: 0,uid,title,author,genre,description,image_url
0,a897fe39b1053632,A Light in the Attic,Author Unknown,Poetry,It's hard to imagine a world without A Light i...,https://books.toscrape.com/media/cache/fe/72/f...
1,90fa61229261140a,Tipping the Velvet,Author Unknown,Historical Fiction,"""Erotic and absorbing...Written with starling ...",https://books.toscrape.com/media/cache/08/e9/0...
2,6957f44c3847a760,Soumission,Author Unknown,Fiction,"Dans une France assez proche de la nôtre, un h...",https://books.toscrape.com/media/cache/ee/cf/e...
3,e00eb4fd7b871a48,Sharp Objects,Author Unknown,Mystery,"WICKED above her hipbone, GIRL across her hear...",https://books.toscrape.com/media/cache/c0/59/c...
4,4165285e1663650f,Sapiens: A Brief History of Humankind,Author Unknown,History,From a renowned historian comes a groundbreaki...,https://books.toscrape.com/media/cache/ce/5f/c...


In [3]:
# Combine the descriptive fields into one text string per book; this string
# is what the TF-IDF vectoriser consumes.
# Missing values are replaced with "" *before* the str cast: astype(str) on
# its own turns NaN into the literal token "nan", which would then be indexed
# as a real word shared by every book with a missing field.
df["features"] = (
    df["title"].fillna("").astype(str) + " "
    + df["author"].fillna("").astype(str) + " "
    + df["genre"].fillna("").astype(str) + " "
    + df["description"].fillna("").astype(str)
)


In [4]:

# Turn the combined text into a TF-IDF matrix (English stop words removed,
# vocabulary capped at 5000 terms). X has one row per row of df.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X = vectorizer.fit_transform(df["features"])

# Group the books into k clusters with a seeded KMeans, then store each
# book's cluster label next to it in the DataFrame.
k = 10
kmeans = KMeans(random_state=42, n_clusters=k)
kmeans.fit(X)
df["cluster"] = kmeans.labels_


In [6]:
def recommend_books(book_title, top_n=10):
    """Recommend books similar to the first title containing ``book_title``.

    Relies on the notebook-level ``df`` (with ``title``, ``author``, ``genre``,
    ``image_url`` and ``cluster`` columns) and on ``X``, the TF-IDF matrix
    whose rows are in the same order as the rows of ``df``.

    Parameters
    ----------
    book_title : str
        Search text. Matched case-insensitively as a literal substring of the
        title (not as a regular expression). Surrounding whitespace is ignored.
    top_n : int or None, default 10
        Maximum number of recommendations. Negative values are treated as 0;
        ``None`` returns every other book in the cluster.

    Returns
    -------
    pandas.DataFrame
        The ``title``, ``author``, ``genre`` and ``image_url`` columns of the
        most similar books from the same cluster, most similar first, with
        the matched book itself excluded. An empty DataFrame is returned when
        nothing matches (or the search text is empty).

    Notes
    -----
    Prints which title was matched, or a "not found" message.
    """
    query = "" if book_title is None else str(book_title).strip().lower()

    if query:
        # regex=False: the search text is user input, so treat it literally.
        # As a regex, queries such as "C++" or "(Harry" raise or mis-match.
        hit_mask = (
            df["title"].str.lower()
            .str.contains(query, regex=False, na=False)
            .to_numpy(dtype=bool)
        )
        hit_positions = hit_mask.nonzero()[0]
    else:
        # An empty search string is a substring of every title; treat it as
        # "no match" rather than silently recommending from the first book.
        hit_positions = []

    if len(hit_positions) == 0:
        print(f"Book '{book_title}' not found in dataset.")
        return pd.DataFrame()

    # Just take the first match. Work with row *positions* from here on:
    # X is a positional matrix, so using index labels against it (or against
    # df.iloc) is only correct when df happens to have a default RangeIndex.
    pos = int(hit_positions[0])
    found_title = df["title"].iloc[pos]
    print(f"Found match: '{found_title}'")

    # Positions of every book in the same cluster as the matched book
    cluster_labels = df["cluster"].to_numpy()
    cluster_positions = (cluster_labels == cluster_labels[pos]).nonzero()[0]

    # Cosine similarity between the matched book and each cluster member
    sims = cosine_similarity(X[pos], X[cluster_positions]).flatten()

    # Most similar first, then drop the input book itself
    ranked = cluster_positions[sims.argsort()[::-1]]
    ranked = ranked[ranked != pos]

    # A negative top_n would otherwise slice from the end of the ranking
    limit = None if top_n is None else max(top_n, 0)
    return df.iloc[ranked[:limit]][["title", "author", "genre", "image_url"]]


recommend_books("harry potter", 6)

Found match: 'Harry Potter and the Deathly Hallows (Harry Potter #7)'


Unnamed: 0,title,author,genre,image_url
1003,Harry Potter,J. K. Rowling,"Fantasy fiction, English",http://books.google.com/books/content?id=18LKP...
1000,Harry Potter and the Deathly Hallows,J.K. Rowling,Juvenile Fiction,http://books.google.com/books/content?id=gCtaz...
674,Harry Potter and the Half-Blood Prince (Harry ...,Author Unknown,Fantasy,https://books.toscrape.com/media/cache/06/ee/0...
1112,Harry Potter e a Pedra Filosofal,"J. K. Rowling, Lia Wyler","Children's stories, English",http://books.google.com/books/content?id=DqvrP...
1209,Friends and Foes of Harry Potter,"Nikita Agarwal, Chitra Agarwal","Children's stories, English",http://books.google.com/books/content?id=JGQBc...
1039,Harry Potter Boxed Set,J. K. Rowling,Juvenile Fiction,http://books.google.com/books/content?id=QCp6P...


In [None]:
# Launch the Streamlit front end via a shell command. The relative path means
# App.py must be in the notebook's working directory.
# NOTE(review): `streamlit run` presumably starts a long-running server, so
# this cell would not finish until it is interrupted - confirm.
!streamlit run App.py