## Recommendation using TF-IDF and Cosine Similarity

In [1]:
import os
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Directory the input CSV files ("books_cleaned.csv", "books.csv") are read from.
DATASET_DIR = "dataset"
# Directory the cosine-similarity matrix ("cosine_sim.npy") is written to.
MODEL_DIR = "model_assets"

### 1. Load the cleaned data and full metadata

In [2]:
# Reset both frames up front so that a failed load leaves an explicit None
# instead of an unbound name (fresh kernel) or a stale frame left over from an
# earlier run of this cell.
df_recommender = None
df_full_metadata = None

# Cleaned dataset: the frame the TF-IDF features are built from.
try:
    df_recommender = pd.read_csv(os.path.join(DATASET_DIR, "books_cleaned.csv"))
    print("Cleaned Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: Dataset file not found.")

# Full metadata: only attempted once the cleaned dataset is available, and
# allowed to be missing (df_full_metadata then stays None).
if df_recommender is not None:
    try:
        df_full_metadata = pd.read_csv(os.path.join(DATASET_DIR, "books.csv"))
        print("Full Metadata Dataset loaded successfully.")
    except FileNotFoundError:
        print("Error: Full metadata dataset file not found.")

Cleaned Dataset loaded successfully.
Full Metadata Dataset loaded successfully.


### 2. Calculate TF-IDF 

In [3]:
# Vectorizer configuration: English stop words removed, vocabulary capped at
# 10,000 terms, and very rare / very common terms dropped.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
    min_df=5,  # words must appear in at least 5 books
    max_df=0.85  # ignore words that appear in more than 85% of the books
)

print("Starting TF-IDF vectorization...")
# Empty strings written to CSV are read back as NaN, and TfidfVectorizer raises
# on NaN documents, so missing feature text is replaced with an empty document.
tfidf_matrix = tfidf.fit_transform(df_recommender['combined_features'].fillna(''))

print("TF-IDF matrix shape:", tfidf_matrix.shape)


Starting TF-IDF vectorization...
TF-IDF matrix shape: (6810, 8419)


### 3. Similarity Calculation

In [4]:
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

print("Cosine similarity matrix shape:", cosine_sim.shape)

# np.save does not create missing parent directories, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(MODEL_DIR, exist_ok=True)
output_similarity_path = os.path.join(MODEL_DIR, "cosine_sim.npy")
np.save(output_similarity_path, cosine_sim)
print(f"Cosine similarity matrix saved to {output_similarity_path}")




Cosine similarity matrix shape: (6810, 6810)
Cosine similarity matrix saved to model_assets/cosine_sim.npy


### 4. Recommendation

In [5]:
def get_recommendations(title, cosine_sim, df_recommender, df_full_metadata, N=10, print_output=True):
    """Return the N books most similar to ``title`` and optionally print them.

    Parameters
    ----------
    title : str
        Exact title to look up in ``df_recommender['title']``. When several
        rows share the title, the first one is used.
    cosine_sim : array-like
        Square similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row (by position) of ``df_recommender``.
    df_recommender : pandas.DataFrame
        Frame with at least ``title`` and ``isbn13`` columns.
    df_full_metadata : pandas.DataFrame or None
        Frame with ``isbn13`` plus the display columns (``title``,
        ``subtitle``, ``authors``, ``categories``, ``published_year``,
        ``average_rating``). If None, the display columns are taken from
        ``df_recommender`` and any it lacks are left empty.
    N : int, default 10
        Maximum number of recommendations. Values <= 0 yield an empty result.
    print_output : bool, default True
        Whether to print a formatted listing of the recommendations.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation, ordered by descending similarity, with
        ``isbn13``, ``Similarity Score`` and the display columns. An empty
        DataFrame is returned when the title is unknown or there is nothing
        to recommend.
    """
    # Map each title to its row *position* (not its index label), because both
    # cosine_sim and .iloc below are positional. Duplicate titles are collapsed
    # to their first occurrence so the lookup always yields a single integer.
    positions = pd.Series(np.arange(len(df_recommender)), index=df_recommender['title'])
    positions = positions[~positions.index.duplicated(keep='first')]

    # Check if the given title exists in the dataset
    if title not in positions.index:
        print(f"Error: Book title '{title}' not found.")
        return pd.DataFrame()

    # Position of the selected book
    idx = int(positions.loc[title])

    # Rank all books by descending similarity. A stable sort keeps ties in
    # their original row order. The query book is removed explicitly rather
    # than by assuming it sorts first, which fails when another book ties it.
    scores = np.asarray(cosine_sim[idx], dtype=float)
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[ranked != idx][:max(int(N), 0)]

    if ranked.size == 0:
        return pd.DataFrame()

    recs = df_recommender.iloc[ranked][['isbn13']].copy()
    recs['Similarity Score'] = scores[ranked]

    display_cols = [
        'title', 'subtitle', 'authors',
        'categories', 'published_year', 'average_rating'
    ]

    if df_full_metadata is None:
        # No metadata frame available: fall back to the recommender frame and
        # let reindex create (empty) columns for anything it does not carry.
        metadata = df_recommender.reindex(columns=['isbn13'] + display_cols)
    else:
        metadata = df_full_metadata[['isbn13'] + display_cols]

    # Remove duplicate ISBNs so the left merge cannot multiply rows
    metadata_unique = metadata.drop_duplicates('isbn13')

    final_df = recs.merge(
        metadata_unique,
        on='isbn13',
        how='left'
    ).fillna({
        'subtitle': '',
        'authors': '',
        'categories': ''
    })

    if print_output:
        print(f"\nTop {len(final_df)} Recommendations for: **{title}**")
        print("-" * 50)

        for rank, (_, row) in enumerate(final_df.iterrows(), start=1):
            print(f"{rank}. **{row['title']}**")

            # Print subtitle only if available
            if row['subtitle']:
                print(f"    ({row['subtitle']})")

            # Clean category formatting and show authors. The strip is done
            # outside the f-string because backslashes inside f-string
            # expressions are a SyntaxError before Python 3.12.
            categories = str(row['categories']).strip('[]"\'')
            print(f"    {categories}, by {row['authors']}")

            # Show rating only if present
            if not pd.isna(row['average_rating']):
                print(f"    ⭐ {row['average_rating']:.2f}")

            # Show similarity score
            print(f"    (Similarity: {row['Similarity Score']:.4f})\n")

    # Return final recommendations as a clean DataFrame
    return final_df.reset_index(drop=True)


In [7]:
# Example usage: print and collect the top 10 recommendations for "Gilead".
# NOTE(review): the variable name is misspelled ("recommedation"); it is left
# unchanged here in case later cells reference it.
recommedation_df = get_recommendations("Gilead", cosine_sim, df_recommender, df_full_metadata, N=10, print_output=True)


Top 10 Recommendations for: **Gilead**
--------------------------------------------------
1. **The Son Avenger**
    Fiction, by Sigrid Undset
    ⭐ 4.39
    (Similarity: 0.1649)

2. **The Martians**
    Fiction, by Kim Stanley Robinson
    ⭐ 3.56
    (Similarity: 0.1559)

3. **Four Baboons Adoring the Sun**
    Drama, by John Guare
    ⭐ 4.00
    (Similarity: 0.1544)

4. **The Deep End of the Ocean**
    Fiction, by Jacquelyn Mitchard
    ⭐ 3.86
    (Similarity: 0.1493)

5. **The Languages of Pao**
    Fiction, by Jack Vance
    ⭐ 3.80
    (Similarity: 0.1444)

6. **Robinson Crusoe**
    Fiction, by Daniel Defoe
    ⭐ 3.67
    (Similarity: 0.1441)

7. **Martin and John**
    (A Novel)
    Fiction, by Dale Peck
    ⭐ 3.73
    (Similarity: 0.1433)

8. **Song of Solomon**
    Fiction, by Toni Morrison
    ⭐ 4.04
    (Similarity: 0.1421)

9. **Children of the Alley**
    Fiction, by Najīb Maḥfūẓ
    ⭐ 4.10
    (Similarity: 0.1416)

10. **Elect Mr. Robinson for a Better World**
    Fictio