# Embeddings challenge


In [None]:
# Book Recommender - Student Version
# Final Project — Embeddings and Semantic Understanding

from faker import Faker
import random
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# cosine_similarity (imported above) is required: recommend_books uses it to rank books.

# === NOTES FOR ALL PLATFORMS ===
# - If you are on Windows, make sure you install dependencies with:
#     pip install pandas faker sentence-transformers scikit-learn
#
# - If you're on Mac and using Apple Silicon (M1/M2/M3), you may face issues
#   if your model runs on 'mps' (Metal Performance Shaders).
#   This can break compatibility with sklearn. If that happens, use .cpu().numpy()
#   or just force the model to run on CPU by passing the device explicitly:
#       model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
#   and avoid `.to("mps")` or `.to("cuda")`.

# ---------------------
# STEP 1. Generate Users
# ---------------------

def generate_users(num_users=50, seed=42):
    """Build a DataFrame of synthetic users with profiles, reviews and ratings.

    The original version seeded only the global ``random`` module, so the
    review counts and ratings were repeatable while every Faker-generated
    text (profile, reviews) changed from run to run. Both sources of
    randomness are now driven by ``seed``; the review counts and ratings
    produced for the default seed are the same as before.

    Args:
        num_users: Number of user rows to generate.
        seed: Seed for both the Faker instance and the private random
            generator. Pass ``None`` for non-reproducible data.

    Returns:
        pandas.DataFrame with one row per user and the columns ``user_id``
        (``"u000"``, ``"u001"``, ...), ``profile`` (str), ``reviews``
        (list of 2-4 sentences) and ``ratings`` (list of ints in 1-5, one
        per review).
    """
    faker = Faker()
    if seed is not None:
        # Seed this Faker instance only, so other Faker users are unaffected.
        faker.seed_instance(seed)
    # A private generator avoids reseeding the process-wide `random` module.
    rng = random.Random(seed)

    users = []
    for i in range(num_users):
        profile = faker.job() + " who enjoys " + faker.word()
        review_count = rng.randint(2, 4)
        reviews = [faker.sentence(nb_words=10) for _ in range(review_count)]
        ratings = [rng.randint(1, 5) for _ in reviews]
        users.append({
            "user_id": f"u{i:03d}",
            "profile": profile,
            "reviews": reviews,
            "ratings": ratings,
        })
    return pd.DataFrame(users)

# ---------------------
# STEP 2. Generate Books
# ---------------------

def generate_books(num_books=100):
    """Build a DataFrame of synthetic books.

    Args:
        num_books: Number of book rows to generate.

    Returns:
        pandas.DataFrame with one row per book and the columns ``book_id``
        (``"b000"``, ``"b001"``, ...), ``title`` (a short Faker sentence
        with trailing periods stripped) and ``description`` (a Faker
        paragraph).
    """
    fake = Faker()
    catalog = [
        {
            "book_id": f"b{number:03d}",
            # Titles read better without the sentence-ending period.
            "title": fake.sentence(nb_words=4).rstrip('.'),
            "description": fake.paragraph(nb_sentences=3),
        }
        for number in range(num_books)
    ]
    return pd.DataFrame(catalog)

# ---------------------
# STEP 3. Save Data to CSV (Optional for viewing)
# ---------------------

def save_datasets(users, books):
    """Write both datasets to CSV files in the current working directory.

    Args:
        users: DataFrame written to ``users.csv``.
        books: DataFrame written to ``books.csv``.

    The row index is not written to either file.
    """
    targets = ((users, "users.csv"), (books, "books.csv"))
    for frame, path in targets:
        frame.to_csv(path, index=False)

# ---------------------
# STEP 4. Recommender Logic (TO COMPLETE)
# ---------------------

def recommend_books(user_df, book_df, model, top_k=3):
    """Recommend the ``top_k`` most semantically similar books for each user.

    Every book is represented by its title followed by its description, and
    every user by their profile followed by all of their reviews. Both sets
    of texts are embedded with ``model`` and, for each user, the books are
    ranked by cosine similarity to that user's embedding.

    Args:
        user_df: DataFrame with the columns ``user_id``, ``profile`` (str)
            and ``reviews`` (an iterable of str per row).
        book_df: DataFrame with the columns ``book_id``, ``title`` and
            ``description``.
        model: Embedding model exposing
            ``encode(texts, convert_to_numpy=True)``, e.g. a
            ``SentenceTransformer``.
        top_k: Maximum number of books to recommend per user.

    Returns:
        dict mapping each ``user_id`` to a list of ``book_id`` values,
        most similar first. An empty dict is returned when there are no
        users; every user maps to an empty list when there are no books.
    """
    user_ids = user_df["user_id"].tolist()
    if not user_ids:
        return {}
    if len(book_df) == 0:
        # Nothing to rank; skip the model so an empty catalogue is not an error.
        return {user_id: [] for user_id in user_ids}

    book_ids = book_df["book_id"].to_numpy()
    book_texts = (book_df["title"] + " " + book_df["description"]).tolist()
    user_texts = [
        profile + " " + " ".join(reviews)
        for profile, reviews in zip(user_df["profile"], user_df["reviews"])
    ]

    # Ask encode() for NumPy output directly instead of converting a tensor
    # by hand; scikit-learn needs NumPy arrays. All users are embedded in
    # one batched call rather than one model call per user.
    book_embeddings = model.encode(book_texts, convert_to_numpy=True)
    user_embeddings = model.encode(user_texts, convert_to_numpy=True)

    # similarities[u, b] is the cosine similarity between user u and book b.
    similarities = cosine_similarity(user_embeddings, book_embeddings)

    # argsort is ascending, so reverse each row to put the best match first.
    ranked = np.argsort(similarities, axis=1)[:, ::-1][:, :top_k]

    return {
        user_id: book_ids[row].tolist()
        for user_id, row in zip(user_ids, ranked)
    }

# ---------------------
# STEP 5. Main Run
# ---------------------

def main():
    """Run the demo end to end.

    Generates the synthetic users and books, saves them as CSV, loads the
    ``all-MiniLM-L6-v2`` sentence-embedding model, computes recommendations
    and prints the first five users' results.
    """
    print("Generating synthetic datasets...")
    user_frame, book_frame = generate_users(), generate_books()
    save_datasets(user_frame, book_frame)

    print("Loading sentence embedding model...")
    encoder = SentenceTransformer("all-MiniLM-L6-v2")

    print("Generating recommendations...\n")
    results = recommend_books(user_frame, book_frame, encoder)

    print("Sample output:")
    # Only a short preview is printed; the full mapping stays in `results`.
    preview = list(results.items())[:5]
    for user_id, book_ids in preview:
        print(f"User {user_id} → Recommended Books: {book_ids}")

# Run the demo only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Generating synthetic datasets...
Loading sentence embedding model...
Generating recommendations...

Sample output:
User u000 → Recommended Books: ['b084', 'b055', 'b049']
User u001 → Recommended Books: ['b090', 'b040', 'b086']
User u002 → Recommended Books: ['b070', 'b075', 'b061']
User u003 → Recommended Books: ['b013', 'b096', 'b048']
User u004 → Recommended Books: ['b089', 'b018', 'b012']
