In [None]:
import pandas as pd
import numpy as np
import re
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
DATA_PATH = "../backend/app/data_ingestion/sample_data.csv"
# Read the CSV and replace every missing value with an empty string in one chain
df = pd.read_csv(DATA_PATH).fillna("")

# Clean categories
def safe_literal_eval(val):
    """Parse a stringified Python list literal, returning [] when that is not possible.

    Args:
        val: Raw cell value. Anything that is not a string, or a string that
            does not start with '[' once surrounding whitespace is removed,
            is treated as "no list".

    Returns:
        The parsed list, or an empty list when the value is not a list
        literal or cannot be evaluated.
    """
    if not isinstance(val, str):
        return []

    # Tolerate padding around the literal (e.g. " ['a', 'b']") instead of discarding it
    text = val.strip()
    if not text.startswith('['):
        return []

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed input,
        # e.g. TypeError for an unhashable dict key nested in the list
        return []

    # Keep the "always a list" contract even if the literal evaluates to something else
    return parsed if isinstance(parsed, list) else []

# Parse the stringified category lists; values that cannot be parsed become []
df['categories_clean'] = df['categories'].apply(safe_literal_eval)

# --- Create a combined text 'corpus' for NLP tasks ---
# Parsed category entries are not guaranteed to be strings (e.g. "[1, 2]"),
# so coerce each entry before joining; str.join raises TypeError otherwise.
categories_text = df['categories_clean'].apply(lambda x: ' '.join(map(str, x)))

df['corpus'] = df['title'] + " " + \
               df['description'] + " " + \
               df['brand'] + " " + \
               df['material'] + " " + \
               df['color'] + " " + \
               categories_text

print(f"Data loaded. Total items: {len(df)}")
# Show the first row by position: df['corpus'][0] is a label lookup and fails
# when label 0 is absent (for example on an empty frame).
if len(df):
    print("\nExample corpus:")
    print(df['corpus'].iloc[0])

In [None]:
# 1. Build the TF-IDF vectorizer (English stop words, max_features=5000)
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# 2. Fit the vectorizer on the corpus and vectorize every product in one call
tfidf_matrix = tfidf_vectorizer.fit_transform(df['corpus'])

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print("This matrix represents each product as a vector of keyword scores.")

In [None]:
# 3. Compute the Cosine Similarity matrix
# With a single argument, cosine_similarity compares the rows of the matrix
# against themselves, i.e. every product against every other product.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

print(f"Cosine similarity matrix shape: {cosine_sim_matrix.shape}")
print("This matrix shows the similarity between every pair of products (0 to 1).")

In [None]:
# 4. Create a function to get recommendations
# Lookup from uniq_id to the product's row label in df.
# Series.drop_duplicates() would de-duplicate the *values* (row labels, which
# are already unique) and leave repeated uniq_id labels in place, so drop
# repeated index labels explicitly and keep the first occurrence of each id.
indices = pd.Series(df.index, index=df['uniq_id'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_tfidf_recommendations(uniq_id, top_k=3):
    """Return the top_k products most similar to ``uniq_id`` by TF-IDF cosine similarity.

    Reads the notebook-level ``df``, ``indices`` and ``cosine_sim_matrix``.

    Args:
        uniq_id: Product identifier to look up in ``indices``.
        top_k: Maximum number of recommendations to return; values <= 0
            produce an empty result.

    Returns:
        A DataFrame with ``title``, ``uniq_id`` and ``similarity_score``
        columns ordered from most to least similar, never containing the
        query product itself. When ``uniq_id`` is unknown, an error string
        is returned instead.
    """
    if uniq_id not in indices:
        return f"Error: uniq_id {uniq_id} not found."

    # Row of the query product in the similarity matrix
    # NOTE(review): assumes df keeps a default 0..n-1 index so labels equal row positions
    idx = indices[uniq_id]
    if isinstance(idx, pd.Series):
        # A repeated uniq_id label yields a Series; fall back to the first occurrence
        idx = idx.iloc[0]

    # Pair every product position with its similarity to the query, best first
    sim_scores = sorted(enumerate(cosine_sim_matrix[idx]), key=lambda x: x[1], reverse=True)

    # Remove the query item by position rather than assuming it sorts first:
    # an identical product with a lower position ties at the same score and
    # would otherwise push the query item into its own recommendations.
    candidates = [pair for pair in sim_scores if pair[0] != idx]
    top = candidates[:max(top_k, 0)]

    top_product_indices = [position for position, _ in top]
    top_product_scores = [score for _, score in top]

    # Return the titles and scores
    return df[['title', 'uniq_id']].iloc[top_product_indices].assign(similarity_score=top_product_scores)

In [None]:
# Test Item 1: The 'Modern Velvet Accent Chair'
test_id_1 = 'a1b2c3d4-0001'
# Look up the title of the query product for the header line
title_1 = df.loc[df['uniq_id'] == test_id_1, 'title'].values[0]
print(f"--- Recommendations for: {title_1} ---")
print(get_tfidf_recommendations(test_id_1, top_k=3))

In [None]:
# Test Item 2: The 'L-Shaped Computer Desk'
test_id_2 = 'a1b2c3d4-0005'
# Look up the title of the query product for the header line
title_2 = df.loc[df['uniq_id'] == test_id_2, 'title'].values[0]
print(f"--- Recommendations for: {title_2} ---")
print(get_tfidf_recommendations(test_id_2, top_k=3))
print("\nEvaluation: This model recommends items based on shared keywords.")

In [None]:
# 1. Load the embedding model (this is the same one used in the API)
print("Loading embedding model... This might take a moment.")
embedding_model_name = 'all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(embedding_model_name)
print("Model loaded.")

In [None]:
# 2. Generate embeddings for all product corpuses
print("Generating embeddings for all products...")
# Pass the corpus to the encoder as a plain Python list
corpus_texts = df['corpus'].tolist()
corpus_embeddings = embedding_model.encode(corpus_texts, show_progress_bar=True)
print(f"Embeddings generated. Shape: {corpus_embeddings.shape}")
print("These are semantic vectors that understand meaning, not just keywords.")

In [None]:
# 3. Run K-Means Clustering
# For this small dataset, we'll pick k=3 (e.g., 'Chairs', 'Tables/Desks', 'Storage')
num_clusters = 3
# fit() returns the fitted estimator, so construction and fitting can be chained
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42).fit(corpus_embeddings)

# Store each product's cluster label alongside it in the DataFrame
df['cluster'] = kmeans.labels_

print("Clustering complete. Cluster labels added to DataFrame.")
df[['title', 'cluster']].head()

In [None]:
# Project the embeddings down to two PCA components so they can be plotted
pca = PCA(random_state=42, n_components=2)
embeddings_2d = pca.fit_transform(corpus_embeddings)

# Transposing gives one row per component: first the x values, then the y values
df['pca_x'], df['pca_y'] = embeddings_2d.T

# Scatter the products in PCA space, one colour per cluster label
cluster_palette = sns.color_palette("hsv", n_colors=num_clusters)
plt.figure(figsize=(12, 8))
sns.scatterplot(
    data=df,
    x='pca_x',
    y='pca_y',
    hue='cluster',
    palette=cluster_palette,
    alpha=0.8,
    legend="full",
)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('Product Clusters (PCA Visualization)')
plt.show()

print("Evaluation: This plot shows how K-Means has grouped semantically similar items.")

In [None]:
# List the product titles assigned to each cluster, one markdown table per cluster
for cluster_id in range(num_clusters):
    print(f"\n--- Cluster {cluster_id} ---")
    titles_in_cluster = df.loc[df['cluster'] == cluster_id, 'title']
    print(titles_in_cluster.to_markdown(index=False))

print("\nEvaluation: The clusters successfully group items by their semantic meaning (e.g., chairs, tables, storage).")