In [1]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [3]:
# Step 2: Load the dataset
df = pd.read_csv('data.csv', encoding='ISO-8859-1')

In [5]:
# Step 3: Preprocess
df.dropna(subset=['Description'], inplace=True)
df = df.drop_duplicates(subset=['Description'])

In [7]:
# Optional: Reset index
df = df.reset_index(drop=True)

In [9]:
# Step 4: TF-IDF Vectorization on product descriptions
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['Description'])

In [11]:
# Step 5: Compute cosine similarity between items
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [13]:
# Step 6: Create a mapping of product index to name
indices = pd.Series(df.index, index=df['Description']).drop_duplicates()

In [15]:
# Step 7: Recommendation function
def recommend_products(description, cosine_sim=cosine_sim):
    idx = indices.get(description)
    if idx is None:
        return "Product description not found."
    
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]  # Top 5 similar

    product_indices = [i[0] for i in sim_scores]
    return df['Description'].iloc[product_indices]

In [17]:
# Example usage
print("Recommended Products:\n")
print(recommend_products('WHITE HANGING HEART T-LIGHT HOLDER'))

Recommended Products:

4036     PINK HANGING HEART T-LIGHT HOLDER
58        RED HANGING HEART T-LIGHT HOLDER
4221    CREAM HANGING HEART T-LIGHT HOLDER
167      HANGING HEART ZINC T-LIGHT HOLDER
505                  HEART T-LIGHT HOLDER 
Name: Description, dtype: object


In [19]:
from sklearn.metrics import precision_score, recall_score
from tqdm import tqdm

# Step 1: Create user-item mapping from transaction data
user_item_df = df[['CustomerID', 'Description']].dropna()
user_items = user_item_df.groupby('CustomerID')['Description'].apply(set)

In [21]:
# Step 2: Recommend top-K similar items for each product a user interacted with
def get_top_k_similar_items(product, k=5):
    idx = indices.get(product)
    if idx is None:
        return []
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_k = sim_scores[1:k+1]  # exclude self
    return df['Description'].iloc[[i[0] for i in top_k]].tolist()


In [23]:
# Step 3: Evaluate over a subset of users
k = 5
precisions = []
recalls = []
users_sample = user_items.sample(n=100, random_state=42)  # sample users to limit computation

for user, items in tqdm(users_sample.items(), desc="Evaluating"):
    recommended = set()
    ground_truth = set(items)
    
    for item in items:
        recommended.update(get_top_k_similar_items(item, k=k))
    
    recommended = set(recommended) - set([None])
    true_positives = recommended & ground_truth
    
    precision = len(true_positives) / len(recommended) if recommended else 0
    recall = len(true_positives) / len(ground_truth) if ground_truth else 0
    
    precisions.append(precision)
    recalls.append(recall)

Evaluating: 100it [00:01, 81.11it/s]


In [25]:

# Step 4: Print average metrics
avg_precision = sum(precisions) / len(precisions)
avg_recall = sum(recalls) / len(recalls)

print(f"\nAverage Precision@{k}: {avg_precision:.4f}")
print(f"Average Recall@{k}: {avg_recall:.4f}")


Average Precision@5: 0.0385
Average Recall@5: 0.1588
