In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
df = pd.read_excel('Online Retail.xlsx')

# Select a smaller subset (first 100 rows) for testing
df = df.head(1000)

# Convert all descriptions to strings explicitly
df['Description'] = df['Description'].apply(lambda x: str(x) if isinstance(x, str) else str(x))

# Handle missing descriptions by replacing them with an empty string
df['Description'] = df['Description'].fillna('')

# Ensure the 'Description' column contains only strings
df['Description'] = df['Description'].astype(str)

# TF-IDF Vectorization of product descriptions
tfidf = TfidfVectorizer(stop_words='english')

# Transform the product descriptions into numerical vectors (sparse matrix format)
tfidf_matrix = tfidf.fit_transform(df['Description'])

# Calculate cosine similarity using sparse matrix operations (avoid dense matrix conversion)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a DataFrame for easier access to product names (descriptions)
df_cosine_sim = pd.DataFrame(cosine_sim, index=df['Description'], columns=df['Description'])

# Function to get recommendations based on product description
def get_recommendations(product_description, cosine_sim_matrix, df):
    # Get the index of the given product description
    idx = df[df['Description'] == product_description].index[0]
    
    # Get the similarity scores for the given product
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    
    # Sort the products based on similarity scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Exclude the product itself from the recommendations (the first result is always the product itself)
    sim_scores = sim_scores[1:]  # Exclude the product itself
    
    # Get product indices and their corresponding similarity scores
    product_indices = [i[0] for i in sim_scores]
    scores = [i[1] for i in sim_scores]
    
    # Get product descriptions for the most similar products
    recommended_products = df.iloc[product_indices]['Description']
    
    # Remove duplicates and ensure we get only the top 5 unique recommendations
    recommended_products = recommended_products[recommended_products != product_description].drop_duplicates().reset_index(drop=True)
    
    # If there are more than 5 recommendations, keep only the top 5
    recommended_products = recommended_products.head(5)
    scores = scores[:len(recommended_products)]
    
    return recommended_products, scores

# Example usage: Get recommendations for a product
product_description = 'WHITE HANGING HEART T-LIGHT HOLDER'  # Example product

recommended_products, scores = get_recommendations(product_description, cosine_sim, df)

# Display the recommended products and their similarity scores
print(f"Recommendations for '{product_description}':")
for product, score in zip(recommended_products, scores):
    print(f"{product} - Similarity Score: {score:.4f}")


Recommendations for 'WHITE HANGING HEART T-LIGHT HOLDER':
RED HANGING HEART T-LIGHT HOLDER - Similarity Score: 1.0000
HEART T-LIGHT HOLDER  - Similarity Score: 1.0000
HANGING HEART ZINC T-LIGHT HOLDER - Similarity Score: 1.0000
SILVER HANGING T-LIGHT HOLDER - Similarity Score: 1.0000
COLOUR GLASS T-LIGHT HOLDER HANGING - Similarity Score: 1.0000
