# Recommendation Systems - Implementation

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## Load the dataset

In [2]:
df = pd.read_csv('data.csv', encoding="ISO-8859-1")
display(df)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France


## Load the cluster information from GMM

In [3]:
customer_info = pd.read_csv('detailed_data_5_clusters.csv')
display(customer_info)

Unnamed: 0,CustomerID,Cluster,Recency,Frequency,Monetary,UniqueItemsPurchased,AvgDaysBetweenPurchases,FavoriteShoppingHour,Is_UK,Cancellation_Frequency,Cancellation_Rate,Total_Spend,Monthly_Spending_Mean,Monthly_Spending_Std,Spending_Trend,FavoriteShoppingDay
0,12346,2,325,2,0.00,1,0.0,10,1,1.0,0.500000,0.00,0.000000,0.000000,0.000000,1
1,12347,0,1,7,4310.00,103,2.0,14,0,0.0,0.000000,4310.00,615.714286,341.070789,4.486071,1
2,12348,0,74,4,1437.24,21,10.0,19,0,0.0,0.000000,1437.24,359.310000,203.875689,-100.884000,3
3,12349,0,18,1,1457.55,72,0.0,9,0,0.0,0.000000,1457.55,1457.550000,0.000000,0.000000,0
4,12350,1,309,1,294.40,16,0.0,16,0,0.0,0.000000,294.40,294.400000,0.000000,0.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4357,18280,1,277,1,180.60,10,0.0,9,1,0.0,0.000000,180.60,180.600000,0.000000,0.000000,0
4358,18281,3,180,1,80.82,7,0.0,10,1,0.0,0.000000,80.82,80.820000,0.000000,0.000000,6
4359,18282,2,7,3,176.60,12,9.0,13,1,1.0,0.333333,176.60,88.300000,14.792674,-20.920000,4
4360,18283,0,3,16,2088.93,262,0.0,14,1,0.0,0.000000,2088.93,208.893000,168.947794,22.437394,3


## Combine the datasets

In [4]:
# Step 2: Ensure consistent data types for CustomerID in both DataFrames
df['CustomerID'] = df['CustomerID'].astype(float)
customer_info['CustomerID'] = customer_info['CustomerID'].astype(float)

# Step 3: Merge transactional data with customer cluster information
merged_data = df.merge(customer_info[['CustomerID', 'Cluster']], on='CustomerID', how='inner')


# Collaborative Filtering

In [5]:
# Step 4: Identify the top 10 best-selling products in each cluster
best_selling_products = (
    merged_data.groupby(['Cluster', 'StockCode', 'Description'])['Quantity']
    .sum()
    .reset_index()
    .sort_values(by=['Cluster', 'Quantity'], ascending=[True, False])
)

# Get the top 10 products for each cluster
top_products_per_cluster = best_selling_products.groupby('Cluster').head(10)

# Step 5: Record products purchased by each customer
customer_purchases = (
    merged_data.groupby(['CustomerID', 'Cluster', 'StockCode'])['Quantity']
    .sum()
    .reset_index()
)

# Step 6: Generate recommendations for each customer
recommendations = []

# Loop through each cluster
for cluster in top_products_per_cluster['Cluster'].unique():
    # Get top products in the current cluster
    top_products = top_products_per_cluster[top_products_per_cluster['Cluster'] == cluster]
    
    # Get customers belonging to the current cluster
    customers_in_cluster = customer_info[customer_info['Cluster'] == cluster]['CustomerID']
    
    for customer in customers_in_cluster:
        # Products already purchased by the customer in this cluster
        customer_purchased = customer_purchases[
            (customer_purchases['CustomerID'] == customer) & 
            (customer_purchases['Cluster'] == cluster)
        ]['StockCode'].tolist()
        
        # Recommend top 3 products not yet purchased
        top_not_purchased = top_products[~top_products['StockCode'].isin(customer_purchased)]
        top_3_not_purchased = top_not_purchased.head(3)
        
        # Append recommendations for the customer
        recommendations.append([
            customer, cluster, 
            *top_3_not_purchased[['StockCode', 'Description']].values.flatten().tolist()
        ])

# Step 7: Create a recommendations DataFrame
columns = ['CustomerID', 'Cluster', 
           'Rec1_StockCode', 'Rec1_Description', 
           'Rec2_StockCode', 'Rec2_Description', 
           'Rec3_StockCode', 'Rec3_Description']
recommendations_df = pd.DataFrame(recommendations, columns=columns)

# Step 8: Merge recommendations with customer_info
final_data = customer_info.merge(recommendations_df, on=['CustomerID', 'Cluster'], how='right')

# Save the final recommendations to a CSV file
final_data.to_csv('customer_recommendations.csv', index=False)
print("Recommendations saved to 'customer_recommendations.csv'")

Recommendations saved to 'customer_recommendations.csv'


In [6]:
# Step 7: Function to get recommendations for a specific CustomerID
def get_recommendations(customer_id):
    # Ensure the CustomerID is in the data
    if customer_id not in customer_info['CustomerID'].values:
        return f"CustomerID {customer_id} not found."
    
    # Get the cluster for the customer
    customer_cluster = customer_info.loc[customer_info['CustomerID'] == customer_id, 'Cluster'].values[0]
    
    # Get the top products for the customer's cluster
    top_products = top_products_per_cluster[top_products_per_cluster['Cluster'] == customer_cluster]
    
    # Get products the customer has already purchased
    customer_purchased = customer_purchases[
        (customer_purchases['CustomerID'] == customer_id) & 
        (customer_purchases['Cluster'] == customer_cluster)
    ]['StockCode'].tolist()
    
    # Find products not purchased by the customer
    top_not_purchased = top_products[~top_products['StockCode'].isin(customer_purchased)]
    top_3_not_purchased = top_not_purchased.head(3)
    
    # If there are no products to recommend
    if top_3_not_purchased.empty:
        return f"No new recommendations available for CustomerID {customer_id} in Cluster {customer_cluster}."
    
    # Format the recommendations
    recommendations = top_3_not_purchased[['StockCode', 'Description']].values.tolist()
    return f"Recommendations for CustomerID {customer_id} in Cluster {customer_cluster}:\n" + \
           "\n".join([f"{i+1}. {rec[1]} (StockCode: {rec[0]})" for i, rec in enumerate(recommendations)])


In [7]:
# Step 8: Input a CustomerID and get recommendations
customer_id_input = float(12348)  # Replace with an actual CustomerID for testing
print(get_recommendations(customer_id_input))

Recommendations for CustomerID 12348.0 in Cluster 0:
1. SMALL CHINESE STYLE SCISSOR (StockCode: 16014)
2. WORLD WAR 2 GLIDERS ASSTD DESIGNS (StockCode: 84077)
3. ASSORTED COLOUR BIRD ORNAMENT (StockCode: 84879)


## Reload dataset for next task

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load the datasets
df = pd.read_csv('data.csv', encoding="ISO-8859-1")  # Replace with your actual file
customer_info = pd.read_csv('detailed_data_5_clusters.csv')

# Ensure consistent data types for CustomerID
df['CustomerID'] = df['CustomerID'].astype(float)
customer_info['CustomerID'] = customer_info['CustomerID'].astype(float)

# Merge transactional data with customer cluster information
df = df.merge(customer_info[['CustomerID', 'Cluster']], on='CustomerID', how='inner')



# Content based Filtering

In [9]:

# Preprocess product descriptions
df['Description'] = df['Description'].fillna('').astype(str)

# Compute TF-IDF matrix for product descriptions
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)  # Limit features for efficiency
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Description'])

In [10]:
# Step 2: Function to compute product similarity dynamically
def get_similar_products(product_id, num_recommendations=3):
    """
    Compute similar products for a given product ID on demand.
    """
    if product_id not in df['StockCode'].values:
        return f"Product ID {product_id} not found."
    
    # Get the index of the product
    product_idx = df[df['StockCode'] == product_id].index[0]
    
    # Compute cosine similarity for this product only
    similarity_scores = cosine_similarity(tfidf_matrix[product_idx], tfidf_matrix).flatten()
    
    # Get top recommendations
    similar_indices = similarity_scores.argsort()[::-1]
    recommendations = []
    for idx in similar_indices:
        if df.iloc[idx]['StockCode'] != product_id:  # Exclude the original product
            recommendations.append((df.iloc[idx]['StockCode'], df.iloc[idx]['Description'], similarity_scores[idx]))
        if len(recommendations) >= num_recommendations:
            break
    return recommendations



In [11]:
# Step 3: Function to recommend products for a specific customer
def recommend_products_for_customer(customer_id, num_recommendations=3):
    """
    Recommend products based on the customer's purchase history.
    """
    # Get products purchased by the customer
    purchased_products = df[df['CustomerID'] == customer_id]['StockCode'].unique()
    if len(purchased_products) == 0:
        return f"No purchases found for CustomerID {customer_id}."
    
    # Get similar products for each purchased product
    all_recommendations = []
    for product_id in purchased_products:
        similar_products = get_similar_products(product_id, num_recommendations)
        all_recommendations.extend(similar_products)
    
    # Deduplicate recommendations and sort by similarity
    unique_recommendations = pd.DataFrame(all_recommendations, columns=['StockCode', 'Description', 'Similarity'])
    unique_recommendations = unique_recommendations.drop_duplicates(subset='StockCode').sort_values(by='Similarity', ascending=False)
    
    # Return top recommendations
    return unique_recommendations.head(num_recommendations)



In [12]:
# Step 4: Input a CustomerID and get recommendations
customer_id_input = float(12348)  # Replace with actual CustomerID for testing
customer_recommendations = recommend_products_for_customer(customer_id_input, num_recommendations=5)


In [13]:

# Display recommendations
if isinstance(customer_recommendations, str):
    print(customer_recommendations)
else:
    print(f"Top Recommendations for CustomerID {customer_id_input}:")
    for i, row in customer_recommendations.iterrows():
        print(f"{i+1}. {row['Description']} (StockCode: {row['StockCode']}, Similarity: {row['Similarity']:.2f})")

Top Recommendations for CustomerID 12348.0:
81. PACK OF 12 PINK PAISLEY TISSUES  (StockCode: 21984, Similarity: 0.86)
41. SET OF 72 RETROSPOT PAPER  DOILIES (StockCode: 21210, Similarity: 0.83)
96. ICE CREAM SUNDAE LIP GLOSS (StockCode: 23076, Similarity: 0.78)
66. PACK OF 12 RED APPLE TISSUES (StockCode: 23379, Similarity: 0.77)
76. HEARTS  STICKERS (StockCode: 21677, Similarity: 0.76)


## Recommendations to a new customer

In [14]:
import pandas as pd

# Step 1: Display Top 10 Products for Each Cluster
print("Top 10 Products for Each Cluster:")
display(top_products_per_cluster)

# Step 2: Recommendation Function
def recommend_products_for_new_customer(cluster_label=None):
    """
    Recommend products for new customers based on cluster behavior.
    If no cluster is specified, recommend top products overall.
    """
    if cluster_label is not None:
        # Recommend top 10 products from the specified cluster
        recommendations = top_products_per_cluster[top_products_per_cluster['Cluster'] == cluster_label][['StockCode', 'Description']].reset_index(drop=True)
    else:
        # Recommend top products across all clusters (most frequent products overall)
        recommendations = (
            top_products_per_cluster.groupby(['StockCode', 'Description'])['Quantity']
            .sum()
            .reset_index()
            .sort_values(by='Quantity', ascending=False)
            .head(10)[['StockCode', 'Description']]
        )

    return recommendations


# Example 1: Get recommendations for a new customer in Cluster 0
print("\nRecommended Products for New Customers in Cluster 0:")
cluster_0_recommendations = recommend_products_for_new_customer(cluster_label=0)
display(cluster_0_recommendations)

# Example 2: Get general recommendations if cluster is unknown
print("\nRecommended Products for New Customers (General):")
general_recommendations = recommend_products_for_new_customer()
display(general_recommendations)


Top 10 Products for Each Cluster:


Unnamed: 0,Cluster,StockCode,Description,Quantity
28,0,16014,SMALL CHINESE STYLE SCISSOR,11613
2780,0,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS,8787
1436,0,22616,PACK OF 12 LONDON TISSUES,6965
2996,0,84879,ASSORTED COLOUR BIRD ORNAMENT,6501
3150,0,85099B,JUMBO BAG RED RETROSPOT,6171
11,0,15036,ASSORTED COLOURS SILK FAN,5784
90,0,18007,ESSENTIAL BALM 3.5g TIN IN ENVELOPE,5679
62,0,17003,BROCADE RING PURSE,5443
3164,0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,4443
1029,0,22178,VICTORIAN GLASS HANGING T-LIGHT,4120



Recommended Products for New Customers in Cluster 0:


Unnamed: 0,StockCode,Description
0,16014,SMALL CHINESE STYLE SCISSOR
1,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS
2,22616,PACK OF 12 LONDON TISSUES
3,84879,ASSORTED COLOUR BIRD ORNAMENT
4,85099B,JUMBO BAG RED RETROSPOT
5,15036,ASSORTED COLOURS SILK FAN
6,18007,ESSENTIAL BALM 3.5g TIN IN ENVELOPE
7,17003,BROCADE RING PURSE
8,85123A,WHITE HANGING HEART T-LIGHT HOLDER
9,22178,VICTORIAN GLASS HANGING T-LIGHT



Recommended Products for New Customers (General):


Unnamed: 0,StockCode,Description
20,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS
24,85099B,JUMBO BAG RED RETROSPOT
22,84879,ASSORTED COLOUR BIRD ORNAMENT
25,85123A,WHITE HANGING HEART T-LIGHT HOLDER
6,21212,PACK OF 72 RETROSPOT CAKE CASES
18,23084,RABBIT NIGHT LIGHT
12,22197,POPCORN HOLDER
16,22616,PACK OF 12 LONDON TISSUES
14,22492,MINI PAINT SET VINTAGE
9,21977,PACK OF 60 PINK PAISLEY CAKE CASES


In [15]:
import pandas as pd

# Assuming 'best_selling_products' and 'top_products_per_cluster' are already defined

# Extract the top 3 products for each cluster
top_3_products_per_cluster = (
    best_selling_products.groupby('Cluster')
    .head(3)  # Get the top 3 products for each cluster
    .reset_index(drop=True)
)

# Create a DataFrame with the top 3 products per cluster
recommendation_df = top_3_products_per_cluster[['Cluster', 'StockCode', 'Description', 'Quantity']]

# Rename columns for clarity
recommendation_df.columns = ['Cluster', 'Product Code', 'Product Description', 'Total Quantity Sold']

# Display the DataFrame
print("Top 3 Product Recommendations for Each Cluster:")
display(recommendation_df)


Top 3 Product Recommendations for Each Cluster:


Unnamed: 0,Cluster,Product Code,Product Description,Total Quantity Sold
0,0,16014,SMALL CHINESE STYLE SCISSOR,11613
1,0,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS,8787
2,0,22616,PACK OF 12 LONDON TISSUES,6965
3,1,17003,BROCADE RING PURSE,1224
4,1,15036,ASSORTED COLOURS SILK FAN,1124
5,1,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS,1104
6,2,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS,11912
7,2,84879,ASSORTED COLOUR BIRD ORNAMENT,9260
8,2,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6684
9,3,84077,WORLD WAR 2 GLIDERS ASSTD DESIGNS,636


In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Assuming you already have the 'best_selling_products' DataFrame with 'StockCode', 'Description', and 'Quantity'

# Step 1: Vectorize Product Descriptions using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(best_selling_products['Description'])

# Step 2: Calculate cosine similarity between products
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Step 3: Create a function to recommend top N products based on similarity
def recommend_products_for_new_customer(product_id, top_n=3):
    # Get the index of the product
    idx = best_selling_products.index[best_selling_products['StockCode'] == product_id].tolist()[0]
    
    # Get pairwise similarity scores for all products with the given product
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the products by similarity score (higher is better) and get the top N
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top N most similar products
    top_similar_products = sim_scores[1:top_n + 1]  # Skip the first one (it's the product itself)
    
    # Extract product details for recommendations
    recommended_product_ids = [best_selling_products['StockCode'].iloc[i[0]] for i in top_similar_products]
    recommended_descriptions = [best_selling_products['Description'].iloc[i[0]] for i in top_similar_products]
    
    return pd.DataFrame({
        'Recommended Product ID': recommended_product_ids,
        'Recommended Description': recommended_descriptions
    })

# Example: Recommend top 3 products for a new customer, starting from a given product (e.g., '85123A')
recommendations = recommend_products_for_new_customer('85099B', top_n=10)

print("Recommended Products for New Customer:")
print(recommendations)


Recommended Products for New Customer:
  Recommended Product ID           Recommended Description
0                 62074B       ELEPHANT CLIP W SUCTION CUP
1                 62074B       ELEPHANT CLIP W SUCTION CUP
2                  47420  ASSORTED COLOUR SUCTION CUP HOOK
3                  47420  ASSORTED COLOUR SUCTION CUP HOOK
4                  47420  ASSORTED COLOUR SUCTION CUP HOOK
5                  47420  ASSORTED COLOUR SUCTION CUP HOOK
6                  47422  ASSORTED MONKEY SUCTION CUP HOOK
7                  47422  ASSORTED MONKEY SUCTION CUP HOOK
8                  47422  ASSORTED MONKEY SUCTION CUP HOOK
9                  21507         ELEPHANT, BIRTHDAY CARD, 


In [17]:
best_selling_products['StockCode'] = best_selling_products['StockCode'].astype(str)

In [18]:
# Convert all StockCode values to string type
best_selling_products['StockCode'] = best_selling_products['StockCode'].astype(str)

# Function to recommend products for a new customer (ensuring string type handling)
def recommend_products_for_new_customer(product_id, top_n=3):
    # Ensure the input product_id is a string
    product_id = str(product_id)
    
    # Get the index of the product (ensure it's treated as a string)
    idx = best_selling_products.index[best_selling_products['StockCode'] == product_id].tolist()[0]
    
    # Get pairwise similarity scores for all products with the given product
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the products by similarity score (higher is better) and get the top N
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top N most similar products (skip the first one, which is the product itself)
    top_similar_products = sim_scores[1:top_n + 1]  # Skip the first one (it's the product itself)
    
    # Extract product details for recommendations
    recommended_product_ids = [best_selling_products['StockCode'].iloc[i[0]] for i in top_similar_products]
    recommended_descriptions = [best_selling_products['Description'].iloc[i[0]] for i in top_similar_products]
    
    # Create DataFrame with unique recommendations
    recommended_df = pd.DataFrame({
        'Recommended Product ID': recommended_product_ids,
        'Recommended Description': recommended_descriptions
    })
    
    # Drop duplicates if any
    recommended_df = recommended_df.drop_duplicates(subset=['Recommended Product ID'])
    
    return recommended_df

# Example: Recommend top 3 products for a new customer, starting from a given product (e.g., '85123A')
recommendations = recommend_products_for_new_customer('85099B', top_n=2)

print("Recommended Products for New Customer:")
print(recommendations)


Recommended Products for New Customer:
  Recommended Product ID      Recommended Description
0                 62074B  ELEPHANT CLIP W SUCTION CUP
