# Recommendation Model

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import altair
from sklearn.metrics.pairwise import cosine_similarity
import joblib

### Load dataset

In [2]:
# Load the pre-processed transaction data used to build the recommender.
DATA_PATH = "../data/processed/Recommendation/Recommendation_model.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the loaded frame; pandas shortens the display to the leading and trailing rows.
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice,InvoiceDateOnly,InvoiceDayOfWeek,InvoiceMonth,InvoiceHour
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2022-12-01 08:26:00,2.55,17850.0,United Kingdom,15.30,2022-12-01,Thursday,December,8
1,536365,71053,WHITE METAL LANTERN,6,2022-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,2022-12-01,Thursday,December,8
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2022-12-01 08:26:00,2.75,17850.0,United Kingdom,22.00,2022-12-01,Thursday,December,8
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2022-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,2022-12-01,Thursday,December,8
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2022-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,2022-12-01,Thursday,December,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
397879,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2023-12-09 12:50:00,0.85,12680.0,France,10.20,2023-12-09,Saturday,December,12
397880,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2023-12-09 12:50:00,2.10,12680.0,France,12.60,2023-12-09,Saturday,December,12
397881,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2023-12-09 12:50:00,4.15,12680.0,France,16.60,2023-12-09,Saturday,December,12
397882,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2023-12-09 12:50:00,4.15,12680.0,France,16.60,2023-12-09,Saturday,December,12


### Drop missing values, canceled orders, and rows with non-positive quantity or price

In [4]:
# Keep only the rows that are usable for modelling:
#   * CustomerID and Description must be present,
#   * invoices whose number starts with "C" are removed (treated as cancelled orders),
#   * Quantity and UnitPrice must both be strictly positive.
# The frame is rebound instead of mutated in place, and the final .copy()
# makes `df` an independent frame so that the column assignments in later
# cells do not write into a slice (avoids pandas' SettingWithCopyWarning).
df = df.dropna(subset=['CustomerID', 'Description'])
is_cancelled = df['InvoiceNo'].astype(str).str.startswith('C')
has_positive_amounts = (df['Quantity'] > 0) & (df['UnitPrice'] > 0)
df = df[~is_cancelled & has_positive_amounts].copy()

### Check for null values

In [5]:
# Count the remaining missing values in each column.
null_counts = df.isna().sum()
null_counts

InvoiceNo           0
StockCode           0
Description         0
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID          0
Country             0
TotalPrice          0
InvoiceDateOnly     0
InvoiceDayOfWeek    0
InvoiceMonth        0
InvoiceHour         0
dtype: int64

### Strip extra whitespace from descriptions and convert them to upper case

In [6]:
# Normalise product names: trim surrounding whitespace first, then upper-case,
# so that the same product is always spelled identically.
trimmed_description = df['Description'].str.strip()
df['Description'] = trimmed_description.str.upper()

## Train/Test Split 

In [12]:
# Chronological hold-out: rows stamped at or before the 80th-percentile
# InvoiceDate form the training set, everything later forms the test set.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
invoice_ts = df['InvoiceDate']
split_date = invoice_ts.quantile(0.8)

in_train = invoice_ts <= split_date
in_test = invoice_ts > split_date

train_df = df.loc[in_train]
test_df = df.loc[in_test]

## Create User–Item Matrix (Purchase Quantities)

In [13]:
# Customer x product matrix built from the training rows: each cell holds the
# summed Quantity for that customer/product pair, with 0 where there is none.
basket = pd.pivot_table(
    train_df,
    index='CustomerID',
    columns='Description',
    values='Quantity',
    aggfunc='sum',
).fillna(0)

# Drop rarely purchased products: keep only columns with a positive quantity
# for at least three customers.
product_counts = (basket > 0).sum()
frequent_products = product_counts[product_counts >= 3].index
basket = basket.loc[:, frequent_products]

## Apply TF-IDF Weighting

In [14]:
# "Term frequency" is the quantity matrix itself (copied so basket stays untouched).
tf = basket.copy()

# Smoothed inverse document frequency per product:
#     idf = log(1 + N / (1 + df))
# where N is the number of customers (rows) and df is the number of customers
# with a positive quantity of the product.
n_customers = len(tf)
df_counts = (tf > 0).sum(axis=0)
idf = np.log1p(n_customers / (df_counts + 1))

# Scale every product column by its idf weight (aligned on column labels).
# NOTE(review): each product column is multiplied by one positive constant, and
# cosine similarity between columns is unaffected by such scaling, so the
# item-item similarities derived from tfidf_matrix.T should equal those from
# basket.T. Confirm whether a per-customer (row-wise) weighting was intended.
tfidf_matrix = tf * idf

## Compute Cosine Similarity

In [15]:
# Pairwise cosine similarity between products; the transpose puts one product
# per row. (basket.T could be passed instead for the frequency-based variant.)
item_similarity = cosine_similarity(tfidf_matrix.T)

# Label both axes with the product names so scores can be looked up by name.
product_labels = tf.columns
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=product_labels,
    columns=product_labels,
)

## Save model


In [16]:
# Persist the labelled similarity table so it can be reloaded without recomputing it.
MODEL_PATH = '../models/item_similarity_model.pkl'
joblib.dump(item_similarity_df, MODEL_PATH)

['../models/item_similarity_model.pkl']

## Recommendation Function

In [17]:
def recommend_products(product_name, similarity_df=None, top_n=5):
    """Return the names of the products most similar to ``product_name``.

    Parameters
    ----------
    product_name : str
        Product description to look up. Surrounding whitespace is stripped
        and the text is upper-cased before the lookup.
    similarity_df : pandas.DataFrame, optional
        Item-item similarity table labelled with product names on both axes.
        When omitted, the notebook-level ``item_similarity_df`` is resolved at
        call time, so a recomputed matrix is picked up without having to
        re-run this cell.
    top_n : int, default 5
        Maximum number of products to return.

    Returns
    -------
    list of str
        Up to ``top_n`` product names, most similar first. If the product is
        not a column of ``similarity_df`` the single-element list
        ``["❌ Product not found."]`` is returned.
    """
    if similarity_df is None:
        similarity_df = item_similarity_df

    product_name = product_name.strip().upper()

    if product_name not in similarity_df.columns:
        return ["❌ Product not found."]

    # Remove the queried product by label instead of assuming it sorts first:
    # with ties or rounding another product can outrank it, in which case
    # skipping position 0 would drop a real neighbour and return the product
    # itself.
    scores = similarity_df[product_name].drop(labels=product_name, errors='ignore')

    # A stable sort keeps equally scored products in a reproducible order.
    ranked = scores.sort_values(ascending=False, kind='mergesort')
    return ranked.iloc[:max(top_n, 0)].index.tolist()

## Offline Evaluation (Hit Rate @ N)

In [18]:
def hit_rate(similarity_df, train_df, test_df, n=5):
    """Compute and print Hit Rate @ n for an item-item similarity table.

    For every customer present in ``test_df``, each product the customer
    bought in ``train_df`` (and that is a column of ``similarity_df``) is one
    trial. A trial is a hit when at least one of that product's ``n`` most
    similar products appears among the customer's test-period purchases.
    Customers without training purchases contribute no trials.

    Parameters
    ----------
    similarity_df : pandas.DataFrame
        Item-item similarity table labelled with product names on both axes.
    train_df, test_df : pandas.DataFrame
        Transaction frames with ``CustomerID`` and ``Description`` columns.
    n : int, default 5
        Number of most-similar products considered per trial.

    Returns
    -------
    float
        ``hits / trials``, or 0 when there are no trials.
    """
    # Group each frame once instead of re-filtering it for every customer.
    train_by_user = train_df.groupby('CustomerID')['Description'].unique().to_dict()
    test_by_user = test_df.groupby('CustomerID')['Description'].unique().to_dict()

    known_products = set(similarity_df.columns)
    top_n_cache = {}

    def _top_n(product):
        """Return the set of the n most similar products, computed once per product."""
        cached = top_n_cache.get(product)
        if cached is None:
            # Drop the product by label rather than assuming it sorts first;
            # with ties or rounding it may not, and skipping position 0 would
            # then discard a genuine neighbour and keep the product itself.
            scores = similarity_df[product].drop(labels=product, errors='ignore')
            ranked = scores.sort_values(ascending=False, kind='mergesort')
            cached = set(ranked.iloc[:max(n, 0)].index)
            top_n_cache[product] = cached
        return cached

    hits = 0
    total = 0

    for user, test_products in test_by_user.items():
        held_out = set(test_products)
        for train_product in train_by_user.get(user, ()):
            if train_product not in known_products:
                continue
            if not held_out.isdisjoint(_top_n(train_product)):
                hits += 1
            total += 1

    hit_rate_value = hits / total if total else 0
    print(f"Hit Rate @ {n}: {hit_rate_value:.4f}")
    return hit_rate_value

### Evaluate the model

In [20]:
# Score the similarity table on the held-out period using the top 5 neighbours.
hit_rate(
    similarity_df=item_similarity_df,
    train_df=train_df,
    test_df=test_df,
    n=5,
)

Hit Rate @ 5: 0.1976


0.19760792432790217

In [19]:
# Example lookup for a single product.
sample_product = "WHITE HANGING HEART T-LIGHT HOLDER"
print(recommend_products(sample_product))

['GIN + TONIC DIET METAL SIGN', 'FAIRY CAKE FLANNEL ASSORTED COLOUR', 'DOORMAT FAIRY CAKE', 'RED HANGING HEART T-LIGHT HOLDER', 'WASHROOM METAL SIGN']


# Summary
In this project, we developed a complete item-based collaborative filtering recommendation system using transactional e-commerce data. After cleaning and preprocessing the dataset, we created a customer-product interaction matrix using product purchase quantities. We applied optional enhancements such as filtering out rarely purchased items and implementing TF-IDF weighting to reduce the influence of overly popular products. Cosine similarity was then used to compute product-to-product similarity, forming the basis for recommendations. A recommendation function was built to return the top-N similar products for any given input item. To evaluate model performance, we implemented a time-based train/test split and calculated Hit Rate @ N, which measures how often actual future purchases appeared in the top-N recommended products. This pipeline provides both a functional recommendation engine and an evaluation framework, suitable for integration into applications like a Streamlit-based e-commerce platform.