In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from difflib import get_close_matches

In [2]:
#  Load Dataset
df = pd.read_csv("online_retail.csv", parse_dates=["InvoiceDate"])

In [3]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2022-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2022-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2022-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2022-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2022-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
df.describe()

Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID
count,541909.0,541909,541909.0,406829.0
mean,9.55225,2023-07-04 13:34:57.156387072,4.611114,15287.69057
min,-80995.0,2022-12-01 08:26:00,-11062.06,12346.0
25%,1.0,2023-03-28 11:34:00,1.25,13953.0
50%,3.0,2023-07-19 17:17:00,2.08,15152.0
75%,10.0,2023-10-19 11:27:00,4.13,16791.0
max,80995.0,2023-12-09 12:50:00,38970.0,18287.0
std,218.081158,,96.759853,1713.600303


In [5]:
#  Data Cleaning
df.dropna(subset=["CustomerID", "Description"], inplace=True)
df = df[~df["InvoiceNo"].astype(str).str.startswith("C")]
df = df[(df["Quantity"] > 0) & (df["UnitPrice"] > 0)]
df["TotalPrice"] = df["Quantity"] * df["UnitPrice"]

In [6]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2022-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2022-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2022-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2022-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2022-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


In [7]:
df.describe()

Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID,TotalPrice
count,397884.0,397884,397884.0,397884.0,397884.0
mean,12.988238,2023-07-10 23:41:23.511022592,3.116488,15294.423453,22.397
min,1.0,2022-12-01 08:26:00,0.001,12346.0,0.001
25%,2.0,2023-04-07 11:12:00,1.25,13969.0,4.68
50%,6.0,2023-07-31 14:39:00,1.95,15159.0,11.8
75%,12.0,2023-10-20 14:33:00,3.75,16795.0,19.8
max,80995.0,2023-12-09 12:50:00,8142.75,18287.0,168469.6
std,179.331775,,22.097877,1713.14156,309.071041


In [8]:
#  RFM Feature Engineering
latest_date = df["InvoiceDate"].max()
rfm = df.groupby("CustomerID").agg({
    "InvoiceDate": lambda x: (latest_date - x.max()).days,
    "InvoiceNo": "nunique",
    "TotalPrice": "sum"
}).reset_index()
rfm.columns = ["CustomerID", "Recency", "Frequency", "Monetary"]


In [9]:
#  Standardize RFM values
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[["Recency", "Frequency", "Monetary"]])

In [10]:
#  KMeans Clustering
kmeans = KMeans(n_clusters=4, random_state=42)
rfm["Cluster"] = kmeans.fit_predict(rfm_scaled)


In [11]:
#  Segment Labelling
def label_segment(row):
    if row["Recency"] < rfm["Recency"].quantile(0.25) and \
       row["Frequency"] > rfm["Frequency"].quantile(0.75) and \
       row["Monetary"] > rfm["Monetary"].quantile(0.75):
        return "High-Value"
    elif row["Frequency"] > rfm["Frequency"].median():
        return "Regular"
    elif row["Recency"] > rfm["Recency"].quantile(0.75):
        return "At-Risk"
    else:
        return "Occasional"

rfm["Segment"] = rfm.apply(label_segment, axis=1)

In [18]:
#  Save Models
joblib.dump(kmeans, "kmeans_model.pkl")
joblib.dump(scaler, "scaler.pkl")
rfm.to_csv("rfm_clusters.csv", index=False)
df.to_csv("cleaned_data.csv",index=False)

In [13]:
#  Collaborative Filtering Preparation
user_item_matrix = df.pivot_table(index="CustomerID", columns="StockCode", values="Quantity", fill_value=0)
product_similarity = cosine_similarity(user_item_matrix.T)
sim_df = pd.DataFrame(product_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)
sim_df.to_csv("product_similarity.csv")

In [14]:
#  Mapping: StockCode ↔ Product Name
product_map = df[["StockCode", "Description"]].drop_duplicates().dropna()
code_to_name = dict(zip(product_map["StockCode"], product_map["Description"]))
name_to_code = {v: k for k, v in code_to_name.items()}

In [15]:
#  Helper Function: Get Closest Matching Name
def get_closest_product_name(input_name):
    all_names = list(name_to_code.keys())
    match = get_close_matches(input_name, all_names, n=1, cutoff=0.6)
    return match[0] if match else None


In [16]:
#  Function: Recommend Based on Product Name
def get_top_5_similar_by_name(product_name):
    match_name = get_closest_product_name(product_name)
    if not match_name:
        return ["No close match found for the product name."]
    
    product_code = name_to_code.get(match_name)
    if not product_code or product_code not in sim_df.columns:
        return ["Product not found in similarity matrix."]
    
    top5_codes = sim_df[product_code].sort_values(ascending=False)[1:6].index.tolist()
    top5_names = [code_to_name.get(code, f"Unknown Product ({code})") for code in top5_codes]
    
    return [f"{i+1}. {name}" for i, name in enumerate(top5_names)]


In [17]:
#  Example Usage
product_input = "white hanging heart t-light holder"
recommendations = get_top_5_similar_by_name(product_input.upper())
print(f"\nTop 5 similar products to '{product_input.title()}':")
for rec in recommendations:
    print(rec)


Top 5 similar products to 'White Hanging Heart T-Light Holder':
1. BLACK AND WHITE CAT BOWL
2. DAISY GARDEN MARKER
3. IF YOU CAN'T STAND THE HEAT MUG
4. PANTRY CHOPPING BOARD
5. DOORSTOP RETROSPOT HEART
