Import Essential Libraries

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


Load Datasets

In [2]:
# Read the three input tables; relative paths resolve against the notebook's
# current working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Products.csv", "Transactions.csv")
)

Merge Datasets

In [3]:
# Join customer columns onto each transaction, then product columns.
# Both joins are inner joins (the pandas default), so a transaction whose
# CustomerID or ProductID has no match in the other table is dropped.
transactions_with_customers = pd.merge(transactions, customers, on="CustomerID", how="inner")
data = pd.merge(transactions_with_customers, products, on="ProductID", how="inner")

Aggregate data for customer profiles

In [8]:
def _most_frequent(values):
    """Return the most frequent value; on ties, the first entry returned by ``mode()``."""
    return values.mode()[0]


# One row per CustomerID: summed TotalValue, summed Quantity, and the most
# frequent Category across that customer's transactions.
customer_profiles = (
    data.groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),
        Quantity=("Quantity", "sum"),
        Category=("Category", _most_frequent),
    )
    .reset_index()
)

One-hot encode, Standardize numerical features & Calculate Cosine similarity

In [None]:
# Build the model features in a separate frame instead of overwriting
# `customer_profiles`: the cell can then be re-run without a KeyError on an
# already-encoded "Category" column, and the CustomerID column used by later
# cells stays untouched. CustomerID is dropped by name rather than by position.
# NOTE(review): drop_first=True leaves the first category represented only
# implicitly (all dummy columns zero before scaling) — confirm this asymmetric
# encoding is intended for a similarity computation.
customer_features = pd.get_dummies(
    customer_profiles.drop(columns=["CustomerID"]),
    columns=["Category"],
    drop_first=True,
)

# Standardize every feature column (numeric totals and category dummies alike)
# so that no single column dominates the cosine similarity.
scaler = StandardScaler()
customer_profiles_scaled = scaler.fit_transform(customer_features)

# Pairwise cosine similarity; row/column i corresponds to row i of
# `customer_profiles`.
similarity_matrix = cosine_similarity(customer_profiles_scaled)

Top 3 similar customers for each customer

In [9]:
TOP_N = 3  # number of lookalikes reported per customer

# Maps each CustomerID to a list of (similar CustomerID, similarity score)
# tuples, ordered from most to least similar.
lookalike_results = {}
for idx, customer_id in enumerate(customer_profiles["CustomerID"]):
    row_scores = similarity_matrix[idx]
    # Rank all customers by descending similarity; the stable sort keeps tied
    # scores in row order so the result is deterministic.
    ranked = (-row_scores).argsort(kind="stable")
    # Remove the customer itself by position. Assuming it always ranks first
    # breaks when another customer ties at the top score or when the scaled
    # feature row is all zeros (self-similarity of 0).
    similar_indices = ranked[ranked != idx][:TOP_N]
    similar_customers = customer_profiles.iloc[similar_indices]["CustomerID"].values
    similarity_scores = row_scores[similar_indices]
    lookalike_results[customer_id] = list(zip(similar_customers, similarity_scores))

Filter for first 20 customers

In [11]:
# Keep the lookalike lists only for the first 20 rows of `customer_profiles`.
first_customer_ids = customer_profiles["CustomerID"].iloc[:20]
filtered_results = {}
for cust_id in first_customer_ids:
    filtered_results[cust_id] = lookalike_results[cust_id]

Save as CSV file

In [13]:
# Flatten {customer: [(lookalike, score), ...]} into one row per
# (customer, lookalike) pair.
output = [
    [cust_id, similar_cust_id, score]
    for cust_id, similar_list in filtered_results.items()
    for similar_cust_id, score in similar_list
]

lookalike_df = pd.DataFrame(
    output,
    columns=["CustomerID", "SimilarCustomerID", "SimilarityScore"],
)
# Write the table without the DataFrame index column.
lookalike_df.to_csv("AngaraVenkataSaiSingu_Rishik_Lookalike.csv", index=False)