In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Locations of the two input CSV files (absolute paths under /content).
customers_file = "/content/Customers.csv"
transactions_file = "/content/Transactions - Transactions.csv"

# Read both tables in one pass: customers first, then transactions.
customers_df, transactions_df = (
    pd.read_csv(csv_path) for csv_path in (customers_file, transactions_file)
)

In [2]:
# Join each transaction to its customer record. The merge is an inner join
# (the pandas default), so only CustomerIDs present in both tables survive.
customer_transactions = pd.merge(transactions_df, customers_df, on="CustomerID")

# Per-customer spend profile: total spend, average spend per transaction,
# and number of purchases. CustomerID is restored as a regular column.
customer_summary = (
    customer_transactions
    .groupby("CustomerID")
    .agg(
        total_spent=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
        avg_spent=pd.NamedAgg(column="TotalValue", aggfunc="mean"),
        num_purchases=pd.NamedAgg(column="TransactionID", aggfunc="count"),
    )
    .reset_index()
)

In [3]:
# Normalize data for similarity calculations.
# Every column after CustomerID is standardized (zero mean, unit variance) so
# that the features contribute on a comparable scale.
scaler = StandardScaler()
customer_summary_scaled = scaler.fit_transform(customer_summary.iloc[:, 1:])

# Pairwise cosine similarity: row i holds customer i's similarity to everyone,
# including themselves.
similarity_matrix = cosine_similarity(customer_summary_scaled)

lookalike_results = {}
customer_ids = customer_summary["CustomerID"].values

NUM_TARGET_CUSTOMERS = 20  # lookalikes are produced for the first 20 rows of customer_summary
TOP_K = 3                  # number of lookalikes kept per customer

# Clamp the loop so a dataset with fewer than NUM_TARGET_CUSTOMERS customers
# does not raise IndexError.
for i in range(min(NUM_TARGET_CUSTOMERS, len(customer_ids))):
    customer_id = customer_ids[i]
    similarity_scores = similarity_matrix[i]

    # Rank all customers from most to least similar, then drop the customer's
    # own index explicitly. Relying on "self is always ranked first" is unsafe:
    # self-similarity is only ~1.0 in floating point and other customers can tie
    # with (or marginally exceed) it, which would put the customer in their own
    # lookalike list and silently drop a genuine match.
    ranked = np.argsort(similarity_scores)[::-1]
    similar_customers = ranked[ranked != i][:TOP_K]

    # Convert scores to built-in floats so the stringified list written to CSV
    # reads "0.999" rather than a NumPy scalar repr such as "np.float64(0.999)".
    lookalike_results[customer_id] = [
        (customer_ids[j], round(float(similarity_scores[j]), 3))
        for j in similar_customers
    ]



In [4]:
# One row per target customer: the ID and its list of (lookalike ID, score) pairs.
lookalike_df = pd.DataFrame(
    list(lookalike_results.items()),
    columns=["CustomerID", "Lookalikes"],
)

# Write the table to disk without the index column, then preview the first five rows.
lookalike_df.to_csv("Lookalike.csv", index=False)
print(lookalike_df.head(n=5))

  CustomerID                                        Lookalikes
0      C0001  [(C0137, 0.999), (C0152, 0.996), (C0121, 0.993)]
1      C0002    [(C0029, 1.0), (C0199, 0.999), (C0010, 0.999)]
2      C0003      [(C0005, 1.0), (C0178, 1.0), (C0144, 0.999)]
3      C0004      [(C0067, 1.0), (C0021, 1.0), (C0075, 0.999)]
4      C0005    [(C0003, 1.0), (C0073, 0.999), (C0063, 0.999)]
