In [1]:
import pandas as pd

In [3]:
# Read Customers.csv into the `customers` DataFrame.
# NOTE(review): absolute, machine-specific path; this cell only runs where this exact file exists.
customers = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Siddhika\Downloads\Customers.csv',
)

In [5]:
# Read Products.csv into the `products` DataFrame.
# NOTE(review): absolute, machine-specific path; this cell only runs where this exact file exists.
products = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Siddhika\Downloads\Products.csv',
)

In [7]:
# Read Transactions.csv into the `transactions` DataFrame.
# NOTE(review): absolute, machine-specific path; this cell only runs where this exact file exists.
transactions = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Siddhika\Downloads\Transactions.csv',
)

In [9]:
# Attach the product columns to every transaction row. The left join keeps
# all transactions, including any whose ProductID has no match in `products`.
transactions_products = pd.merge(
    transactions,
    products,
    how="left",
    on="ProductID",
)

In [11]:
# Attach the customer columns to the transaction/product rows. The left join
# keeps every transaction, including any whose CustomerID has no match in `customers`.
merged_data = pd.merge(
    transactions_products,
    customers,
    how="left",
    on="CustomerID",
)

In [13]:
# One row per CustomerID with the sum of TotalValue over that customer's
# transactions, exposed as the column "TotalSpending".
customer_spending = (
    merged_data
    .groupby("CustomerID")["TotalValue"]
    .sum()
    .rename("TotalSpending")
    .reset_index()
)

In [15]:
# For each customer, pick the category with the largest summed Quantity and
# expose it as "FavoriteCategory" (the winning Quantity column is kept too).
favorite_category = (
    merged_data
    .groupby(["CustomerID", "Category"])["Quantity"]
    .sum()
    .reset_index()
    # Within each customer, order categories from most to least purchased.
    .sort_values(by=["CustomerID", "Quantity"], ascending=[True, False])
    # The first row per customer is therefore the top category.
    .drop_duplicates(subset="CustomerID", keep="first")
    .reset_index(drop=True)
    .rename(columns={"Category": "FavoriteCategory"})
)

In [17]:
# Start from the full customer table and left-join the two derived features,
# so customers without any transactions are still present (with NaN features).
customer_features = (
    customers
    .merge(customer_spending, how="left", on="CustomerID")
    .merge(
        favorite_category.loc[:, ["CustomerID", "FavoriteCategory"]],
        how="left",
        on="CustomerID",
    )
)

In [19]:
# Fill the gaps left by the left joins: missing spending becomes 0 and a
# missing favorite category becomes the placeholder "Unknown".
customer_features = customer_features.assign(
    TotalSpending=customer_features["TotalSpending"].fillna(0),
    FavoriteCategory=customer_features["FavoriteCategory"].fillna("Unknown"),
)

In [21]:
from sklearn.preprocessing import OneHotEncoder

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
import numpy as np

In [27]:
# One-hot encode FavoriteCategory into a dense array
# (one column per distinct category value).
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(customer_features.loc[:, ["FavoriteCategory"]])
encoded_categories = encoder.transform(customer_features.loc[:, ["FavoriteCategory"]])

In [29]:
# Build the per-customer feature matrix: [scaled TotalSpending | one-hot category].
#
# TotalSpending is min-max scaled to [0, 1] before stacking. Left in raw
# currency units next to 0/1 one-hot columns, it dominates every vector
# whenever its values are large relative to 1, so cosine similarity comes out
# ~1.0 for almost every pair of customers and the category features have no
# effect on the ranking.
spending = customer_features[["TotalSpending"]].to_numpy(dtype=float)
spending_min = spending.min(axis=0)
spending_range = spending.max(axis=0) - spending_min
# A constant column has zero range; divide by 1 instead so the scaled value
# is 0 rather than NaN.
spending_range[spending_range == 0] = 1.0
numerical_features = (spending - spending_min) / spending_range

customer_data = np.hstack((numerical_features, encoded_categories))

In [31]:
# Pairwise cosine similarity between all rows of `customer_data`;
# entry [i, j] compares customer i with customer j.
similarity_matrix = cosine_similarity(X=customer_data)

In [33]:
# Customer IDs in the same row order as `customer_features`.
customer_ids = customer_features.loc[:, "CustomerID"].values
# Filled later: CustomerID -> list of (lookalike CustomerID, similarity score).
top_customers = dict()

In [35]:
# Collect the top lookalikes for the leading customers in `customer_ids`.
# The target count is capped at the number of customers available so a
# dataset with fewer than 20 rows does not raise IndexError.
n_targets = min(20, len(customer_ids))
n_lookalikes = 3

for i in range(n_targets):
    customer_id = customer_ids[i]
    # Similarity scores between the current customer and every customer.
    similarities = similarity_matrix[i]
    # Rank all customers by descending similarity; the stable sort keeps
    # tied scores in index order so the result is deterministic.
    ranked_indices = (-similarities).argsort(kind="stable")
    # Exclude the customer itself by index rather than by rank position:
    # with ties or floating-point rounding the self-match is not guaranteed
    # to sort first, so slicing [1:4] could keep the customer as its own
    # lookalike and drop a real neighbor.
    similar_indices = ranked_indices[ranked_indices != i][:n_lookalikes]
    similar_customers = [(customer_ids[j], similarities[j]) for j in similar_indices]
    top_customers[customer_id] = similar_customers



In [37]:
def _as_builtin(value):
    """Return a NumPy scalar as its built-in Python equivalent; pass anything else through."""
    return value.item() if isinstance(value, np.generic) else value


# One record per target customer. The lookalike list is stored as the string
# form of [(CustomerID, score), ...]. Values are converted to built-in types
# first because, under NumPy >= 2, the repr of a NumPy scalar includes its
# type (e.g. "np.float64(0.98)"), which would otherwise end up in the CSV.
lookalike_data = [
    {
        "CustomerID": cust_id,
        "Lookalikes": str(
            [(_as_builtin(other_id), _as_builtin(score)) for other_id, score in recs]
        ),
    }
    for cust_id, recs in top_customers.items()
]

lookalike_df = pd.DataFrame(lookalike_data)

In [39]:
# Write the lookalike table to Lookalike.csv in the working directory
# (without the index column), then confirm on stdout.
lookalike_df.to_csv(path_or_buf="Lookalike.csv", index=False)
print("Lookalike Model completed. Results saved to 'Lookalike.csv'.")

Lookalike Model completed. Results saved to 'Lookalike.csv'.
