In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Number of lookalikes reported per customer, and number of customers reported on.
TOP_K = 3
N_TARGET_CUSTOMERS = 20


def min_max_normalize(frame):
    """Scale every column of *frame* to the [0, 1] range.

    A column whose values are all equal has a range of zero; dividing by it
    would produce NaN, so such a column is scaled to all zeros instead.

    Args:
        frame: Numeric DataFrame, one row per entity and one column per feature.

    Returns:
        A DataFrame with the same index and columns as *frame*.
    """
    lowest = frame.min()
    # Replace a zero range with 1 so a constant column maps to 0 instead of NaN.
    span = (frame.max() - lowest).replace(0, 1)
    return (frame - lowest) / span


def top_similar(similarity, customer_id, k=TOP_K):
    """Return the *k* customers most similar to *customer_id*, excluding itself.

    The customer's own entry is removed by label rather than by position:
    the self-similarity is not guaranteed to sort first (another customer can
    tie with it, and an all-zero profile has a self-similarity of 0).

    Args:
        similarity: Square DataFrame of pairwise similarities whose index and
            columns are both customer IDs.
        customer_id: The customer to find lookalikes for.
        k: How many lookalikes to return.

    Returns:
        A list of ``(customer_id, score)`` tuples ordered by descending score.
    """
    scores = similarity[customer_id].drop(labels=customer_id, errors="ignore")
    return list(scores.nlargest(k).items())


if __name__ == "__main__":
    # Load datasets
    customers = pd.read_csv('../data/Customers.csv')
    products = pd.read_csv('../data/Products.csv')
    transactions = pd.read_csv('../data/Transactions.csv')

    # Merge datasets
    data = pd.merge(transactions, customers, on='CustomerID')
    data = pd.merge(data, products, on='ProductID')

    # Resolve duplicate columns: both Transactions and Products carry 'Price',
    # so the merge suffixes them. Keep 'Price_y' (from Products), drop 'Price_x'.
    data = data.rename(columns={'Price_y': 'Price'}).drop(columns=['Price_x'])

    # Verify column names
    print(data.columns)

    # Prepare data for similarity: one row per customer with summed features
    customer_profiles = data.groupby('CustomerID')[['Price', 'Quantity', 'TotalValue']].sum()

    # Normalize data
    customer_profiles_normalized = min_max_normalize(customer_profiles)

    # Compute similarity
    similarity_matrix = cosine_similarity(customer_profiles_normalized)
    similarity_df = pd.DataFrame(
        similarity_matrix,
        index=customer_profiles.index,
        columns=customer_profiles.index,
    )

    # Find the top lookalikes for the first N_TARGET_CUSTOMERS customer IDs
    lookalikes = {
        customer_id: top_similar(similarity_df, customer_id, TOP_K)
        for customer_id in customer_profiles.index[:N_TARGET_CUSTOMERS]
    }

    # Convert lookalikes dictionary to DataFrame; each cell is an (id, score) tuple
    lookalikes_df = pd.DataFrame.from_dict(
        lookalikes,
        orient='index',
        columns=[f'Customer{rank}' for rank in range(1, TOP_K + 1)],
    )

    # Save results
    lookalikes_df.to_csv('../outputs/FirstName_LastName_Lookalike.csv', index=True)

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'CustomerName', 'Region', 'SignupDate',
       'ProductName', 'Category', 'Price'],
      dtype='object')
