In [1]:
# Load data
import pandas as pd

# Read the three input tables from CSV files in the working directory
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Join customer attributes onto each transaction, then product attributes
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID')
merged = pd.merge(transactions_with_customers, products, on='ProductID')


In [4]:
# Collapse the merged rows into one feature row per CustomerID:
# summed TotalValue, count of TransactionID, and the list of distinct categories.
aggregations = {
    'TotalValue': 'sum',
    'TransactionID': 'count',
    'Category': lambda categories: list(categories.unique()),
}
feature_names = {
    'TotalValue': 'TotalSpending',
    'TransactionID': 'NumTransactions',
}

per_customer = merged.groupby('CustomerID').agg(aggregations)
customer_features = per_customer.rename(columns=feature_names)


In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Encode product categories into a bag-of-words representation.
# Each 'Category' entry is already a list of category names, so the tokenizer
# and preprocessor are identity functions. token_pattern=None silences the
# "token_pattern will not be used" warning raised when a tokenizer is supplied.
vectorizer = CountVectorizer(
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    token_pattern=None,
)
category_matrix = vectorizer.fit_transform(customer_features['Category'])

# Combine numeric and categorical features by summing the two pairwise
# cosine-similarity matrices; row/column i corresponds to customer_features.index[i].
# NOTE(review): the numeric columns are passed to cosine_similarity unscaled —
# consider standardising them first if both should carry comparable weight; confirm intent.
numeric_features = customer_features[['TotalSpending', 'NumTransactions']].values
combined_features = cosine_similarity(numeric_features) + cosine_similarity(category_matrix)

# Get top 3 similar customers for each customer
lookalike = {}
for i, customer_id in enumerate(customer_features.index):
    similarities = combined_features[i]
    # Rank every customer by descending score, then drop this customer's own
    # position explicitly. Slicing off the first entry is not safe: a tie (or
    # floating-point round-off) can place another customer ahead of the
    # self-match, which would leave the customer listed as its own lookalike.
    ranked_indices = np.argsort(similarities)[::-1]
    similar_indices = ranked_indices[ranked_indices != i][:3]
    similar_customers = [(customer_features.index[idx], similarities[idx]) for idx in similar_indices]
    lookalike[customer_id] = similar_customers




In [7]:
# Build one record per customer; the matches are rendered as a single
# "(id, score)" comma-separated string with scores shown to two decimals.
lookalike_rows = []
for source_id, matches in lookalike.items():
    rendered_matches = [f"({cust}, {score:.2f})" for cust, score in matches]
    lookalike_rows.append({
        'CustomerID': source_id,
        'Lookalikes': ', '.join(rendered_matches),
    })
lookalike_df = pd.DataFrame(lookalike_rows)

# Write the table to CSV without the positional index column
lookalike_df.to_csv('Shivani_Channagoudra_Lookalike.csv', index=False)
