# Task 2: Lookalike Model

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Build one wide table: every transaction row enriched with the columns of
# its product and of its customer. Inner joins (the pandas default) keep only
# rows whose key is present on both sides of each merge.
transactions_products = transactions.merge(
    right=products,
    how="inner",
    on="ProductID",
)
full_data = transactions_products.merge(
    right=customers,
    how="inner",
    on="CustomerID",
)

In [None]:
# Feature engineering: collapse the merged table into one profile row per
# customer (spending totals plus a per-category row count).
customer_profiles = (
    full_data
    .groupby('CustomerID')
    .agg(
        # Overall spending behaviour
        TotalSpending=('TotalValue', 'sum'),
        TransactionCount=('TransactionID', 'count'),
        AvgTransactionValue=('TotalValue', 'mean'),
        # Number of the customer's rows falling in each product category
        CategoryBooks=('Category', lambda cat: cat.eq('Books').sum()),
        CategoryElectronics=('Category', lambda cat: cat.eq('Electronics').sum()),
        CategoryClothing=('Category', lambda cat: cat.eq('Clothing').sum()),
        CategoryHomeDecor=('Category', lambda cat: cat.eq('Home Decor').sum()),
    )
    # Turn the CustomerID group key back into an ordinary column.
    .reset_index()
)

In [None]:
# Normalize numerical features for similarity computation: each feature column
# is z-scored (centered on its mean, divided by its sample standard deviation)
# so that no single feature dominates the similarity score through scale alone.
features = ['TotalSpending', 'TransactionCount', 'AvgTransactionValue',
            'CategoryBooks', 'CategoryElectronics', 'CategoryClothing', 'CategoryHomeDecor']
feature_mean = customer_profiles[features].mean()
feature_std = customer_profiles[features].std()
# A column with no spread (e.g. a category nobody bought) has std 0, and with a
# single customer the sample std is NaN. Dividing by either yields NaN features,
# which break the downstream cosine similarity. Fall back to a divisor of 1 so
# such columns simply become all zeros after centering.
# `~(std > 0)` is true for both 0 and NaN.
feature_std[~(feature_std > 0)] = 1.0

customer_profiles_normalized = customer_profiles.copy()
customer_profiles_normalized[features] = (
    customer_profiles[features] - feature_mean
) / feature_std

In [None]:
# Pairwise cosine similarity between the customers' normalized feature vectors.
# Entry [i, j] compares row i with row j of customer_profiles_normalized.
customer_vectors = customer_profiles_normalized.loc[:, features].values
similarity_matrix = cosine_similarity(X=customer_vectors)


In [None]:
# Extract top 3 similar customers for the first 20 customers.
# Result: {CustomerID: [(lookalike CustomerID, similarity score), ...]}
# Materialize the IDs once so lookups below are positional (matching the row
# order of similarity_matrix) rather than dependent on the DataFrame index labels.
customer_ids = list(customer_profiles['CustomerID'])
lookalikes = {}
for i, cust_id in enumerate(customer_ids[:20]):
    # Drop the customer's own entry explicitly. Relying on "self is always
    # ranked first" fails when another customer has an identical profile or
    # when floating-point rounding puts a neighbour marginally above self.
    candidates = [
        (j, score) for j, score in enumerate(similarity_matrix[i]) if j != i
    ]
    # sorted() is stable, so ties keep their original row order.
    top_similar = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:3]
    # Convert scores to plain Python floats so that later string conversion
    # prints e.g. 0.9123 instead of a NumPy scalar repr.
    lookalikes[cust_id] = [
        (customer_ids[j], round(float(score), 4)) for j, score in top_similar
    ]

In [None]:
# Convert to DataFrame for export: one row per customer, with that customer's
# lookalike list flattened to its string representation.
lookalike_rows = [
    (cust_id, str(matches)) for cust_id, matches in lookalikes.items()
]
lookalike_df = pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Lookalikes'])

In [None]:
# Save the lookalike data to CSV.
# The path is bound to a name first so the write and the echoed cell result
# below refer to the same location (the name was previously never assigned,
# which made the final line raise NameError).
lookalike_csv_path = 'Lookalike.csv'
lookalike_df.to_csv(lookalike_csv_path, index=False)

# Display the output location as the cell's result.
lookalike_csv_path