In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load data
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")
products = pd.read_csv("Products.csv")

# Merge data
merged_data = pd.merge(transactions, customers, on="CustomerID")
merged_data = pd.merge(merged_data, products, on="ProductID", how="left") 

print("Columns in merged_data:", merged_data.columns)


if 'Price' in merged_data.columns:
    agg_dict = {
        'TotalValue': 'sum',
        'Quantity': 'sum',
        'Price': 'mean',
        'Region': 'first',
    }
else:
    print("Column 'Price' not found. Adjusting aggregation logic...")
    agg_dict = {
        'TotalValue': 'sum',
        'Quantity': 'sum',
        'Region': 'first',
    }

customer_features = merged_data.groupby('CustomerID').agg(agg_dict).reset_index()

print("Customer Features:\n", customer_features.head())

if 'Region' in customer_features.columns:
    customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)

scaler = StandardScaler()
numeric_features = customer_features.iloc[:, 1:]
normalized_features = scaler.fit_transform(numeric_features)

similarity_matrix = cosine_similarity(normalized_features)

def get_top_lookalikes(customer_index, top_n=3):
    """Find the top N lookalike customers for a given customer index."""
    similarity_scores = similarity_matrix[customer_index]
    top_indices = similarity_scores.argsort()[::-1][1:top_n+1]
    return [(customer_features.iloc[i]['CustomerID'], round(similarity_scores[i], 2)) for i in top_indices]

lookalike_results = {}
for idx in range(min(20, len(customer_features))):
    customer_id = customer_features.iloc[idx]['CustomerID']
    lookalike_results[customer_id] = get_top_lookalikes(idx)

lookalike_df = pd.DataFrame([
    {"CustomerID": cust_id, "Lookalikes": lookalikes}
    for cust_id, lookalikes in lookalike_results.items()
])
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike Results:\n", lookalike_df.head())


Columns in merged_data: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')
Column 'Price' not found. Adjusting aggregation logic...
Customer Features:
   CustomerID  TotalValue  Quantity         Region
0      C0001     3354.52        12  South America
1      C0002     1862.74        10           Asia
2      C0003     2725.38        14  South America
3      C0004     5354.88        23  South America
4      C0005     2034.24         7           Asia
Lookalike Results:
   CustomerID                                     Lookalikes
0      C0001     [(C0107, 1.0), (C0137, 1.0), (C0184, 1.0)]
1      C0002   [(C0088, 1.0), (C0142, 0.99), (C0159, 0.97)]
2      C0003    [(C0147, 1.0), (C0190, 1.0), (C0174, 0.98)]
3      C0004  [(C0113, 0.99), (C0102, 0.98), (C0169, 0.98)]
4      C0005    [(C0186, 1.0), (C0159, 1.0), (C0140, 0.99