# In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer


# Load the three input tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Enrich every transaction row with its customer and product attributes.
# Both merges use pandas' default inner join, so a transaction is kept only
# when its CustomerID and ProductID both have a match.
data = pd.merge(
    pd.merge(transactions, customers, on='CustomerID'),
    products,
    on='ProductID',
)

# One-hot encode the product category of every merged transaction row.
# MultiLabelBinarizer expects an iterable of labels per sample, so each
# category value is first wrapped in a single-element list.
data['CategoryList'] = data['Category'].map(lambda category: [category])
mlb = MultiLabelBinarizer()
category_matrix = mlb.fit_transform(data['CategoryList'])
category_df = pd.DataFrame(data=category_matrix, columns=mlb.classes_)

# Attach the indicator columns to the customer IDs (joined on the row index)
# and add them up per customer: each row of data_grouped holds the number of
# transaction rows that customer has in each category.
data_with_category = data[['CustomerID']].join(other=category_df)
data_grouped = data_with_category.groupby(by='CustomerID').sum()

# Pairwise cosine similarity between the per-customer category-count rows;
# rows and columns follow the order of data_grouped.index.
cos_sim = cosine_similarity(data_grouped)


# Find the most similar customers for the first NUM_TARGET_CUSTOMERS customer
# IDs, which are generated as C0001, C0002, ...
NUM_TARGET_CUSTOMERS = 20
TOP_N_LOOKALIKES = 3

# Maps a target CustomerID to a list of (lookalike CustomerID, similarity)
# tuples, ordered from most to least similar.
lookalikes = {}
for customer_number in range(1, NUM_TARGET_CUSTOMERS + 1):
    customer_id = f'C{customer_number:04d}'

    # A customer ID that has no row in data_grouped (for example because it
    # has no merged transaction rows) cannot be scored; skip it with a notice
    # instead of letting Index.get_loc raise KeyError and abort the whole run.
    if customer_id not in data_grouped.index:
        print(f"Skipping {customer_id}: no purchase profile available.")
        continue

    customer_idx = data_grouped.index.get_loc(customer_id)
    sim_scores = cos_sim[customer_idx]

    # Pair every other customer with its similarity to the target, leaving
    # out the target itself (its self-similarity would always rank first).
    similar_customers = [
        (other_id, other_score)
        for other_id, other_score in zip(data_grouped.index, sim_scores)
        if other_id != customer_id
    ]

    # sorted() is stable, so customers with equal scores keep index order.
    similar_customers = sorted(
        similar_customers, key=lambda pair: pair[1], reverse=True
    )[:TOP_N_LOOKALIKES]

    lookalikes[customer_id] = similar_customers

# Flatten the {customer: [(lookalike, score), ...]} mapping into one
# [CustomerID, LookalikeID, SimilarityScore] row per recommended lookalike.
lookalike_list = []
for customer_id, similar_customers in lookalikes.items():
    lookalike_list.extend(
        [customer_id, match_id, match_score]
        for match_id, match_score in similar_customers
    )

result_columns = ['CustomerID', 'LookalikeID', 'SimilarityScore']
lookalike_df = pd.DataFrame(data=lookalike_list, columns=result_columns)

# Persist the recommendations without the DataFrame's positional index.
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike model generated and saved to 'Lookalike.csv'.")
