In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Load the three raw tables from the working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Inner joins: only transactions whose CustomerID and ProductID both have a
# match in the other tables survive.
merged_df = pd.merge(transactions, customers, on='CustomerID', how='inner')
merged_df = pd.merge(merged_df, products, on='ProductID', how='inner')

# One row per customer: total spend and total units bought.
customer_features = (
    merged_df
    .groupby('CustomerID')
    .agg({'TotalValue': 'sum', 'Quantity': 'sum'})
    .reset_index()
)

# Attach each customer's Region and one-hot encode it; drop_first=True omits
# the dummy column for the first region level.
customer_features = pd.merge(
    customer_features,
    customers.loc[:, ['CustomerID', 'Region']],
    on='CustomerID',
    how='left',
)
customer_features = pd.get_dummies(
    customer_features, columns=['Region'], drop_first=True
)

# Scale every feature column (everything except the ID) before comparing.
# NOTE(review): StandardScaler is not imported in this cell - presumably
# sklearn.preprocessing.StandardScaler from an earlier cell; confirm.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.drop('CustomerID', axis=1))

# Pairwise cosine similarity between customers; row/column order follows
# the row order of customer_features.
similarity_matrix = cosine_similarity(scaled_features)

# Row i of similarity_matrix corresponds to customer_ids[i]: both follow the
# row order of customer_features.
customer_ids = customer_features['CustomerID'].tolist()
top_lookalikes = {}

# Build the top-3 lookalikes for the first 20 customers in customer_ids.
for i, cust_id in enumerate(customer_ids[:20]):
    # Indices of all customers, most similar first.
    ranked = similarity_matrix[i].argsort()[::-1]
    # Exclude the customer itself by index rather than assuming it is ranked
    # first: another customer with (near-)identical features can tie at 1.0
    # and sort ahead of it, which would otherwise drop a real lookalike and
    # list the customer as its own lookalike.
    similar_indices = [j for j in ranked if j != i][:3]
    top_lookalikes[cust_id] = [
        (customer_ids[j], round(float(similarity_matrix[i][j]), 4))
        for j in similar_indices
    ]

# Shallow copy kept under the name used when writing and printing the result.
lookalike_data = dict(top_lookalikes)

# Write the lookalike map to disk: a header line followed by one
# "<cust_id>, <list of (cust_id, score) tuples>" line per customer.
# NOTE(review): the list repr contains unquoted commas, so the rows are not
# strictly parseable as two-column CSV - confirm this format is intended.
with open('Lookalike.csv', 'w') as f:
    f.write("Map<cust_id, List<cust_id, score>>\n")
    f.writelines(
        f"{cust_id}, {lookalikes}\n"
        for cust_id, lookalikes in lookalike_data.items()
    )

# Echo the same mapping to the cell output.
print(lookalike_data)

{'C0001': [('C0107', 0.9964), ('C0137', 0.9957), ('C0184', 0.9956)], 'C0002': [('C0088', 0.9961), ('C0142', 0.9882), ('C0159', 0.9732)], 'C0003': [('C0147', 0.9977), ('C0190', 0.997), ('C0174', 0.982)], 'C0004': [('C0113', 0.9943), ('C0102', 0.9792), ('C0169', 0.9785)], 'C0005': [('C0186', 0.997), ('C0159', 0.9965), ('C0140', 0.9912)], 'C0006': [('C0048', 0.9934), ('C0126', 0.9911), ('C0187', 0.9903)], 'C0007': [('C0146', 1.0), ('C0178', 0.9944), ('C0177', 0.9936)], 'C0008': [('C0018', 0.9837), ('C0122', 0.9613), ('C0046', 0.955)], 'C0009': [('C0198', 1.0), ('C0014', 0.9966), ('C0063', 0.994)], 'C0010': [('C0019', 0.9905), ('C0073', 0.9846), ('C0166', 0.9825)], 'C0011': [('C0107', 0.9951), ('C0048', 0.9949), ('C0001', 0.9887)], 'C0012': [('C0148', 0.9958), ('C0163', 0.9936), ('C0155', 0.9891)], 'C0013': [('C0163', 0.9963), ('C0148', 0.9941), ('C0155', 0.9916)], 'C0014': [('C0060', 0.9994), ('C0009', 0.9966), ('C0198', 0.9962)], 'C0015': [('C0020', 0.9987), ('C0058', 0.9896), ('C0033', 