In [44]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist, euclidean
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [45]:
# Load the datasets.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
# `transactions` was used below without ever being loaded, so the notebook
# failed with a NameError on a fresh kernel (Restart & Run All).
# NOTE(review): the file name is assumed to follow the same pattern as the two
# files above -- confirm it matches the actual transactions file.
transactions = pd.read_csv('Transactions.csv')

# Merge datasets: attach product attributes, then customer attributes, to each
# transaction. Left joins keep every transaction row even when its ProductID or
# CustomerID has no match in the lookup table.
transactions_products = pd.merge(transactions, products, on='ProductID', how='left')
merged_data = pd.merge(transactions_products, customers, on='CustomerID', how='left')


In [46]:
# Collapse the merged transaction rows into one feature row per customer:
#   total_spent       - sum of TotalValue
#   total_quantity    - sum of Quantity
#   unique_categories - number of distinct Category values
#   unique_products   - number of distinct ProductID values
#   regions           - first Region value recorded for the customer
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        total_quantity=pd.NamedAgg(column='Quantity', aggfunc='sum'),
        unique_categories=pd.NamedAgg(column='Category', aggfunc='nunique'),
        unique_products=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
        regions=pd.NamedAgg(column='Region', aggfunc='first'),
    )
    .reset_index()
)


In [47]:
# One-hot encode the categorical `regions` column so it can be used as a
# numeric feature. drop_first=True omits the indicator for the first region
# level, which is then represented by all remaining indicators being 0.
customer_features = pd.get_dummies(
    data=customer_features,
    columns=['regions'],
    drop_first=True,
)


In [48]:

# Standardise every feature column; CustomerID is an identifier, not a
# feature, so it is removed before scaling. Row order is unchanged, so row i
# of `features_scaled` corresponds to row i of `customer_features`.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(
    customer_features.drop('CustomerID', axis=1)
)

In [49]:
# Pairwise customer-to-customer similarity on the scaled features.
# Entry [i][j] of each matrix compares customer row i with customer row j.
cos_sim_matrix = cosine_similarity(features_scaled)

# All pairwise Euclidean distances in a single vectorised call, replacing a
# nested Python comprehension that invoked scipy's `euclidean` once per pair.
euclidean_distances = cdist(features_scaled, features_scaled, metric='euclidean')

# Map distance to a similarity in (0, 1]: distance 0 -> 1, and larger
# distances approach 0.
euclidean_sim_matrix = 1 / (1 + euclidean_distances)


In [50]:
# Build the lookalike table: for each of the first N_TARGET_CUSTOMERS
# customers, keep the TOP_N other customers with the highest combined score.
N_TARGET_CUSTOMERS = 20  # Consider the first 20 customers
TOP_N = 3                # Lookalikes reported per customer

customer_ids = customer_features['CustomerID'].to_numpy()

# Combined score = plain average of the cosine and Euclidean-based similarities.
combined_sim_matrix = (cos_sim_matrix + euclidean_sim_matrix) / 2

lookalike_list = []
for i, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Rank all customers by descending combined score. A stable sort keeps the
    # original customer order among ties.
    ranked = np.argsort(-combined_sim_matrix[i], kind='stable')
    # Exclude self.
    ranked = ranked[customer_ids[ranked] != customer_id]

    # Slicing (rather than indexing 0..2) avoids an IndexError when fewer than
    # TOP_N other customers exist.
    for j in ranked[:TOP_N]:
        # Report the individual scores of the SAME pair (i, j). Previously the
        # cosine/euclidean columns were read from separately sorted lists that
        # still contained the customer itself, so they did not describe
        # `similar_cust_id` and the first row always showed the self-match.
        lookalike_list.append({
            "cust_id": customer_id,
            "similar_cust_id": customer_ids[j],
            "combined_similarity": combined_sim_matrix[i, j],
            "cosine_similarity": cos_sim_matrix[i, j],
            "euclidean_similarity": euclidean_sim_matrix[i, j]
        })

# Save the results to Lookalike.csv
lookalike_df = pd.DataFrame(lookalike_list)
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv created ")


Lookalike.csv created 
