In [13]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the three input tables from the current working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Join transactions to their product and customer rows. Both joins are inner
# joins, so transactions without a matching ProductID or CustomerID are dropped.
# The merged table must provide CustomerID, TransactionID, TotalValue and
# Category for the aggregation below.
merged_data = pd.merge(transactions, products, on='ProductID', how='inner')
merged_data = pd.merge(merged_data, customers[['CustomerID', 'Region']], on='CustomerID', how='inner')

# One row per customer: total spend, number of transactions, mean transaction value.
customer_profile = merged_data.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    purchase_count=('TransactionID', 'count'),
    avg_transaction_value=('TotalValue', 'mean')
).reset_index()


def _most_frequent(values):
    """Return the most frequent non-null value in *values*, or NaN if there is none.

    ``Series.mode()`` returns an empty Series when every value is null, so the
    first element is only read when it exists. Ties resolve to the first entry
    of the (sorted) mode result.
    """
    modes = values.mode()
    if modes.empty:
        return float('nan')
    return modes.iloc[0]


# Attach each customer's most frequent category by CustomerID (not by row
# position), so the column stays correct regardless of row ordering.
most_frequent_category = merged_data.groupby('CustomerID')['Category'].agg(_most_frequent)
customer_profile['most_frequent_category'] = customer_profile['CustomerID'].map(most_frequent_category)

# Add customer region to the profile
customer_profile = pd.merge(customer_profile, customers[['CustomerID', 'Region']], on='CustomerID')

# Numeric features used for the similarity computation
features = ['total_spent', 'purchase_count', 'avg_transaction_value']

# Replace any missing feature value with that column's mean
customer_profile[features] = customer_profile[features].fillna(customer_profile[features].mean())

# Rescale every feature column (StandardScaler defaults: zero mean, unit
# variance) so the features are on a comparable scale before comparing customers.
scaler = StandardScaler()
scaler.fit(customer_profile[features])
X = scaler.transform(customer_profile[features])

# Pairwise cosine similarity between all customer rows; entry [a][b] compares
# row a of customer_profile with row b.
similarity_matrix = cosine_similarity(X)

# Build the top-3 lookalikes for customers C0001 - C0020.
# top_lookalikes maps a CustomerID to a list of up to three
# (other CustomerID, similarity score) tuples, best match first.
top_lookalikes = {}

# Row r of similarity_matrix corresponds to row r of customer_profile. Customers
# without any transactions never make it into customer_profile, so a customer's
# row must be looked up by ID instead of being derived from its number.
profile_ids = customer_profile['CustomerID'].tolist()
row_by_customer = {cid: row for row, cid in enumerate(profile_ids)}

for i in range(20):  # For customers C0001 to C0020
    customer_id = f'C{i+1:04d}'  # Generate customer ID like C0001, C0002, ...

    row = row_by_customer.get(customer_id)
    if row is None:
        # No profile for this customer (e.g. no transactions): nothing to compare.
        top_lookalikes[customer_id] = []
        continue

    similarity_scores = similarity_matrix[row]  # Scores of this customer vs. everyone

    # Pair every other customer with its similarity score (self is excluded)
    similar_customers = [
        (other_id, similarity_scores[j])
        for j, other_id in enumerate(profile_ids)
        if other_id != customer_id
    ]

    # Highest score first; the sort is stable, so ties keep profile order
    similar_customers.sort(key=lambda x: x[1], reverse=True)
    top_lookalikes[customer_id] = similar_customers[:3]

# Flatten the per-customer lookalike lists into one record per
# (customer, lookalike) pair, then write the records to Lookalike.csv.
lookalike_data = [
    {'CustomerID': customer_id, 'LookalikeCustomerID': similar_customer, 'SimilarityScore': score}
    for customer_id, lookalikes in top_lookalikes.items()
    for similar_customer, score in lookalikes
]

lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv has been created.")


Lookalike.csv has been created.
