In [1]:
import pandas as pd

# Read the three input tables (customers, products, transactions) from the
# working directory, in that order.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [2]:
# Attach customer attributes, then product attributes, to every transaction.
# Left joins keep each transaction row even when a lookup key has no match.
merged_df = (
    transactions_df
    .merge(customers_df, on='CustomerID', how='left')
    .merge(products_df, on='ProductID', how='left')
)

In [3]:
# Collapse the merged transactions to one row per customer.
customer_features = (
    merged_df
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),           # total spending
        Quantity=('Quantity', 'sum'),               # total quantity purchased
        TransactionID=('TransactionID', 'count'),   # number of transactions
        Region=('Region', 'first'),                 # region of the customer
    )
    .reset_index()
)

In [4]:
# One-hot encode the 'Region' column, keeping one indicator per region.
# These columns feed a similarity computation, not a regression: with
# drop_first=True the dropped region becomes an all-zero vector, so two
# customers from that region would get no credit for sharing it while every
# other region would. dtype=float keeps the feature columns uniformly numeric.
customer_features = pd.get_dummies(
    customer_features,
    columns=['Region'],
    drop_first=False,
    dtype=float,
)

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric features so that no single column dominates the
# similarity computation purely because of its scale.
numeric_cols = ['TotalValue', 'Quantity', 'TransactionID']
scaler = StandardScaler()
customer_features[numeric_cols] = scaler.fit_transform(customer_features[numeric_cols])

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between customers, computed on every column
# except the CustomerID identifier. Row/column i of the result corresponds
# to row i of customer_features.
feature_matrix = customer_features.drop(columns=['CustomerID'])
similarity_matrix = cosine_similarity(feature_matrix)

In [8]:
def get_top_similar_customers(customer_id, similarity_matrix, top_n=3, features=None):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id
        Value to look up in the ``CustomerID`` column of ``features``.
    similarity_matrix
        Square, indexable matrix in which row/column ``i`` corresponds to the
        ``i``-th row (by position) of ``features``.
    top_n : int, default 3
        Maximum number of lookalikes to return. Values below zero are treated
        as zero.
    features : pandas.DataFrame, optional
        Frame holding the ``CustomerID`` column. When omitted, the
        notebook-level ``customer_features`` frame is used.

    Returns
    -------
    list of (CustomerID, float)
        Lookalikes ordered by descending similarity; ties keep row order.
        The queried customer is never part of the result.

    Raises
    ------
    IndexError
        If ``customer_id`` does not occur in the ``CustomerID`` column.
    """
    if features is None:
        features = customer_features

    customer_ids = features['CustomerID']

    # Work with the row *position*, not the index label: the similarity matrix
    # is positional, and the two only coincide for a default RangeIndex.
    matches = (customer_ids == customer_id).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in features")
    customer_pos = int(matches[0])

    # Drop the customer explicitly instead of assuming it sorts first: another
    # customer with an equal (or, through rounding, marginally higher) score
    # could otherwise take slot 0 and the customer would be returned as its
    # own lookalike.
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity_matrix[customer_pos])
        if pos != customer_pos
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Read the ID from the column rather than from a whole row so its value is
    # not coerced by the other columns' dtypes.
    return [
        (customer_ids.iloc[pos], float(score))
        for pos, score in candidates[:max(top_n, 0)]
    ]

In [9]:
# The first 20 customers, in the row order of customer_features
first_20_customers = customer_features['CustomerID'].head(20)

# Map each of those customers to its list of (CustomerID, score) lookalikes
lookalike_map = {
    cid: get_top_similar_customers(cid, similarity_matrix)
    for cid in first_20_customers
}

In [10]:
# Convert the dictionary to a two-column DataFrame: one row per customer, with
# that customer's list of (CustomerID, score) lookalikes.
# Scores are cast to built-in floats first: the list cells are written to the
# CSV via their string form, and NumPy scalars would otherwise appear there as
# "np.float64(...)" on NumPy 2.x instead of plain numbers.
lookalike_rows = [
    (customer_id, [(other_id, float(score)) for other_id, score in lookalikes])
    for customer_id, lookalikes in lookalike_map.items()
]
lookalike_df = pd.DataFrame(lookalike_rows, columns=['CustomerID', 'Lookalikes'])

# Save to CSV
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)