In [1]:
import pandas as pd

# Read the three raw CSV files (customers, products, transactions) into DataFrames.
# Paths are relative to the notebook's working directory.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

In [2]:
# Convert the date columns to datetime so they support date arithmetic
# (the signup-recency feature subtracts one from the other).
for frame, date_col in ((customers, 'SignupDate'), (transactions, 'TransactionDate')):
    frame[date_col] = pd.to_datetime(frame[date_col])

In [3]:
# Base profile table: one row per customer with Region one-hot encoded
# into Region_<name> indicator columns; CustomerID is kept as the join key.
customer_profiles = pd.get_dummies(
    customers[['CustomerID', 'Region']],
    columns=['Region'],
)

In [4]:
# Reference date is the most recent transaction in the data, so the feature
# does not depend on the day the notebook happens to be run.
current_date = transactions['TransactionDate'].max()

# Whole days elapsed between each customer's signup and the reference date.
customers['SignupRecency'] = (current_date - customers['SignupDate']).dt.days

# Drop any SignupRecency column left behind by a previous run of this cell
# before merging; otherwise a re-run produces SignupRecency_x / SignupRecency_y
# instead of a single column and silently changes the feature matrix.
customer_profiles = (
    customer_profiles
    .drop(columns='SignupRecency', errors='ignore')
    .merge(customers[['CustomerID', 'SignupRecency']], on='CustomerID')
)

In [5]:
# Total spend per customer: sum of TotalValue across that customer's transactions.
customer_spend = (
    transactions
    .groupby('CustomerID')['TotalValue']
    .sum()
    .rename('TotalSpend')
    .reset_index()
)

In [6]:
# Purchase frequency per customer: number of transaction rows for each CustomerID.
transactions_per_customer = transactions.groupby('CustomerID').size()
customer_frequency = transactions_per_customer.reset_index(name='PurchaseFrequency')

In [7]:
# Average order value per customer: mean TotalValue across that customer's transactions.
customer_avg_order = (
    transactions
    .groupby('CustomerID')['TotalValue']
    .mean()
    .rename('AvgOrderValue')
    .reset_index()
)

In [8]:
# Attach each transaction's product Category. The merge uses the default inner
# join, so transactions whose ProductID is absent from `products` are dropped.
product_categories = products[['ProductID', 'Category']]
transactions_products = transactions.merge(product_categories, on='ProductID')

In [9]:
# Count transactions per (customer, category) and pivot to one column per
# category; combinations that never occur are filled with 0.
category_counts = transactions_products.groupby(['CustomerID', 'Category']).size()
customer_categories = category_counts.unstack(fill_value=0).reset_index()

In [10]:
# Per-customer transaction feature tables, merged in this order so the
# resulting column order is: spend, frequency, average order, category counts.
transaction_feature_tables = [
    customer_spend,
    customer_frequency,
    customer_avg_order,
    customer_categories,
]

# Begin from a copy of the profile table (region dummies + signup recency).
customer_features = customer_profiles.copy()

# Left joins keep every profiled customer, including those with no transactions.
for feature_table in transaction_feature_tables:
    customer_features = customer_features.merge(feature_table, on='CustomerID', how='left')

# Customers missing from a feature table get NaN from the left join; treat
# those as zero, then index the final table by CustomerID.
customer_features = customer_features.fillna(0).set_index('CustomerID')

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with MinMaxScaler so that large-valued columns
# (e.g. TotalSpend) do not dominate the similarity computation.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(customer_features)

# fit_transform returns a bare array; restore the CustomerID index and column labels.
customer_features_scaled = pd.DataFrame(
    scaled_values,
    index=customer_features.index,
    columns=customer_features.columns,
)

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all scaled customer feature vectors,
# labelled with the customer index on both rows and columns.
pairwise_scores = cosine_similarity(customer_features_scaled)
customer_index = customer_features_scaled.index
similarity_matrix = pd.DataFrame(pairwise_scores, index=customer_index, columns=customer_index)

In [13]:
# Look up the N most similar customers for each requested customer
def get_top_n_similar(customers_list, similarity_matrix, n=3):
    """Return the ``n`` highest-scoring neighbours for each customer.

    Parameters
    ----------
    customers_list : iterable
        Customer labels to look up; each must be a row label (and a column
        label) of ``similarity_matrix``.
    similarity_matrix : pandas.DataFrame
        Square table of similarity scores with customer labels on both axes.
    n : int, default 3
        Number of neighbours to keep per customer.

    Returns
    -------
    dict
        Maps each requested customer to a list of ``(other_customer, score)``
        tuples ordered from most to least similar. The customer itself is
        never included in its own list.

    Raises
    ------
    KeyError
        If a requested customer is not a label of ``similarity_matrix``.
    """
    def _ranked_neighbours(customer_id):
        # Take the customer's row, remove the self-similarity entry,
        # then keep the n largest remaining scores.
        ranked = similarity_matrix.loc[customer_id].drop(customer_id).nlargest(n)
        return list(zip(ranked.index, ranked.values))

    return {customer_id: _ranked_neighbours(customer_id) for customer_id in customers_list}

# Get top 3 similar customers for CustomerID C0001 - C0200
# (range(1, 201) yields 200 zero-padded IDs).
# NOTE(review): confirm that all 200 customers, not just a subset, are wanted here.
customer_ids = list(map('C{:04d}'.format, range(1, 201)))
top_similars = get_top_n_similar(customer_ids, similarity_matrix, n=3)

In [19]:
# Prepare data for output: one wide row per customer with up to three
# (SimilarCustomerID<k>, SimilarityScore<k>) column pairs.
#
# The rows are built from `top_similars`, the dict produced by
# get_top_n_similar, which maps CustomerID -> [(similar_id, score), ...].
# No other lookalike mapping is defined in this notebook, so using it keeps
# the cell runnable on a fresh kernel.
# NOTE(review): this writes a row for every customer in `top_similars`;
# filter the dict first if only a subset should be exported.
output_data = []

for cust_id, similars in top_similars.items():
    entry = {'CustomerID': cust_id}
    # Ranks are 1-based in the column names.
    for rank, (similar_cust_id, score) in enumerate(similars, start=1):
        entry[f'SimilarCustomerID{rank}'] = similar_cust_id
        entry[f'SimilarityScore{rank}'] = score
    output_data.append(entry)

# Create DataFrame
lookalike_df = pd.DataFrame(output_data)

# Customers with fewer than three neighbours leave gaps; fill missing IDs
# with empty strings and missing scores with zeros.
lookalike_df = lookalike_df.fillna({'SimilarCustomerID1': '', 'SimilarityScore1': 0,
                                    'SimilarCustomerID2': '', 'SimilarityScore2': 0,
                                    'SimilarCustomerID3': '', 'SimilarityScore3': 0})

# Save the DataFrame to CSV (no index column).
lookalike_df.to_csv('Lookalike.csv', index=False)

In [18]:
# Spot-check one customer: list its nearest neighbours and their scores.
cust_id = 'C0001'
similars = top_similars[cust_id]

report_lines = [f"Top 3 similar customers for {cust_id}:"]
report_lines.extend(
    f"CustomerID: {similar_cust_id}, Similarity Score: {score:.4f}"
    for similar_cust_id, score in similars
)
print("\n".join(report_lines))

Top 3 similar customers for C0001:
CustomerID: C0112, Similarity Score: 0.9800
CustomerID: C0192, Similarity Score: 0.9754
CustomerID: C0118, Similarity Score: 0.9732


In [16]:
# Show the unscaled feature rows of the chosen customer followed by its
# neighbours, so the similarity scores can be sanity-checked by eye.
profile_ids = [cust_id, *(neighbour_id for neighbour_id, _score in similars)]
customer_features.loc[profile_ids]

Unnamed: 0_level_0,Region_Asia,Region_Europe,Region_North America,Region_South America,SignupRecency,TotalSpend,PurchaseFrequency,AvgOrderValue,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
C0001,False,False,False,True,902,3354.52,5.0,670.904,1.0,0.0,3.0,1.0
C0112,False,False,False,True,905,1959.51,3.0,653.17,1.0,0.0,2.0,0.0
C0192,False,False,False,True,835,2072.72,4.0,518.18,1.0,1.0,2.0,0.0
C0118,False,False,False,True,1071,3434.77,6.0,572.461667,2.0,1.0,2.0,1.0
