In [1]:
import csv
from datetime import datetime

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the three raw tables from CSV files in the working directory.
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

# Attach customer and product attributes to every transaction row.
# Left joins keep each transaction even if its CustomerID/ProductID has no match.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='left')
    .merge(products, on='ProductID', how='left')
)

Creating customer profile features and transaction behavior features.

In [3]:
# Build one row of features per customer from the merged transaction table.
# A single groupby object is shared by the three numeric aggregates.
spend_by_customer = merged_data.groupby('CustomerID')

# Feature 1: TotalSpent - sum of TotalValue over the customer's transactions
total_spent = spend_by_customer['TotalValue'].sum().reset_index(name='TotalSpent')

# Feature 2: TransactionFrequency - number of (non-null) TransactionIDs
transaction_freq = spend_by_customer['TransactionID'].count().reset_index(name='TransactionFrequency')

# Feature 3: AvgOrderValue - mean TotalValue per transaction
avg_order_value = spend_by_customer['TotalValue'].mean().reset_index(name='AvgOrderValue')

# Feature 4: FavoriteCategory - the category with the most transactions.
# idxmax returns the first row holding the maximum, so a tie goes to the
# category that appears first in category_counts for that customer.
category_counts = merged_data.groupby(['CustomerID', 'Category']).size().reset_index(name='Count')
favorite_rows = category_counts.groupby('CustomerID')['Count'].idxmax()
favorite_category = (
    category_counts
    .loc[favorite_rows, ['CustomerID', 'Category']]
    .rename(columns={'Category': 'FavoriteCategory'})
)

# Feature 5: Tenure (days between SignupDate and the most recent SignupDate).
# The reference point comes from the data instead of the wall clock, so
# re-running the notebook on the same files reproduces the same values.
# Tenure is min-max scaled before use, so only the differences between
# customers matter and the choice of reference date does not affect them.
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
tenure_reference_date = customers['SignupDate'].max()
customers['Tenure'] = (tenure_reference_date - customers['SignupDate']).dt.days

# Merge all features onto the customer table. Left joins keep customers that
# have no transactions; their aggregate columns are NaN.
customer_features = (
    customers
    .merge(total_spent, on='CustomerID', how='left')
    .merge(transaction_freq, on='CustomerID', how='left')
    .merge(avg_order_value, on='CustomerID', how='left')
    .merge(favorite_category, on='CustomerID', how='left')
)

In [4]:
# Replace the two categorical columns with indicator (dummy) columns.
# drop_first=True omits the first level of each column, so that level is
# represented by all of its indicators being zero.
categorical_columns = ['Region', 'FavoriteCategory']
customer_features = pd.get_dummies(
    customer_features,
    columns=categorical_columns,
    drop_first=True,
)

In [5]:
# Rescale each numeric feature with MinMaxScaler so that no single column
# dominates the similarity computation purely because of its units.
numerical_features = ['TotalSpent', 'TransactionFrequency', 'AvgOrderValue', 'Tenure']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_features])
customer_features[numerical_features] = scaled_values

In [6]:
# Handle missing values: discard every row that has a NaN in any column.
customer_features = customer_features.dropna(axis=0, how='any')

# Identifier and descriptive columns are not inputs to the similarity model.
non_feature_columns = ['CustomerID', 'CustomerName', 'SignupDate']
feature_columns = customer_features.drop(columns=non_feature_columns)

# Pairwise cosine similarity between all remaining customers
# (one row and one column per row of feature_columns).
similarity_matrix = cosine_similarity(feature_columns)

In [7]:
# Label both axes of the similarity matrix with CustomerID so scores can be
# looked up by customer rather than by position.
customer_ids = customer_features['CustomerID']
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

In [8]:
# Function to get the top-N most similar customers (default 3)
def get_top_similar_customers(customer_id, similarity_df, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id : hashable
        Column label of the customer to look up in ``similarity_df``.
    similarity_df : pandas.DataFrame
        Square similarity table whose index and columns are customer IDs.
    top_n : int, default 3
        Maximum number of neighbours to return; values <= 0 give an empty list.

    Returns
    -------
    list of (customer_id, score) tuples
        Sorted by descending score, never containing ``customer_id`` itself.
        Scores are built-in ``float`` objects.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``similarity_df``.
    """
    # Remove the customer by label rather than assuming their own score sorts
    # first: an identical feature vector (score 1.0) or a self-score rounded
    # just below 1.0 would otherwise leave the customer in their own results.
    scores = similarity_df[customer_id].drop(labels=customer_id, errors='ignore')

    # Stable sort so that tied scores come out in a repeatable order.
    ranked = scores.sort_values(ascending=False, kind='stable')
    top_similarities = ranked.head(max(top_n, 0))

    # Built-in floats keep printed/serialized output independent of how the
    # installed NumPy version chooses to repr its scalar types.
    return [(other_id, float(score)) for other_id, score in top_similarities.items()]

# Demo: the three nearest neighbours of customer C0001 (top_n left at its default).
top_similar_customers = get_top_similar_customers(
    customer_id='C0001',
    similarity_df=similarity_df,
)
print(top_similar_customers)

[('C0192', 0.994281729689659), ('C0112', 0.9911771279314008), ('C0184', 0.9903750855502597)]


In [9]:
# Look up lookalikes for the customers in the first 20 rows of customer_features
# (positional: these are the rows that remain after the earlier dropna).
first_twenty_ids = customer_features['CustomerID'].head(20)
recommendations = {
    customer_id: get_top_similar_customers(customer_id, similarity_df)
    for customer_id in first_twenty_ids
}

# Output structure: {customer_id: [(similar_customer_id, score), ...]}
lookalike_data = {
    cust_id: list(sim_list)
    for cust_id, sim_list in recommendations.items()
}

In [10]:
# Save to CSV: one row per customer; the second column holds the string form
# of that customer's [(similar_customer_id, score), ...] list.
# newline='' lets the csv module control line endings itself.
with open('Lookalike.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['CustomerID', 'SimilarCustomers'])
    for cust_id, sim_list in lookalike_data.items():
        # The list is written via str(), which uses each element's repr.
        # Converting scores to built-in floats keeps the file free of
        # NumPy-specific reprs such as "np.float64(0.99)".
        plain_pairs = [(sim_cust, float(score)) for sim_cust, score in sim_list]
        writer.writerow([cust_id, plain_pairs])