In [14]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [2]:
## Loading the datasets
# Single place to change where the three input CSV files live. The default is
# the original author's local download folder; edit it to run elsewhere.
DATA_DIR = "C:/Users/Rahul Thakur/Downloads"

customers_df = pd.read_csv(f"{DATA_DIR}/Customers.csv")
products_df = pd.read_csv(f"{DATA_DIR}/Products.csv")
transactions_df = pd.read_csv(f"{DATA_DIR}/Transactions.csv")

In [3]:

# Merge transactions with product data to get product category information.
# This cell overwrites `transactions_df`, so any 'Category' column left over
# from a previous run is dropped first; otherwise re-running the cell would
# produce 'Category_x'/'Category_y' and the later groupby on 'Category' would fail.
transactions_df = pd.merge(
    transactions_df.drop(columns=['Category'], errors='ignore'),
    products_df[['ProductID', 'Category']],
    on='ProductID',
    how='left',  # keep every transaction, even if its ProductID has no match
)

In [4]:
# Step 1: Per-customer totals -- summed 'TotalValue' and number of transactions.
# (Category-wise spending is computed separately from this table.)
transactions_by_customer = transactions_df.groupby('CustomerID')
customer_transactions = transactions_by_customer.agg(
    total_spend=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    transaction_count=pd.NamedAgg(column='TransactionID', aggfunc='count'),
).reset_index()  # turn CustomerID back into a regular column

In [5]:
# Spend per (customer, category) pair, pivoted so that each category becomes
# its own column; customer/category pairs with no purchases are filled with 0.
spend_by_customer_category = transactions_df.groupby(['CustomerID', 'Category'])['TotalValue'].sum()
category_spend = spend_by_customer_category.unstack().fillna(0)

# Build the customer profile: region + overall totals + per-category spend.
# Both merges use the default join on 'CustomerID', so only customers present
# in every table end up in the profile.
customer_profile = (
    customers_df[['CustomerID', 'Region']]
    .merge(customer_transactions, on='CustomerID')
    .merge(category_spend, on='CustomerID')
)


In [6]:
# Step 2: Standardize the numeric features in place
# (total_spend, transaction_count and every per-category spend column).
numeric_cols = ['total_spend', 'transaction_count', *category_spend.columns]

scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_profile[numeric_cols])
customer_profile[numeric_cols] = scaled_values


In [7]:
# Step 3: One-hot encode the 'Region' feature
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the new spelling first and fall back only on
# releases that do not know it yet.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)
region_encoded = encoder.fit_transform(customer_profile[['Region']])
# drop='first' discards the first category, so the remaining categories are
# exactly the names of the encoded columns.
region_columns = encoder.categories_[0][1:]

# Reuse customer_profile's index so the concat below aligns row by row.
region_df = pd.DataFrame(region_encoded, columns=region_columns, index=customer_profile.index)
# Drop dummy columns left by a previous run of this cell so that re-running it
# does not append duplicate region columns.
customer_profile = pd.concat(
    [customer_profile.drop(columns=list(region_columns), errors='ignore'), region_df],
    axis=1,
)




In [8]:
# Step 4: Names of the columns that make up the feature matrix:
# the two overall totals, every per-category spend column, and the region dummies.
features = [
    'total_spend',
    'transaction_count',
    *category_spend.columns,
    *region_df.columns,
]


In [9]:
# Feature matrix: all rows of the customer profile, restricted to the feature columns.
customer_features = customer_profile.loc[:, features]


In [10]:
# Step 5: Pairwise cosine similarity between customers.
# Entry [a, b] compares rows a and b of `customer_features`.
similarity_matrix = cosine_similarity(X=customer_features)


In [11]:
# Step 6: Get the top 3 similar customers for each of the first 20 customers
# in customers_df (assumed to be C0001 to C0020 -- confirm against the data).
N_TARGET_CUSTOMERS = 20  # how many leading rows of customers_df to process
TOP_K = 3                # lookalikes kept per customer

# Rows of `similarity_matrix` follow `customer_profile`, not `customers_df`:
# the profile is built by merging with transaction data, so a customer without
# transactions has no row and would shift every later position. Map IDs to
# matrix rows through the profile instead of relying on positional alignment.
profile_ids = customer_profile['CustomerID'].to_numpy()
row_of_customer = {cid: row for row, cid in enumerate(profile_ids)}

lookalike_dict = {}

for customer_id in customers_df['CustomerID'][:N_TARGET_CUSTOMERS]:
    row = row_of_customer.get(customer_id)
    if row is None:
        # No feature vector for this customer, so no lookalikes can be computed.
        lookalike_dict[customer_id] = []
        continue

    # Similarity of the current customer to every customer in the profile
    similarity_scores = similarity_matrix[row]

    # Rank by descending similarity and remove the customer explicitly; with
    # ties at the top the customer is not guaranteed to be ranked first.
    ranked = similarity_scores.argsort()[::-1]
    similar_indices = ranked[ranked != row][:TOP_K]

    # (CustomerID, similarity score) pairs for the top matches
    similar_customers = [(profile_ids[j], similarity_scores[j]) for j in similar_indices]

    lookalike_dict[customer_id] = similar_customers

In [12]:
# Step 7: Flatten the lookalike mapping into one row per
# (customer, lookalike, score) triple so it can be exported as a table.
lookalike_data = [
    [cust_id, lookalike_id, score]
    for cust_id, similar_customers in lookalike_dict.items()
    for lookalike_id, score in similar_customers
]

lookalike_columns = ['CustomerID', 'LookalikeCustomerID', 'SimilarityScore']
lookalike_df = pd.DataFrame(lookalike_data, columns=lookalike_columns)

In [13]:
# Step 8: Write the recommendations to a CSV file in the working directory
# (the DataFrame index is not written).
output_file = 'Lookalike.csv'
lookalike_df.to_csv(output_file, index=False)

print("Lookalike recommendations have been saved to 'Lookalike.csv'.")


Lookalike recommendations have been saved to 'Lookalike.csv'.
