In [1]:
import pandas as pd

# Read the three source tables from the working directory.
transactions = pd.read_csv('Transactions.csv')
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')

# Enrich every transaction row with its customer's attributes, then with its
# product's attributes. Both joins use pandas' default merge behaviour, so
# non-key columns present on both sides get the default _x/_y suffixes
# (visible as Price_x / Price_y in the preview below).
data = pd.merge(
    pd.merge(transactions, customers, on='CustomerID'),
    products,
    on='ProductID',
)

# Preview the first rows of the merged frame.
print(data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [6]:
# Step 2: Feature Engineering
# Aggregate customer transaction behavior
from sklearn.preprocessing import MinMaxScaler
# Per-customer purchasing behaviour: one row per CustomerID.
customer_features = data.groupby('CustomerID').agg(
    avg_transaction_value=('TotalValue', 'mean'),
    total_quantity=('Quantity', 'sum'),
    total_spent=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count')
).reset_index()

# One-hot encode product categories, then sum per customer so each
# Category_* column counts that customer's transaction rows in the category.
category_features = pd.get_dummies(data[['CustomerID', 'Category']], columns=['Category']).groupby('CustomerID').sum()

# One-hot encode regions (one row per customer in `customers`).
region_features = pd.get_dummies(customers[['CustomerID', 'Region']], columns=['Region'])

# Merge all features; these default merges keep only customers present in
# every feature table.
customer_features = customer_features.merge(category_features, on='CustomerID').merge(region_features, on='CustomerID')

# Normalize all features except CustomerID (the first column).
# The scaled values are floats, while several source columns are integer or
# dummy-typed. Writing them back in place with `.iloc[:, 1:] = ...` relies on
# implicit upcasting, which recent pandas versions deprecate, so the scaled
# frame is built explicitly and re-attached to the identifier column instead.
feature_columns = customer_features.columns[1:]
scaler = MinMaxScaler()
scaled_features = pd.DataFrame(
    scaler.fit_transform(customer_features[feature_columns].astype('float64')),
    columns=feature_columns,
    index=customer_features.index,
)
customer_features = pd.concat([customer_features[['CustomerID']], scaled_features], axis=1)

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Pairwise cosine similarity between customers, computed over every column
# after the first one (the first column holds CustomerID).
customer_ids = customer_features['CustomerID']
feature_matrix = customer_features.iloc[:, 1:]
similarity_matrix = cosine_similarity(feature_matrix)

# Wrap the raw matrix in a DataFrame labelled by CustomerID on both axes so
# that scores can be looked up by customer ID.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.267022  0.927161  0.917366  0.397469  0.870462   
C0002       0.267022  1.000000  0.358195  0.360631  0.872051  0.341788   
C0003       0.927161  0.358195  1.000000  0.884876  0.338675  0.919594   
C0004       0.917366  0.360631  0.884876  1.000000  0.377395  0.878691   
C0005       0.397469  0.872051  0.338675  0.377395  1.000000  0.329134   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.425756  0.508752  0.204098  0.249427  ...  0.942657  0.963312   
C0002       0.859161  0.423893  0.215560  0.332172  ...  0.217605  0.241688   
C0003       0.374572  0.482613  0.207258  0.306821  ...  0.850345  0.926094   
C0004       0.405042  0.626968  0.189934  0.335513  ...  0.921313  0.855769   
C0005  

In [9]:
# Map each of the first 20 customers to their 3 most similar customers as a
# list of (CustomerID, similarity score) tuples.
top_lookalikes = {}

for customer in customer_features['CustomerID'][:20]:
    # Remove the customer's own entry by label before ranking. Skipping the
    # first sorted position instead would be wrong whenever another customer
    # ties with (or, through rounding, edges out) the self-similarity score.
    similar_customers = similarity_df[customer].drop(labels=customer).nlargest(3)
    # Store plain Python floats so the printed / CSV representation does not
    # depend on how the installed NumPy version renders its scalar types.
    top_lookalikes[customer] = [
        (other_id, float(score)) for other_id, score in similar_customers.items()
    ]

# Save lookalikes to Vivek_Shinde_Lookalike.csv: one row per customer, with
# the Lookalikes cell holding the string form of the list of tuples.
import csv

with open('Vivek_Shinde_Lookalike.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['CustomerID', 'Lookalikes'])
    for key, value in top_lookalikes.items():
        writer.writerow([key, value])

print("Top 3 lookalikes for the first 20 customers:")
print(top_lookalikes)


Top 3 lookalikes for the first 20 customers:
{'C0001': [('C0190', 0.9781888059001674), ('C0048', 0.9780352189707089), ('C0112', 0.9697814096278696)], 'C0002': [('C0134', 0.982368582331819), ('C0106', 0.9803726659994487), ('C0159', 0.9717977789707041)], 'C0003': [('C0031', 0.9897139499940381), ('C0158', 0.9808906213474147), ('C0129', 0.9799176306375933)], 'C0004': [('C0113', 0.9871348783305397), ('C0012', 0.9790985357357634), ('C0104', 0.9725606963177563)], 'C0005': [('C0007', 0.9940970926643758), ('C0146', 0.9832411149217579), ('C0186', 0.9755871050965115)], 'C0006': [('C0187', 0.9882916308519284), ('C0011', 0.958010258699253), ('C0085', 0.9578973985453871)], 'C0007': [('C0005', 0.9940970926643758), ('C0140', 0.9801785123102446), ('C0115', 0.9771279324630998)], 'C0008': [('C0098', 0.9648222565161969), ('C0194', 0.9545173966367587), ('C0059', 0.9524379580307134)], 'C0009': [('C0198', 0.9730500393437196), ('C0062', 0.9507153265596022), ('C0010', 0.9328193029992267)], 'C0010': [('C0111', 

In [12]:
# Function to recommend the most similar users (top 3 by default)
def recommend_top_similar_users(user_id, top_n=3):
    """Print the customers most similar to ``user_id``.

    Similarity scores are read from the module-level ``similarity_df`` and
    each match's details are looked up in the module-level ``customers``
    frame.

    Parameters
    ----------
    user_id : str
        Customer ID to look up; must be a label in ``similarity_df.index``.
    top_n : int, default 3
        Number of similar customers to print.

    Returns
    -------
    None
        Results are printed. An unknown ``user_id`` prints an error message
        and returns without raising.
    """
    if user_id not in similarity_df.index:
        print("Invalid UserID. Please provide a valid UserID from the dataset.")
        return

    # Exclude the user by label instead of skipping the first sorted entry:
    # with tied scores the user is not guaranteed to sort first, which would
    # list them as their own match and drop a genuine one.
    similar_customers = similarity_df[user_id].drop(labels=user_id).nlargest(top_n)

    print(f"\nTop {top_n} similar users for UserID {user_id}:\n")

    for similar_user, score in similar_customers.items():
        print(f"UserID: {similar_user}, Similarity Score: {score:.2f}")
        # Display the matching row(s) from the customers table
        user_data = customers[customers['CustomerID'] == similar_user]
        print("User Details:")
        print(user_data.to_string(index=False))
        print("-" * 40)

# Ask interactively for the customer to look up; surrounding whitespace is
# stripped from the typed value.
prompt_text = "Enter a UserID to find similar users (e.g., 'C0001'): "
input_user_id = input(prompt_text).strip()

# Print the closest matches for the entered customer.
recommend_top_similar_users(input_user_id)


Top 3 similar users for UserID C0001:

UserID: C0190, Similarity Score: 0.98
User Details:
CustomerID   CustomerName        Region SignupDate
     C0190 Alexander Lowe South America 2024-05-31
----------------------------------------
UserID: C0048, Similarity Score: 0.98
User Details:
CustomerID CustomerName        Region SignupDate
     C0048 Matthew Park South America 2024-11-07
----------------------------------------
UserID: C0112, Similarity Score: 0.97
User Details:
CustomerID CustomerName        Region SignupDate
     C0112 Brian Adkins South America 2022-07-07
----------------------------------------
