In [1]:
import pandas as pd
# Folder that holds the three input CSV files. Kept in one place so the
# dataset can be relocated by editing a single line instead of three paths.
DATA_DIR = 'C:/Users/sunit/Desktop/eCommerce_Transaction Dataset/data'

# Raw input tables, read as-is with pandas' default CSV parsing.
customers = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions = pd.read_csv(f'{DATA_DIR}/Transactions.csv')


In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [7]:

# Attach product and customer attributes to every transaction.
# Both joins use pandas' default (inner) merge on the shared key column.
merged = (
    transactions
    .merge(products, on='ProductID')
    .merge(customers, on='CustomerID')
)

# The merge leaves two suffixed price columns: 'Price_x' (left side,
# transactions) and 'Price_y' (right side, products). Discard the
# transaction-side copy and expose the product-side one as plain 'Price'.
merged = merged.drop(columns=['Price_x']).rename(columns={'Price_y': 'Price'})

# Sanity check: the column listing should contain a single 'Price' column.
print(merged.columns)

# Build one feature row per customer from the merged transaction table.
def _most_frequent(values):
    """Return the first entry of ``values.mode()`` (used for the top category)."""
    return values.mode()[0]


# Column -> aggregation applied within each customer's transactions.
aggregations = {
    'TotalValue': 'sum',         # total spend
    'Quantity': 'sum',           # total units bought
    'Price': 'mean',             # mean product price across transactions
    'Category': _most_frequent,  # most purchased category
    'Region': 'first',           # region taken from the customer's first row
}
customer_features = merged.groupby('CustomerID').agg(aggregations).reset_index()

# One-hot encode the two categorical columns so every feature is numeric/boolean.
customer_features = pd.get_dummies(customer_features, columns=['Category', 'Region'])

# Preview the encoded feature table.
print(customer_features.head())

# Everything except the CustomerID key is a model feature.
feature_frame = customer_features.drop(columns=['CustomerID'])

# Standardise the feature columns so no single column dominates the similarity.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)

# Pairwise cosine similarity between customer rows; row/column order follows
# the row order of customer_features.
similarity_matrix = cosine_similarity(scaled_features)

# Get the top-N lookalikes for each customer.
#
# lookalikes maps CustomerID -> list of (other CustomerID, similarity score),
# ordered from most to least similar and never containing the customer itself.
TOP_N = 3

# Pull the IDs out once as a plain list: positional access is then independent
# of the DataFrame's index labels and avoids a pandas lookup per matrix cell.
customer_ids = customer_features['CustomerID'].tolist()

lookalikes = {}
for idx, customer_id in enumerate(customer_ids):
    scores = similarity_matrix[idx]
    # Stable argsort on the negated scores = descending order, with ties kept
    # in original row order (same tie-breaking as a stable reverse sort).
    ranked = (-scores).argsort(kind='stable')
    # The customer's own row may or may not be among the first TOP_N + 1
    # entries; filter it out explicitly, then trim to TOP_N.
    nearest = [j for j in ranked[:TOP_N + 1] if j != idx][:TOP_N]
    similar_customers = [(customer_ids[j], scores[j]) for j in nearest]
    lookalikes[customer_id] = similar_customers

# Output results for customers C0001 - C0020.
# Build the target IDs once (as a set) instead of regenerating the list for
# every customer inside the comprehension.
TARGET_IDS = {f'C{i:04d}' for i in range(1, 21)}

# Keep customer_features row order; target IDs with no feature row are skipped.
result = {
    cust_id: lookalikes[cust_id]
    for cust_id in customer_features['CustomerID']
    if cust_id in TARGET_IDS
}

# Save to CSV. Each Lookalikes cell is written as the string form of a list of
# (CustomerID, score) tuples. Scores are cast to built-in float first so the
# text is plain numbers regardless of how the installed NumPy version prints
# its scalar types (NumPy >= 2 would otherwise emit "np.float64(...)").
lookalike_rows = [
    [(other_id, float(score)) for other_id, score in pairs]
    for pairs in result.values()
]
pd.DataFrame({'CustomerID': list(result), 'Lookalikes': lookalike_rows}).to_csv('Lookalike.csv', index=False)

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'ProductName', 'Category', 'Price',
       'CustomerName', 'Region', 'SignupDate'],
      dtype='object')
  CustomerID  TotalValue  Quantity       Price  Category_Books  \
0      C0001     3354.52        12  278.334000           False   
1      C0002     1862.74        10  208.920000           False   
2      C0003     2725.38        14  195.707500           False   
3      C0004     5354.88        23  240.636250            True   
4      C0005     2034.24         7  291.603333           False   

   Category_Clothing  Category_Electronics  Category_Home Decor  Region_Asia  \
0              False                  True                False        False   
1               True                 False                False         True   
2              False                 False                 True        False   
3              False                 False                False        Fal