In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Merge transactions with product details
transactions = transactions.merge(products, on='ProductID', how='left')

# Feature engineering: Aggregate data at the customer level
customer_features = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spend
    'Quantity': 'sum',  # Total quantity purchased
    'Price': 'mean',  # Average price of products purchased
    'Category': lambda x: x.mode()[0] if len(x.mode()) > 0 else 'Unknown'  # Most frequent category
}).reset_index()

# One-hot encode the product categories
customer_features = pd.get_dummies(customer_features, columns=['Category'])

# Standardize numerical features
scaler = StandardScaler()
numerical_cols = ['TotalValue', 'Quantity', 'Price']
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])

# Compute cosine similarity matrix
feature_matrix = customer_features.drop(columns=['CustomerID']).values
similarity_matrix = cosine_similarity(feature_matrix)

# Find top 3 lookalikes for each customer
def get_top_lookalikes(similarity_matrix, customer_ids, top_n=3):
    lookalikes = {}
    for idx, customer_id in enumerate(customer_ids):
        # Get similarity scores for the current customer
        scores = list(enumerate(similarity_matrix[idx]))
        # Sort by score (highest first), exclude self-comparison
        scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
        # Map customer ID to top N lookalikes with their scores
        lookalikes[customer_id] = [(customer_ids[i], score) for i, score in scores]
    return lookalikes

# Generate Lookalike.csv
customer_ids = customer_features['CustomerID'].values
lookalike_map = get_top_lookalikes(similarity_matrix, customer_ids)

# Convert lookalikes to a DataFrame for CSV output
lookalike_list = []
for cust_id, lookalikes in lookalike_map.items():
    lookalike_list.append({
        'Customer_ID': cust_id,
        'Lookalikes': lookalikes
    })

lookalike_df = pd.DataFrame(lookalike_list)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Example output for the first 20 customers
for cust_id in customer_ids[:20]:
    print(f"Customer {cust_id} -> {lookalike_map[cust_id]}")


In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
products = pd.read_csv('Products.csv')
products

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.30
1,P002,ActiveWear Smartwatch,Electronics,346.30
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31
...,...,...,...,...
95,P096,SoundWave Headphones,Electronics,307.47
96,P097,BookWorld Cookbook,Books,319.34
97,P098,SoundWave Laptop,Electronics,299.93
98,P099,SoundWave Mystery Book,Books,354.29


In [6]:
transactions = pd.read_csv('Transactions.csv')
transactions

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68
...,...,...,...,...,...,...,...
995,T00496,C0118,P037,2024-10-24 08:30:27,1,459.86,459.86
996,T00759,C0059,P037,2024-06-04 02:15:24,3,1379.58,459.86
997,T00922,C0018,P037,2024-04-05 13:05:32,4,1839.44,459.86
998,T00959,C0115,P037,2024-09-29 10:16:02,2,919.72,459.86


In [32]:
# Rename the 'Price' column in products_subset to avoid conflict
products_subset = products[['ProductID', 'Category', 'Price']].rename(columns={'Price': 'ProductPrice'})


transactions = transactions.merge(products_subset, on='ProductID', how='left', suffixes=('', '_Product'))

# Check columns after the merge
print(transactions.columns)
print(products_subset.columns)


# Check for 'Category' column presence
if 'Category' not in transactions.columns:
    print("Error: 'Category' column is missing. Check Products.csv for a 'Category' column.")

# Feature engineering: Aggregate data at the customer level
customer_features = transactions.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total spend
    'Quantity': 'sum',  # Total quantity purchased
    'ProductPrice': 'mean',  # Average price of products purchased
    'Category': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown'  # Most frequent category
}).reset_index()

print(customer_features.head())


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'ProductName_x', 'Category_x',
       'Price_y', 'ProductName_y', 'Category_y', 'Price', 'Category_trans',
       'ProductPrice_trans', 'Category_prod', 'ProductPrice_prod', 'Category',
       'ProductPrice', 'Category_Product', 'ProductPrice_Product'],
      dtype='object')
Index(['ProductID', 'Category', 'ProductPrice'], dtype='object')
  CustomerID  TotalValue  Quantity  ProductPrice     Category
0      C0001     3354.52        12    278.334000  Electronics
1      C0002     1862.74        10    208.920000     Clothing
2      C0003     2725.38        14    195.707500   Home Decor
3      C0004     5354.88        23    240.636250        Books
4      C0005     2034.24         7    291.603333  Electronics


In [34]:
customer_features = pd.get_dummies(customer_features, columns=['Category'])
customer_features

Unnamed: 0,CustomerID,TotalValue,Quantity,ProductPrice,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,3354.52,12,278.334000,False,False,True,False
1,C0002,1862.74,10,208.920000,False,True,False,False
2,C0003,2725.38,14,195.707500,False,False,False,True
3,C0004,5354.88,23,240.636250,True,False,False,False
4,C0005,2034.24,7,291.603333,False,False,True,False
...,...,...,...,...,...,...,...,...
194,C0196,4982.88,12,416.992500,False,False,False,True
195,C0197,1928.65,9,227.056667,False,False,True,False
196,C0198,931.83,3,239.705000,False,True,False,False
197,C0199,1979.28,9,250.610000,False,False,True,False


In [38]:
scaler = StandardScaler()
numerical_cols = ['TotalValue', 'Quantity', 'ProductPrice']
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])
customer_features

Unnamed: 0,CustomerID,TotalValue,Quantity,ProductPrice,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor
0,C0001,-0.061701,-0.122033,0.094670,False,False,True,False
1,C0002,-0.877744,-0.448000,-0.904016,False,True,False,False
2,C0003,-0.405857,0.203934,-1.094109,False,False,False,True
3,C0004,1.032547,1.670787,-0.447702,True,False,False,False
4,C0005,-0.783929,-0.936951,0.285581,False,False,True,False
...,...,...,...,...,...,...,...,...
194,C0196,0.829053,-0.122033,2.089604,False,False,False,True
195,C0197,-0.841689,-0.610984,-0.643077,False,False,True,False
196,C0198,-1.386975,-1.588886,-0.461100,False,True,False,False
197,C0199,-0.813993,-0.610984,-0.304206,False,False,True,False


In [40]:
feature_matrix = customer_features.drop(columns=['CustomerID']).values
similarity_matrix = cosine_similarity(feature_matrix)

In [42]:
def get_top_lookalikes(similarity_matrix, customer_ids, top_n=3):
    lookalikes = {}
    for idx, customer_id in enumerate(customer_ids):
        # Get similarity scores for the current customer
        scores = list(enumerate(similarity_matrix[idx]))
        # Sort by score (highest first), exclude self-comparison
        scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:top_n+1]
        # Map customer ID to top N lookalikes with their scores
        lookalikes[customer_id] = [(customer_ids[i], score) for i, score in scores]
    return lookalikes

In [44]:
customer_ids = customer_features['CustomerID'].values
lookalike_map = get_top_lookalikes(similarity_matrix, customer_ids)

# Convert lookalikes to a DataFrame for CSV output
lookalike_list = []
for cust_id, lookalikes in lookalike_map.items():
    lookalike_list.append({
        'Customer_ID': cust_id,
        'Lookalikes': lookalikes
    })

lookalike_df = pd.DataFrame(lookalike_list)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Example output for the first 20 customers
for cust_id in customer_ids[:20]:
    print(f"Customer {cust_id} -> {lookalike_map[cust_id]}")

Customer C0001 -> [('C0069', 0.9324018454753686), ('C0154', 0.923507363502334), ('C0026', 0.8874896445880608)]
Customer C0002 -> [('C0029', 0.999813684565344), ('C0088', 0.984786970122018), ('C0062', 0.9810852132937548)]
Customer C0003 -> [('C0038', 0.9938320157333372), ('C0160', 0.9439547841946113), ('C0189', 0.9272584290424002)]
Customer C0004 -> [('C0075', 0.9890562895786139), ('C0041', 0.9830577574234166), ('C0175', 0.9826840394401017)]
Customer C0005 -> [('C0192', 0.995718940217707), ('C0140', 0.9899379450022263), ('C0186', 0.9758473072379672)]
Customer C0006 -> [('C0187', 0.9497359319245032), ('C0117', 0.9456213700238187), ('C0168', 0.9371224523840914)]
Customer C0007 -> [('C0146', 0.9958711037026335), ('C0115', 0.9630039760834646), ('C0050', 0.9598404807403083)]
Customer C0008 -> [('C0113', 0.9893602460323576), ('C0136', 0.9808325118516203), ('C0195', 0.9700783025191715)]
Customer C0009 -> [('C0150', 0.997311664094273), ('C0061', 0.972018762628306), ('C0198', 0.9360077645399814)