# Task 2: Lookalike Model

In [1]:
# Importing necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the three source tables (customers, products, transactions) from CSV files.
# The relative file names are resolved against the notebook's working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [3]:
# Attach customer attributes to each transaction.
# A left join keeps every transaction row even if its CustomerID has no match.
transactions_with_customer = transactions.merge(customers, how="left", on="CustomerID")

In [4]:
# Attach product attributes to the transaction+customer table (left join on ProductID).
# Both inputs carry a Price column, so pandas suffixes them as Price_x / Price_y.
data = transactions_with_customer.merge(products, how="left", on="ProductID")

In [5]:
# Preview the first rows of the merged transaction/customer/product table
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [6]:
# Build the customer profile matrix: one row per CustomerID, one column per product
# Category, each cell holding the total Quantity that customer bought in that category.
# Customer/category pairs with no purchases are filled with 0.
quantity_by_customer_category = data.groupby(['CustomerID', 'Category'])['Quantity'].sum()
profile_df = quantity_by_customer_category.unstack(fill_value=0)

In [7]:
# Display the customer-by-category quantity matrix
profile_df

Category,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,2,0,7,3
C0002,0,4,0,6
C0003,0,4,4,6
C0004,8,0,6,9
C0005,0,0,4,3
...,...,...,...,...
C0196,3,4,0,5
C0197,0,0,6,3
C0198,0,2,1,0
C0199,0,0,3,6


In [8]:
# Standardize each category column (zero mean, unit variance) so that categories with
# larger raw quantities do not dominate the similarity computation. The customer index
# and category column labels are carried over unchanged.
scaler = StandardScaler().fit(profile_df)
profile_df_scaled = pd.DataFrame(
    scaler.transform(profile_df),
    index=profile_df.index,
    columns=profile_df.columns,
)

In [10]:
# Display the standardized profile matrix
profile_df_scaled

Category,Books,Clothing,Electronics,Home Decor
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,-0.464594,-0.963893,1.255863,-0.069051
C0002,-1.117981,0.336546,-1.027971,0.912454
C0003,-1.117981,0.336546,0.277077,0.912454
C0004,1.495566,-0.963893,0.929601,1.893958
C0005,-1.117981,-0.963893,0.277077,-0.069051
...,...,...,...,...
C0196,-0.137901,0.336546,-1.027971,0.585286
C0197,-1.117981,-0.963893,0.929601,-0.069051
C0198,-1.117981,-0.313674,-0.701709,-1.050555
C0199,-1.117981,-0.963893,-0.049185,0.912454


In [11]:
# Similarity helper used by the lookalike search
def compute_similarity(customer_profile, profile_df_scaled):
    """Score one customer profile against every row of the profile matrix.

    Args:
        customer_profile: 2-D array-like with the query profile; the caller in this
            notebook passes a single row reshaped to (1, n_features).
        profile_df_scaled: Profile matrix with one row per customer.

    Returns:
        A flattened 1-D array of cosine similarities. For a single query row the
        scores are in the same order as the rows of ``profile_df_scaled``.
    """
    return cosine_similarity(customer_profile, profile_df_scaled).flatten()

In [12]:
# Prepare the result for Lookalikes:
# maps a CustomerID to its list of (lookalike CustomerID, similarity score) pairs
lookalikes = {}

In [13]:
# Find the top 3 lookalikes for each of the first 20 customers listed in the customers table.
#
# The similarity scores come back in the row order of profile_df_scaled, so each score is
# paired with profile_df_scaled.index. Pairing by position in the customers table is only
# correct if every customer has transactions AND the table is sorted by CustomerID;
# otherwise scores get attributed to the wrong customer and the self-match is not removed.
profile_ids = profile_df_scaled.index
for customer_id in customers['CustomerID'][:20]:
    if customer_id not in profile_ids:
        # A customer with no transactions has no row in the profile matrix;
        # record an empty result instead of failing with a KeyError.
        lookalikes[customer_id] = []
        continue

    # Get the customer's profile vector as a (1, n_features) array
    customer_profile = profile_df_scaled.loc[customer_id].values.reshape(1, -1)

    # Calculate the similarity score with all customers
    similarity_scores = compute_similarity(customer_profile, profile_df_scaled)

    # Label every score with the customer it belongs to, dropping the customer's own entry
    similar_customers = [
        (other_id, score)
        for other_id, score in zip(profile_ids, similarity_scores)
        if other_id != customer_id
    ]

    # Sort by descending similarity score and keep the top 3 lookalikes
    top_lookalikes = sorted(similar_customers, key=lambda pair: pair[1], reverse=True)[:3]

    # Map the customer to its lookalikes and their similarity scores
    lookalikes[customer_id] = top_lookalikes

In [14]:
# Turn the lookalike mapping into a table with one row per customer.
# Each lookalike is rendered as "<CustomerID>: <score>" and the row keeps them as a list.
lookalike_df = pd.DataFrame([
    {
        'CustomerID': source_id,
        'Lookalikes': [f"{match_id}: {score}" for match_id, score in matches],
    }
    for source_id, matches in lookalikes.items()
])

In [15]:
# Saving the lookalikes to a CSV (index=False omits the DataFrame's row index from the file)
lookalike_df.to_csv("Priya_Chanchal_Lookalike.csv", index=False)