In [37]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime


In [38]:
# Load datasets
customers = pd.read_csv(r"D:\Zeotap\Dataset\Customers.csv")
products = pd.read_csv(r"D:\Zeotap\Dataset\Products.csv")
transactions = pd.read_csv(r"D:\Zeotap\Dataset\Transactions.csv")


In [39]:
# Convert dates
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

In [40]:
# Feature Engineering
# Customer features
customer_features = customers.copy()
customer_features['DaysSinceSignup'] = (datetime.now() - customer_features['SignupDate']).dt.days

# Transaction features
transaction_features = transactions.copy()
transaction_features['DaysSinceTransaction'] = (datetime.now() - transaction_features['TransactionDate']).dt.days

# Merge transactions with products for category info
merged = pd.merge(transaction_features, products, on='ProductID')

# Calculate features for each customer
customer_spend = merged.groupby('CustomerID').agg({
    'TotalValue': ['sum', 'mean', 'count'],
    'Quantity': 'sum',
    'DaysSinceTransaction': 'min'
}).reset_index()
customer_spend.columns = ['CustomerID', 'TotalSpend', 'AverageOrderValue', 'TotalTransactions', 'TotalQuantity', 'DaysSinceLastTransaction']

# One-hot encode categories for preference similarity
category_preferences = pd.get_dummies(merged['Category'], prefix='Category')
category_preferences = category_preferences.groupby(merged['CustomerID']).sum().reset_index()

# Combine all features
final_features = pd.merge(customer_features[['CustomerID', 'Region', 'DaysSinceSignup']], customer_spend, on='CustomerID')
final_features = pd.merge(final_features, category_preferences, on='CustomerID')

# Prepare data for similarity calculation
# Convert categorical variables to dummy variables
final_features = pd.get_dummies(final_features, columns=['Region'])
features = final_features.drop('CustomerID', axis=1)

# Scale the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)


In [41]:

# Function to find lookalikes
def find_lookalikes(customer_id, n=3):
    if customer_id not in final_features['CustomerID'].values:
        return "Customer ID not found in dataset"
    
    # Index of the customer
    idx = final_features[final_features['CustomerID'] == customer_id].index[0]
    
    # Calculate cosine similarity
    similarity_scores = cosine_similarity([scaled_features[idx]], scaled_features)[0]
    
    # Sort by similarity score, exclude the customer themselves
    similar_customers = sorted(enumerate(similarity_scores), key=lambda x: x[1], reverse=True)[1:n+1]
    
    # Return the top N similar customers with their similarity scores
    lookalikes = []
    for i, score in similar_customers:
        lookalike_id = final_features.loc[i, 'CustomerID']
        lookalikes.append((lookalike_id, score))
    return lookalikes

In [42]:
# Example usage
# Here we're checking if the customer ID exists before calling the function to avoid the error
customer_id_to_check = "C0003"  # Example CustomerID, replace with an actual ID from your dataset
if customer_id_to_check in final_features['CustomerID'].values:
    lookalikes = find_lookalikes(customer_id_to_check)
    for customer, score in lookalikes:
        print(f"CustomerID: {customer}, Similarity Score: {score:.4f}")
else:
    print(f"Customer ID {customer_id_to_check} not found in dataset")

CustomerID: C0129, Similarity Score: 0.8358
CustomerID: C0031, Similarity Score: 0.8095
CustomerID: C0052, Similarity Score: 0.7877


In [45]:
first_20_customers = customers['CustomerID'].head(20).tolist()

# Dictionary to store lookalikes
lookalikes_dict = {}

for customer_id in first_20_customers:
    if customer_id in final_features['CustomerID'].values:
        lookalikes = find_lookalikes(customer_id)
        # Format lookalikes as a single string for each customer
        lookalikes_str = ";".join([f"{lookalike[0]},{lookalike[1]:.4f}" for lookalike in lookalikes])
        lookalikes_dict[customer_id] = lookalikes_str
    else:
        lookalikes_dict[customer_id] = ""  # If customer ID not found, store an empty string

# Create DataFrame from dictionary
lookalikes_df = pd.DataFrame.from_dict(lookalikes_dict, orient='index', columns=['Lookalikes'])

# Save to CSV
lookalikes_df.to_csv('D:\Zeotap\Code\Vaibhav_Hanbar_Lookalike.csv')

print("Lookalike.csv has been created with the top 3 lookalikes for the first 20 customers.")

Lookalike.csv has been created with the top 3 lookalikes for the first 20 customers.


In [48]:
pd.read_csv(r"D:\Zeotap\Code\Vaibhav_Hanbar_Lookalike.csv")

Unnamed: 0.1,Unnamed: 0,Lookalikes
0,C0001,"C0118,0.7840;C0120,0.7695;C0091,0.7224"
1,C0002,"C0134,0.9198;C0106,0.9173;C0159,0.9089"
2,C0003,"C0129,0.8358;C0031,0.8095;C0052,0.7877"
3,C0004,"C0113,0.9379;C0104,0.8571;C0012,0.7765"
4,C0005,"C0007,0.9246;C0140,0.8580;C0186,0.8378"
5,C0006,"C0187,0.8889;C0171,0.6940;C0168,0.6880"
6,C0007,"C0005,0.9246;C0140,0.8613;C0186,0.7283"
7,C0008,"C0098,0.8095;C0194,0.7871;C0059,0.7853"
8,C0009,"C0198,0.8682;C0060,0.8081;C0062,0.7951"
9,C0010,"C0061,0.8453;C0009,0.7852;C0132,0.7614"


# Finish