In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

# Read the three input tables (customers, transactions, products) from the
# current working directory, in that order.
customers_df, transactions_df, products_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

In [3]:
# 1. Feature Engineering

# Attach product columns to every transaction row (inner join on ProductID).
transactions_with_products = transactions_df.merge(products_df, on='ProductID')

# Summarise each customer's purchases: total spend, number of distinct
# transactions, and mean value per transaction row.
customer_transactions = (
    transactions_with_products
    .groupby('CustomerID')
    .agg(
        total_spent=('TotalValue', 'sum'),
        num_transactions=('TransactionID', 'nunique'),
        avg_transaction_value=('TotalValue', 'mean'),
    )
    .reset_index()
)

# Join the purchase summary onto (CustomerID, Region). The default inner join
# keeps only customers that appear in both tables. 'Region' is then one-hot
# encoded, dropping the first level to avoid a redundant column.
customer_profile = pd.get_dummies(
    customers_df[['CustomerID', 'Region']].merge(customer_transactions, on='CustomerID'),
    columns=['Region'],
    drop_first=True,
)

In [4]:

# Rebuild the customer profile from its inputs before scaling. The scaling step
# below overwrites the feature columns in place, so starting from a fresh merge
# keeps this cell safe to re-run.
profile_with_region = customers_df[['CustomerID', 'Region']].merge(
    customer_transactions, on='CustomerID'
)

# One-hot encode 'Region' (first level dropped) so it can be used as a feature.
customer_profile = pd.get_dummies(profile_with_region, columns=['Region'], drop_first=True)

# 2. Standardize features so that no single column dominates the distance metric.
# The feature set is the three spend columns plus every column whose name
# contains 'Region' (the one-hot columns created above).
spend_features = ['total_spent', 'num_transactions', 'avg_transaction_value']
region_features = [col for col in customer_profile.columns if 'Region' in col]
features = spend_features + region_features

scaler = StandardScaler()
customer_profile[features] = scaler.fit_transform(customer_profile[features])

In [5]:

# 3. Find the Top 3 Lookalikes for Each Customer (C0001 - C0020)
#
# Result: `lookalikes` maps each target CustomerID to a list of up to three
# (other_customer_id, euclidean_distance) tuples, nearest first.

top_n = 3

# Select the target customers by ID rather than by row position, so the result
# does not depend on the row order of `customer_profile`. A target that is
# absent from `customer_profile` (e.g. dropped by the inner merge because it has
# no transactions) is skipped instead of being silently replaced by the next row.
target_ids = [f'C{n:04d}' for n in range(1, 21)]

all_ids = customer_profile['CustomerID'].to_numpy()
feature_matrix = customer_profile[features].to_numpy(dtype=float)
target_rows = np.flatnonzero(customer_profile['CustomerID'].isin(target_ids).to_numpy())

# One vectorised call computes every target-to-customer Euclidean distance;
# row k of the result belongs to the customer at position target_rows[k].
distance_matrix = cdist(feature_matrix[target_rows], feature_matrix, metric='euclidean')

lookalikes = {}
for row_idx, row_distances in zip(target_rows, distance_matrix):
    customer_id = all_ids[row_idx]

    # Exclude the customer itself (matched by ID, as before).
    candidates = np.flatnonzero(all_ids != customer_id)

    # Stable sort keeps the original row order among equal distances.
    nearest = candidates[np.argsort(row_distances[candidates], kind='stable')[:top_n]]

    # Store plain Python floats so the text form of these tuples (written to the
    # CSV later) does not depend on how the installed NumPy prints its scalars.
    lookalikes[customer_id] = [(all_ids[j], float(row_distances[j])) for j in nearest]

# Output the lookalikes dictionary for verification
print(lookalikes)


{'C0001': [('C0137', 0.02207243237942135), ('C0152', 0.031486118833456055), ('C0107', 0.27065604383322567)], 'C0002': [('C0142', 0.4466386109682652), ('C0088', 0.506631079895442), ('C0186', 0.6879097802180525)], 'C0003': [('C0133', 0.18866119376403317), ('C0052', 0.2817395883219162), ('C0137', 0.5676188566833155)], 'C0004': [('C0113', 0.4121640008686923), ('C0108', 0.5425086204408527), ('C0012', 0.5660799258054712)], 'C0005': [('C0159', 0.05753123169168628), ('C0186', 0.4108727733600682), ('C0146', 0.5624411510831706)], 'C0006': [('C0158', 0.606576301183449), ('C0171', 0.6883055535442357), ('C0187', 0.756960196119244)], 'C0007': [('C0092', 0.6813057136076216), ('C0193', 0.6846642754607661), ('C0140', 0.6904936694671368)], 'C0008': [('C0109', 0.8610397272065794), ('C0139', 0.9752697198202058), ('C0098', 1.1052601001342437)], 'C0009': [('C0121', 0.513702341920517), ('C0198', 0.8390069138451481), ('C0010', 0.8439644490738267)], 'C0010': [('C0199', 0.31065299933224544), ('C0111', 0.4989763

In [6]:
# Prepare data for CSV: one row per target customer. The list of
# (lookalike_id, distance) tuples is stored as its string form so that it fits
# in a single cell.
csv_data = [
    [customer_id, str(similar_customers)]
    for customer_id, similar_customers in lookalikes.items()
]

# Create a DataFrame from the prepared data
lookalike_df = pd.DataFrame(csv_data, columns=['cust_id', 'lookalike_pairs'])

# Save the DataFrame to CSV. The path now carries the '.csv' extension so the
# file on disk matches the format announced by the message below.
# NOTE(review): anything that opens the old extension-less name
# 'Priyansh_Saxena_Lookalike' must be pointed at the new file name.
output_path = 'Priyansh_Saxena_Lookalike.csv'
lookalike_df.to_csv(output_path, index=False)

print("Lookalike.csv generated successfully.")

Lookalike.csv generated successfully.
