<a href="https://colab.research.google.com/github/SetuKaswan/zeotap/blob/main/Setu_Kaswan_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries


In [21]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Raw-file URLs of the three input CSVs. All of them point at the `main`
# branch of the SetuKaswan/zeotap GitHub repository.
customers_url = 'https://raw.githubusercontent.com/SetuKaswan/zeotap/refs/heads/main/Customers.csv'
products_url = 'https://raw.githubusercontent.com/SetuKaswan/zeotap/refs/heads/main/Products.csv'
transactions_url = 'https://raw.githubusercontent.com/SetuKaswan/zeotap/refs/heads/main/Transactions.csv'

In [15]:
# Download each CSV to a fixed relative filename so the following cell can
# read it with pd.read_csv. urlretrieve returns a (filename, headers) tuple;
# only the value of the last call is echoed as the cell's output.
from urllib.request import urlretrieve
urlretrieve(customers_url, 'customers.csv')
urlretrieve(products_url, 'products.csv')
urlretrieve(transactions_url, 'transactions.csv')

('transactions.csv', <http.client.HTTPMessage at 0x7fd943470590>)

# Creating dataframes

In [17]:
# Read the three downloaded CSV files, one DataFrame per file, in the order
# customers -> products -> transactions.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in ('customers.csv', 'products.csv', 'transactions.csv')
)

# **Data cleaning**

In [24]:
# Parse the date columns into real datetimes.
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

# Drop exact duplicate transaction rows. Plain reassignment is used instead of
# inplace=True: the result is the same, without mutating the frame in place.
transactions_df = transactions_df.drop_duplicates()

# Missing-value counts per column for each table. This must be the LAST
# expression of the cell: a bare expression anywhere else in a cell is
# evaluated and silently discarded, so the check would never be displayed.
customers_df.isnull().sum(), transactions_df.isnull().sum(), products_df.isnull().sum()

# Aggregate transaction data

In [25]:
# --- Per-customer purchase metrics (one row per CustomerID) -----------------
# Named aggregation produces the final column names directly, and the built-in
# 'nunique' reducer replaces the Python-level `len(set(x))` lambda.
customer_metrics = transactions_df.groupby('CustomerID').agg(
    transaction_count=('TransactionID', 'count'),  # number of transactions
    total_spend=('TotalValue', 'sum'),             # total spend
    total_items=('Quantity', 'sum'),               # total items bought
    unique_products=('ProductID', 'nunique'),      # distinct products bought
)

customer_metrics['avg_order_value'] = (
    customer_metrics['total_spend'] / customer_metrics['transaction_count']
)

# --- Per-customer category preferences --------------------------------------
# Attach the product category to every transaction. This is an inner merge, so
# transactions whose ProductID is absent from products_df are dropped here.
product_categories = products_df[['ProductID', 'Category']].set_index('ProductID')
txn_with_categories = transactions_df.merge(product_categories, on='ProductID')

# One column per category holding the quantity bought in that transaction,
# then summed per customer.
category_preferences = (
    pd.get_dummies(txn_with_categories['Category'])
    .mul(txn_with_categories['Quantity'], axis=0)
    .groupby(txn_with_categories['CustomerID'])
    .sum()
)

# --- Combined feature table --------------------------------------------------
customer_features = pd.merge(
    customer_metrics,
    category_preferences,
    left_index=True,
    right_index=True,
    how='left'
)

# A customer that has metrics but no row in category_preferences (all of their
# products were missing from products_df) gets NaN from the left merge. Treat
# that as "zero items in every category" so NaN never reaches the scaling and
# similarity steps.
category_columns = category_preferences.columns
customer_features[category_columns] = customer_features[category_columns].fillna(0)

In [26]:
# Standardise every feature column with StandardScaler, then wrap the resulting
# array back into a DataFrame that keeps the original customer index and
# column labels.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features)
features_normalized = pd.DataFrame(
    scaled_values,
    index=customer_features.index,
    columns=customer_features.columns,
)

# Finding similarity score

In [27]:
def get_lookalikes(customer_id, features_df, n_recommendations=3):
    """Return the customers most similar to ``customer_id``.

    Similarity is the cosine similarity between the customer's row of
    ``features_df`` and every other row.

    Parameters
    ----------
    customer_id
        Index label of the query customer in ``features_df``.
    features_df : pandas.DataFrame
        One row of numeric features per customer, indexed by customer ID.
    n_recommendations : int, default 3
        Maximum number of similar customers to return.

    Returns
    -------
    list of (customer_id, similarity_score) tuples
        Ordered from most to least similar. The query customer is never
        included. Fewer than ``n_recommendations`` entries are returned when
        there are not enough other customers.

    Raises
    ------
    KeyError
        If ``customer_id`` has no row in ``features_df``.
    """
    if customer_id not in features_df.index:
        raise KeyError(f"Customer {customer_id!r} has no row in features_df")

    customer_vector = features_df.loc[customer_id].values.reshape(1, -1)
    similarity_scores = cosine_similarity(customer_vector, features_df)[0]

    # Exclude the query customer by label. Dropping "whichever row ranks
    # first" is not reliable: another customer with an identical vector (or
    # floating-point rounding) can outrank the customer's own row, in which
    # case the customer would be recommended to themselves.
    candidate_positions = (features_df.index != customer_id).nonzero()[0]

    # Sort candidates by descending score. A stable sort keeps ties in
    # index order, so the output is deterministic.
    ranking = (-similarity_scores[candidate_positions]).argsort(kind='stable')
    top_positions = candidate_positions[ranking[:max(n_recommendations, 0)]]

    return [
        (features_df.index[pos], similarity_scores[pos])
        for pos in top_positions
    ]

In [28]:
# Map each of the first 20 customers (in customers_df order) to its list of
# lookalikes.
results = {}
first_twenty_ids = customers_df['CustomerID'].iloc[:20]
for cust_id in first_twenty_ids:
    results[cust_id] = get_lookalikes(cust_id, features_normalized)

# Storing the output

In [30]:
# Flatten the {customer -> [(lookalike, score), ...]} mapping into one record
# per (source customer, recommendation) pair; rank starts at 1 for the best
# match.
output_rows = [
    {
        'source_customer': source_id,
        'recommended_customer': match_id,
        'similarity_score': match_score,
        'rank': position,
    }
    for source_id, matches in results.items()
    for position, (match_id, match_score) in enumerate(matches, start=1)
]

# Persist the recommendations as a CSV without the DataFrame's integer index.
output_df = pd.DataFrame(output_rows)
output_df.to_csv('Setu_Kaswan_Lookalike.csv', index=False)

In [32]:
# `google.colab` is only importable inside a Colab runtime. Guard the import so
# that running the whole notebook elsewhere does not fail on this last cell.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Trigger a browser download of the generated CSV.
    files.download('Setu_Kaswan_Lookalike.csv')
else:
    print("Not running on Colab; skipping browser download of 'Setu_Kaswan_Lookalike.csv'.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>