<a href="https://colab.research.google.com/github/SuvarshaChennareddy/Zeotap-Data-Science-Assignment/blob/main/Suvarsha_Chennareddy_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# Mount Google Drive into the Colab filesystem; the data-loading cell below
# reads its CSV files from paths under /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Read the data
# All three input CSVs live in the same Drive folder; keeping that folder in
# one constant lets the notebook be re-pointed by editing a single line.
DATA_DIR = '/content/drive/MyDrive/Zeotap'

customers_df = pd.read_csv(f'{DATA_DIR}/Customers.csv')
products_df = pd.read_csv(f'{DATA_DIR}/Products.csv')
transactions_df = pd.read_csv(f'{DATA_DIR}/Transactions.csv')

In [20]:
# Preview the first rows of the customer table (ID, name, region, signup date).
customers_df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [21]:
# Preview the first rows of the product catalogue (ID, name, category, price).
products_df.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [22]:
# Preview the first rows of the transaction log (one row per purchase).
transactions_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


The lookalike model I've created uses the following features to determine similarity:

1. Region (one-hot encoded)
2. Purchase behavior across different product categories (total quantity bought per category)
3. Average transaction value (mean `TotalValue` per customer)

The model works by:

1. Creating user profiles that combine demographic and transaction data
2. Standardizing every feature with `StandardScaler` (zero mean, unit variance) and then mean-centering the resulting feature matrix
3. Computing cosine similarity between the centered features
4. Identifying the top 3 most similar customers for each target customer

The similarity scores range from -1 to 1, where:
- 1 indicates perfect similarity
- 0 indicates no similarity
- -1 indicates perfect dissimilarity

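The output CSV file (`Suvarsha_Chennareddy_Lookalike.csv`) contains one row for each of the first 20 customers, with the columns:
- CustomerID: The target customer
- Lookalikes: The three most similar customers with their similarity scores, each in the format "CustomerID:score" and separated by "; "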

In [23]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Build the standardized per-customer feature table
def prepare_data(customers_df, products_df, transactions_df, verbose=True):
    """Build one standardized feature vector per customer.

    The profile of a customer is the concatenation of:
      * the one-hot encoded ``Region``,
      * the total ``Quantity`` bought in each product ``Category``,
      * the mean ``TotalValue`` over the customer's transactions.

    Customers without any transaction get 0 for every transaction-derived
    feature before scaling.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Must contain the columns ``CustomerID`` and ``Region``.
    products_df : pandas.DataFrame
        Must contain the columns ``ProductID`` and ``Category``.
    transactions_df : pandas.DataFrame
        Must contain the columns ``CustomerID``, ``ProductID``, ``Quantity``
        and ``TotalValue``.
    verbose : bool, default True
        When True, print a ``head()`` preview of every intermediate table
        (the original behavior). Pass False to build the profiles silently.

    Returns
    -------
    tuple
        ``(user_profiles, scaler)`` where ``user_profiles`` has ``CustomerID``
        as its first column followed by the scaled feature columns, and
        ``scaler`` is the ``StandardScaler`` fitted on those feature columns.
    """

    def _preview(title, frame):
        # Intermediate previews are optional so the function can be reused quietly.
        if verbose:
            print(title)
            print(frame.head())
            print("\n")

    # One-hot encode 'Region' column. A new name is used for the widened frame
    # so the caller-supplied parameter is never rebound inside the function.
    region_encoded = pd.get_dummies(customers_df['Region'], prefix='Region')
    customers_encoded = pd.concat([customers_df, region_encoded], axis=1)
    _preview("Customers with region one hot encoded: ", customers_encoded)

    # Attach the product category to every transaction.
    # NOTE: pd.merge defaults to an inner join, so a transaction whose
    # ProductID is absent from products_df is left out of the category counts
    # (it still contributes to the average transaction value below).
    transactions_with_category = pd.merge(
        transactions_df,
        products_df[['ProductID', 'Category']],
        on='ProductID'
    )

    # Calculate average transaction value per customer
    avg_transaction_value = transactions_df.groupby('CustomerID')['TotalValue'].mean().reset_index()
    avg_transaction_value.columns = ['CustomerID', 'avg_transaction_value']
    _preview("Average total value of transactions per customer: ", avg_transaction_value)

    # Total quantity purchased per category for each customer
    category_pivot = pd.pivot_table(
        transactions_with_category,
        values='Quantity',
        index='CustomerID',
        columns='Category',
        aggfunc='sum',
        fill_value=0
    )
    _preview("Category purchase frequency per customer: ", category_pivot)

    # Merge all features; left joins keep customers that have no transactions.
    user_profiles = pd.merge(
        customers_encoded[['CustomerID'] + list(region_encoded.columns)],
        category_pivot,
        left_on='CustomerID',
        right_index=True,
        how='left'
    )
    user_profiles = pd.merge(
        user_profiles,
        avg_transaction_value,
        on='CustomerID',
        how='left'
    )

    # Fill NaN values with 0 (for customers with no transactions)
    user_profiles = user_profiles.fillna(0)

    # Scale every feature column; 'CustomerID' is the first column by
    # construction above, so everything after it is a feature.
    numerical_features = user_profiles.columns[1:]
    scaler = StandardScaler()
    user_profiles[numerical_features] = scaler.fit_transform(user_profiles[numerical_features])
    _preview("User profiles (vectors): ", user_profiles)

    return user_profiles, scaler

In [24]:
def mean_centered_cosine_similarity(features):
    """Return pairwise cosine similarities between rows of ``features``.

    Each column has its mean subtracted first, so the similarity is computed
    on the centered values rather than on the raw ones.

    Parameters
    ----------
    features : array-like, one row per sample and one column per feature.

    Returns
    -------
    The matrix produced by ``cosine_similarity`` on the centered rows; row
    ``i`` holds the similarities of sample ``i`` to every sample.
    """
    # Column-wise means (axis=0 averages over the rows)
    column_means = np.mean(features, axis=0)
    return cosine_similarity(features - column_means)

In [25]:
def get_top_lookalikes(customer_idx, similarity_matrix, user_profiles, n=3):
    """Return the ``n`` customers most similar to the one at ``customer_idx``.

    Parameters
    ----------
    customer_idx : int
        Positional row of the target customer in ``user_profiles`` and in
        ``similarity_matrix``.
    similarity_matrix : 2-D array
        Row ``customer_idx`` holds the target's similarity to every customer,
        in the same order as the rows of ``user_profiles``.
    user_profiles : pandas.DataFrame
        Must contain a ``CustomerID`` column.
    n : int, default 3
        Number of lookalikes to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID`` and ``similarity_score``, sorted by descending
        score, with the target customer's own ID removed.
    """
    customer_ids = user_profiles['CustomerID']
    target_id = customer_ids.iloc[customer_idx]

    # Pair every customer ID with its similarity to the target customer
    scored = pd.DataFrame({
        'CustomerID': customer_ids,
        'similarity_score': similarity_matrix[customer_idx],
    })

    # Drop the target itself, then keep the n best-scoring rows
    is_other_customer = scored['CustomerID'] != target_id
    ranked = scored.loc[is_other_customer].sort_values('similarity_score', ascending=False)
    return ranked.head(n)

In [26]:
# Convert data types
transactions_df['Quantity'] = transactions_df['Quantity'].astype(int)
transactions_df['TotalValue'] = transactions_df['TotalValue'].astype(float)

# Prepare user profiles
user_profiles, scaler = prepare_data(customers_df, products_df, transactions_df)

# Calculate similarity matrix (first column is 'CustomerID', the rest are features)
similarity_matrix = mean_centered_cosine_similarity(user_profiles.iloc[:, 1:].values)

# Generate lookalikes for the first N_TARGET_CUSTOMERS rows of user_profiles.
# NOTE(review): "first" means row order of customers_df — confirm the CSV is
# sorted by CustomerID if specific IDs (e.g. C0001-C0020) are expected.
N_TARGET_CUSTOMERS = 20
OUTPUT_PATH = 'Suvarsha_Chennareddy_Lookalike.csv'

# Never index past the end of the table when fewer customers were loaded
n_targets = min(N_TARGET_CUSTOMERS, len(user_profiles))

results = []
for i in range(n_targets):
    customer_id = user_profiles['CustomerID'].iloc[i]
    lookalikes = get_top_lookalikes(i, similarity_matrix, user_profiles)

    # Format result string as "ID:score; ID:score; ID:score"
    lookalike_str = '; '.join(
        f"{lookalike_id}:{score:.4f}"
        for lookalike_id, score in zip(lookalikes['CustomerID'], lookalikes['similarity_score'])
    )

    results.append(f"{customer_id},{lookalike_str}")

# Save results to CSV (one "CustomerID,Lookalikes" line per target customer)
with open(OUTPUT_PATH, 'w') as f:
    f.write("CustomerID,Lookalikes\n")
    for result in results:
        f.write(f"{result}\n")


# Display results
print(f"Top 3 lookalikes for first {n_targets} customers:")
for result in results:
    print(result)

Customers with region one hot encoded: 
  CustomerID        CustomerName         Region  SignupDate  Region_Asia  \
0      C0001    Lawrence Carroll  South America  2022-07-10        False   
1      C0002      Elizabeth Lutz           Asia  2022-02-13         True   
2      C0003      Michael Rivera  South America  2024-03-07        False   
3      C0004  Kathleen Rodriguez  South America  2022-10-09        False   
4      C0005         Laura Weber           Asia  2022-08-15         True   

   Region_Europe  Region_North America  Region_South America  
0          False                 False                  True  
1          False                 False                 False  
2          False                 False                  True  
3          False                 False                  True  
4          False                 False                 False  


Average total value of transactions per customer: 
  CustomerID  avg_transaction_value
0      C0001                670.904
