In [3]:
import os

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load datasets
# The CSV location can be overridden with the MERGED_DF_PATH environment variable so the
# notebook is not tied to a single machine; the original local path remains the default.
MERGED_DF_PATH = os.environ.get("MERGED_DF_PATH", r"C:\Users\EJ312WS\Downloads\merged_df.csv")
merged_df = pd.read_csv(MERGED_DF_PATH)


In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

def create_customer_features(df, reference_date='2024-08-25'):
    """
    Create customer-level features from transaction data

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction-level data. The columns read here are ``CustomerID``,
        ``TotalValue``, ``Quantity``, ``Category``, ``ProductID``, ``Month``,
        ``Region`` and ``SignupDate``.
    reference_date : str or datetime-like, default '2024-08-25'
        Date from which ``customer_age`` (days since ``SignupDate``) is measured.

    Returns
    -------
    pandas.DataFrame
        One row per customer with the columns ``CustomerID``, ``total_spend``,
        ``avg_transaction_value``, ``transaction_count``, ``total_quantity``,
        ``avg_quantity``, ``unique_categories``, ``unique_products``,
        ``active_months``, ``customer_age`` and one ``region_<name>`` dummy
        column per region present in ``df``.
    """
    def count_unique(values):
        # Same result as len(values.unique()): missing values count as one distinct value.
        return values.nunique(dropna=False)

    # Aggregate transaction-level features; named aggregation yields flat,
    # explicitly named columns so no positional renaming is needed afterwards.
    customer_features = df.groupby('CustomerID').agg(
        total_spend=('TotalValue', 'sum'),
        avg_transaction_value=('TotalValue', 'mean'),
        transaction_count=('TotalValue', 'count'),
        total_quantity=('Quantity', 'sum'),
        avg_quantity=('Quantity', 'mean'),
        unique_categories=('Category', count_unique),
        unique_products=('ProductID', count_unique),
        active_months=('Month', count_unique),
    ).reset_index()

    # Customer profile features are taken from the frame that was passed in
    # (not from a notebook-level global). Keeping the first row per CustomerID
    # guarantees the merge below cannot duplicate a customer.
    customer_profile = (
        df[['CustomerID', 'Region', 'SignupDate']]
        .drop_duplicates(subset='CustomerID')
    )

    # Convert SignupDate to customer age (in days) relative to reference_date
    signup_dates = pd.to_datetime(customer_profile['SignupDate'])
    customer_age = (pd.to_datetime(reference_date) - signup_dates).dt.days

    # One-hot encode region
    region_dummies = pd.get_dummies(customer_profile['Region'], prefix='region')

    # Assemble the profile without the raw Region / SignupDate columns
    profile_features = pd.concat(
        [
            customer_profile[['CustomerID']],
            customer_age.rename('customer_age'),
            region_dummies,
        ],
        axis=1,
    )

    # Merge all features
    return customer_features.merge(profile_features, on='CustomerID')

def find_lookalikes(customer_features, target_customer_id, n_recommendations=3):
    """
    Find lookalike customers using cosine similarity

    Parameters
    ----------
    customer_features : pandas.DataFrame
        One row per customer: a ``CustomerID`` column plus feature columns.
        Every column other than ``CustomerID`` is passed to ``StandardScaler``.
    target_customer_id : hashable
        ``CustomerID`` value of the customer to find lookalikes for.
    n_recommendations : int, default 3
        Maximum number of lookalikes to return; values <= 0 yield an empty list.

    Returns
    -------
    list of dict
        Dicts with keys ``'customer_id'`` and ``'similarity_score'`` (a Python
        float rounded to 4 decimals), ordered from most to least similar. The
        target customer itself is never included.

    Raises
    ------
    IndexError
        If ``target_customer_id`` does not occur in ``customer_features``.
    """
    # Separate CustomerID. The index is reset so that lookups below are purely
    # positional and stay correct even if the caller passes a filtered or
    # re-indexed frame (index labels need not match row positions).
    customer_ids = customer_features['CustomerID'].reset_index(drop=True)
    features = customer_features.drop('CustomerID', axis=1)

    # Get row position of target customer
    matches = np.flatnonzero((customer_ids == target_customer_id).to_numpy())
    if matches.size == 0:
        raise IndexError(
            f"CustomerID {target_customer_id!r} not found in customer_features"
        )
    target_pos = int(matches[0])

    # Scale features
    scaled_features = StandardScaler().fit_transform(features)

    # Calculate cosine similarity of the target row against every row; only
    # this single row of the full similarity matrix is needed.
    similarity_scores = cosine_similarity(
        scaled_features[target_pos:target_pos + 1], scaled_features
    )[0]

    # Rank by descending similarity and drop the target explicitly: with ties
    # at 1.0 the target is not guaranteed to sort first, so skipping "rank 0"
    # could return the target as its own lookalike.
    ranked = np.argsort(similarity_scores)[::-1]
    top_positions = ranked[ranked != target_pos][:max(n_recommendations, 0)]

    # Create recommendations list (plain Python floats, not NumPy scalars)
    return [
        {
            'customer_id': customer_ids.iloc[pos],
            'similarity_score': round(float(similarity_scores[pos]), 4),
        }
        for pos in top_positions
    ]

def generate_lookalike_recommendations(df, target_customers, n_recommendations=3):
    """
    Generate lookalike recommendations for a list of target customers

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction-level data, passed unchanged to ``create_customer_features``.
    target_customers : iterable
        CustomerID values to generate lookalikes for.
    n_recommendations : int, default 3
        Number of lookalikes requested per target customer; forwarded to
        ``find_lookalikes``.

    Returns
    -------
    dict
        Maps each target CustomerID to the list returned by ``find_lookalikes``
        for that customer.
    """
    # Create customer features once; they are shared by every target customer
    customer_features = create_customer_features(df)

    # Generate recommendations for each target customer
    return {
        customer_id: find_lookalikes(
            customer_features, customer_id, n_recommendations=n_recommendations
        )
        for customer_id in target_customers
    }

# Function to create the output CSV
def create_lookalike_csv(recommendations, output_file='Lookalike.csv'):
    """
    Create CSV file with lookalike recommendations

    Parameters
    ----------
    recommendations : dict
        Maps a target CustomerID to a list of dicts, each with the keys
        ``'customer_id'`` and ``'similarity_score'``.
    output_file : str, default 'Lookalike.csv'
        Path of the CSV file to write.

    Returns
    -------
    pandas.DataFrame
        The frame that was written: columns ``CustomerID`` and ``Lookalikes``,
        where ``Lookalikes`` is the string form of a list of
        ``(customer_id, score)`` tuples.
    """
    def as_builtin(value):
        # NumPy >= 2 renders scalars as e.g. "np.float64(0.9654)"; converting to
        # the built-in type keeps the serialized list as "('C0174', 0.9654)".
        return value.item() if isinstance(value, np.generic) else value

    rows = []
    for customer_id, lookalikes in recommendations.items():
        # Format lookalikes as a list of tuples (customer_id, score)
        lookalike_list = [
            (as_builtin(rec['customer_id']), as_builtin(rec['similarity_score']))
            for rec in lookalikes
        ]
        rows.append({
            'CustomerID': customer_id,
            'Lookalikes': str(lookalike_list)
        })

    # Explicit columns so that an empty input still produces a CSV with a header
    output_df = pd.DataFrame(rows, columns=['CustomerID', 'Lookalikes'])
    output_df.to_csv(output_file, index=False)
    return output_df

# Build the target IDs C0001 ... C0020 and compute the lookalikes for each of them
target_customers = ['C{:04d}'.format(customer_number) for customer_number in range(1, 21)]
recommendations = generate_lookalike_recommendations(merged_df, target_customers)

# Write the recommendations to the default output file and keep the resulting frame
lookalike_df_threeee = create_lookalike_csv(recommendations)

In [5]:
# Display the lookalike recommendations: a dict keyed by target CustomerID, each value
# being a list of {'customer_id', 'similarity_score'} dicts.
recommendations

{'C0001': [{'customer_id': 'C0174', 'similarity_score': 0.9654},
  {'customer_id': 'C0011', 'similarity_score': 0.9474},
  {'customer_id': 'C0152', 'similarity_score': 0.9126}],
 'C0002': [{'customer_id': 'C0159', 'similarity_score': 0.8898},
  {'customer_id': 'C0005', 'similarity_score': 0.8885},
  {'customer_id': 'C0134', 'similarity_score': 0.8414}],
 'C0003': [{'customer_id': 'C0129', 'similarity_score': 0.8642},
  {'customer_id': 'C0190', 'similarity_score': 0.7735},
  {'customer_id': 'C0006', 'similarity_score': 0.7373}],
 'C0004': [{'customer_id': 'C0099', 'similarity_score': 0.9381},
  {'customer_id': 'C0102', 'similarity_score': 0.9361},
  {'customer_id': 'C0165', 'similarity_score': 0.9281}],
 'C0005': [{'customer_id': 'C0159', 'similarity_score': 0.981},
  {'customer_id': 'C0007', 'similarity_score': 0.9473},
  {'customer_id': 'C0186', 'similarity_score': 0.9072}],
 'C0006': [{'customer_id': 'C0148', 'similarity_score': 0.8388},
  {'customer_id': 'C0168', 'similarity_score':

# **Customer Lookalike Recommendation Analysis Report**

## **Executive Summary**
This report outlines the methodology and results of generating customer lookalike recommendations based on transactional and profile data. Using per-customer feature aggregation, feature standardization, and cosine similarity, we identified customers whose purchasing behaviors and characteristics are similar to those of the target customers. These lookalikes can be used to enhance marketing strategies, customer engagement, and segmentation.

## **Objectives**
The primary objective of this analysis is to identify **lookalike customers** for a specified group of target customers (C0001 to C0020). By leveraging aggregated transaction data and customer profile information, this analysis aims to provide actionable insights that can guide personalized marketing and customer retention strategies.

## **Data Overview**
The analysis is based on a single merged file (`merged_df.csv`) that combines two key datasets:

1. **Transactional Data**: This dataset contains detailed transaction information, including:
   - `CustomerID`: Unique identifier for each customer.
   - `TotalValue`: Total monetary value of the transaction.
   - `Quantity`: The number of products purchased in a given transaction.
   - `ProductID`: Unique identifier for each product purchased.
   - `Category`: Product category.
   - `Month`: The month during which the transaction occurred.

2. **Customer Profile Data**: This dataset includes the following customer-specific details:
   - `CustomerID`: Unique identifier for each customer.
   - `Region`: Geographic location of the customer.
   - `SignupDate`: Date when the customer signed up.

## **Methodology**
The methodology for generating the lookalike recommendations involves the following steps:

### **Step 1: Feature Engineering**
Customer-level features were derived from the transactional data using aggregation functions:

- **Total Spend (TotalValue)**: 
  $$ \text{Total Spend} = \sum (\text{TotalValue}) $$

- **Average Transaction Value**: 
  $$ \text{Average Transaction Value} = \frac{\sum (\text{TotalValue})}{\text{Number of Transactions}} $$

- **Transaction Count**: 
  $$ \text{Transaction Count} = \text{Count of Transactions} $$

- **Total Quantity**: 
  $$ \text{Total Quantity} = \sum (\text{Quantity}) $$

- **Average Quantity per Transaction**: 
  $$ \text{Average Quantity} = \frac{\sum (\text{Quantity})}{\text{Number of Transactions}} $$

- **Unique Categories**: 
  $$ \text{Unique Categories} = \text{Number of Unique Product Categories} $$

- **Unique Products**: 
  $$ \text{Unique Products} = \text{Number of Unique Products} $$

- **Active Months**: 
  $$ \text{Active Months} = \text{Number of Unique Months with Transactions} $$

### **Step 2: Customer Profile Augmentation**
Additional features were derived from the customer profile dataset:

- **Customer Age** (in days) was calculated as the difference between a fixed reference date (`2024-08-25`) and the `SignupDate`:
  $$ \text{Customer Age} = \text{Reference Date} - \text{SignupDate} $$

- **Region**: One-hot encoding was applied to the `Region` column, where each region is represented as a separate binary feature:
  $$ \text{Region}_i = 
  \begin{cases} 
  1 & \text{if customer is from region i} \\
  0 & \text{otherwise}
  \end{cases} $$

### **Step 3: Normalization**
To ensure comparability across customers, all customer features were scaled using `StandardScaler` from `sklearn`. This step standardizes the data, removing any bias due to differing scales of measurement between features. The formula for standardization is:

$$ \text{Standardized Feature} = \frac{X - \mu}{\sigma} $$

Where:
- $X$ is the raw feature value,
- $\mu$ is the mean of the feature, and
- $\sigma$ is the standard deviation of the feature.

### **Step 4: Cosine Similarity Calculation**
Cosine similarity measures the cosine of the angle between two vectors. Given two vectors $A$ and $B$, the formula is:

$$ \text{Cosine Similarity} (A, B) = \frac{A \cdot B}{\|A\| \|B\|} $$

Where:
- $A \cdot B$ is the dot product of the two vectors,
- $\|A\|$ is the magnitude (Euclidean norm) of vector $A$,
- $\|B\|$ is the magnitude (Euclidean norm) of vector $B$.

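Cosine similarity ranges from -1 to 1: a score of 1 means the two feature vectors point in the same direction (most similar), 0 means they are orthogonal (unrelated), and -1 means they point in opposite directions (most dissimilar). Negative scores are possible here because the standardized features can take negative values. A higher cosine similarity score indicates that two customers are more alike in terms of their purchasing behavior and profile.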

### **Step 5: Identification of Lookalike Customers**
For each target customer, the cosine similarity between their feature vector and those of all other customers was computed. The top $N$ most similar customers were identified, where $N = 3$ in this analysis. For a given target customer $i$, the recommendation process is as follows:

1. Calculate the cosine similarity scores between target customer $i$ and all other customers $j$:
   $$ \text{similarity}_{ij} = \text{Cosine Similarity}(\mathbf{f}_i, \mathbf{f}_j) $$

2. Rank the customers based on similarity scores in descending order, excluding the target customer from the recommendations.

3. Select the top N (in this case, 3) customers with the highest similarity scores as lookalikes.

## **Results**
The analysis successfully identified the most similar customers for each target customer based on their transaction behaviors and profile characteristics. The following table summarizes the top 3 lookalikes for a selection of target customers:

| **Target Customer** | **Lookalike 1 (CustomerID)** | **Similarity Score** | **Lookalike 2 (CustomerID)** | **Similarity Score** | **Lookalike 3 (CustomerID)** | **Similarity Score** |
|---------------------|------------------------------|----------------------|------------------------------|----------------------|------------------------------|----------------------|
| C0001               | C0174                        | 0.9654               | C0011                        | 0.9474               | C0152                        | 0.9126               |
| C0002               | C0159                        | 0.8898               | C0005                        | 0.8885               | C0134                        | 0.8414               |
| C0003               | C0129                        | 0.8642               | C0190                        | 0.7735               | C0006                        | 0.7373               |

### **CSV Output**
The lookalike recommendations were saved in a CSV file titled `Lookalike.csv`. The file contains the following structure:

```csv
CustomerID,Lookalikes
C0001,"[('C0174', 0.9654), ('C0011', 0.9474), ('C0152', 0.9126)]"
C0002,"[('C0159', 0.8898), ('C0005', 0.8885), ('C0134', 0.8414)]"
C0003,"[('C0129', 0.8642), ('C0190', 0.7735), ('C0006', 0.7373)]"
...
