In [2]:
# Importing Library
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [3]:
# 1. Load and prepare data
def prepare_customer_features():
    # Load datasets
    customers_df = pd.read_csv('/content/Customers.csv')
    transactions_df = pd.read_csv('/content/Transactions.csv')

    # Convert dates to datetime
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

    # Create customer transaction features
    transaction_features = transactions_df.groupby('CustomerID').agg({
        'TransactionID': 'count',  # Frequency
        'TotalValue': ['sum', 'mean'],  # Monetary
        'Quantity': ['sum', 'mean'],  # Volume
        'TransactionDate': lambda x: (x.max() - x.min()).days  # Time span
    }).reset_index()

    # Flatten column names
    transaction_features.columns = ['CustomerID', 'transaction_count', 'total_spend',
                                  'avg_transaction_value', 'total_quantity', 'avg_quantity',
                                  'purchase_timespan']

    # Create RFM features
    latest_date = transactions_df['TransactionDate'].max()
    last_purchase = transactions_df.groupby('CustomerID')['TransactionDate'].max()
    recency = (latest_date - last_purchase).dt.days

    transaction_features['recency'] = recency

    # Encode regions using one-hot encoding
    region_dummies = pd.get_dummies(customers_df['Region'], prefix='region')
    customers_df = pd.concat([customers_df, region_dummies], axis=1)

    # Calculate signup recency
    latest_signup = customers_df['SignupDate'].max()
    customers_df['signup_recency'] = (latest_signup - customers_df['SignupDate']).dt.days

    # Merge all features
    feature_columns = ['CustomerID', 'signup_recency'] + [col for col in region_dummies.columns]
    customer_features = pd.merge(
        customers_df[feature_columns],
        transaction_features,
        on='CustomerID',
        how='left'
    )

    # Fill NaN values for customers with no transactions
    customer_features = customer_features.fillna(0)

    return customer_features

In [4]:
# 2. Create similarity model
def create_similarity_model(customer_features):
    # Select features for similarity calculation
    feature_cols = [col for col in customer_features.columns if col != 'CustomerID']

    # Scale features
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(customer_features[feature_cols])

    # Calculate similarity matrix
    similarity_matrix = cosine_similarity(scaled_features)

    return similarity_matrix, customer_features['CustomerID'].values


In [5]:
# 3. Get top lookalikes
def get_top_lookalikes(customer_id, similarity_matrix, customer_ids, n=3):
    # Find index of customer
    customer_idx = np.where(customer_ids == customer_id)[0][0]

    # Get similarity scores for this customer
    similarity_scores = similarity_matrix[customer_idx]

    # Get indices of top similar customers (excluding self)
    similar_indices = np.argsort(similarity_scores)[::-1][1:n+1]

    # Get customer IDs and scores
    similar_customers = [
        (customer_ids[idx], similarity_scores[idx])
        for idx in similar_indices
    ]

    return similar_customers

In [7]:
# Main execution
def main():
    # Prepare features
    print("Preparing customer features...")
    customer_features = prepare_customer_features()

    # Create similarity model
    print("Creating similarity model...")
    similarity_matrix, customer_ids = create_similarity_model(customer_features)

    # Generate lookalikes for first 20 customers
    print("Generating lookalikes...")
    results = {}

    for i in range(20):
        customer_id = f'C{str(i+1).zfill(4)}'
        lookalikes = get_top_lookalikes(customer_id, similarity_matrix, customer_ids)
        results[customer_id] = [
            {'customer_id': cust_id, 'similarity_score': float(score)}
            for cust_id, score in lookalikes
        ]

    # Create output DataFrame
    output_rows = []
    for customer_id, lookalikes in results.items():
        row = {
            'customer_id': customer_id,
            'lookalike_1': lookalikes[0]['customer_id'],
            'score_1': round(lookalikes[0]['similarity_score'], 4),
            'lookalike_2': lookalikes[1]['customer_id'],
            'score_2': round(lookalikes[1]['similarity_score'], 4),
            'lookalike_3': lookalikes[2]['customer_id'],
            'score_3': round(lookalikes[2]['similarity_score'], 4)
        }
        output_rows.append(row)

    output_df = pd.DataFrame(output_rows)

    # Save to CSV
    output_df.to_csv('/content/Lookalike.csv', index=False)
    print("\nResults saved to Lookalike.csv")
    print("\nSample results:")
    print(output_df.head())

if __name__ == "__main__":
    main()

Preparing customer features...
Creating similarity model...
Generating lookalikes...

Results saved to Lookalike.csv

Sample results:
  customer_id lookalike_1  score_1 lookalike_2  score_2 lookalike_3  score_3
0       C0001       C0152   0.9532       C0174   0.9468       C0011   0.9364
1       C0002       C0159   0.9115       C0005   0.8961       C0106   0.8835
2       C0003       C0190   0.8571       C0129   0.8517       C0091   0.7606
3       C0004       C0113   0.9820       C0165   0.9700       C0102   0.9462
4       C0005       C0159   0.9715       C0007   0.9210       C0002   0.8961
