In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import cdist

def create_customer_features(customers_df, transactions_df, products_df):
    """Create feature matrix for customers based on their profile and transaction history"""

    # Merge transactions with products to get category information
    trans_prod = transactions_df.merge(products_df, on='ProductID')

    # Calculate customer transaction features
    customer_features = transactions_df.groupby('CustomerID').agg({
        'TransactionID': 'count',  # Number of transactions
        'TotalValue': ['sum', 'mean'],  # Total and average spend
        'Quantity': ['sum', 'mean']  # Total and average quantity
    }).round(2)

    # Flatten column names
    customer_features.columns = ['transaction_count', 'total_spend',
                               'avg_transaction_value', 'total_quantity',
                               'avg_quantity']

    # Calculate category preferences (percentage of spend in each category)
    category_pivot = pd.pivot_table(
        trans_prod,
        values='TotalValue',
        index='CustomerID',
        columns='Category',
        aggfunc='sum',
        fill_value=0
    )

    # Convert to percentage of total spend
    category_percentages = category_pivot.div(category_pivot.sum(axis=1), axis=0)
    category_percentages.columns = [f'category_perc_{col.lower()}' for col in category_percentages.columns]

    # Merge all features
    customer_features = customer_features.merge(
        category_percentages,
        left_index=True,
        right_index=True,
        how='left'
    )

    # Add customer region as one-hot encoded features
    region_dummies = pd.get_dummies(customers_df.set_index('CustomerID')['Region'],
                                  prefix='region')
    customer_features = customer_features.merge(
        region_dummies,
        left_index=True,
        right_index=True,
        how='left'
    )

    # Fill any missing values with 0
    customer_features = customer_features.fillna(0)

    return customer_features

def find_lookalikes(target_id, customer_features, n_recommendations=3):
    """Find top N similar customers for a given target customer"""

    # Scale features
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(customer_features)

    # Convert back to DataFrame with original index
    scaled_features = pd.DataFrame(
        scaled_features,
        index=customer_features.index,
        columns=customer_features.columns
    )

    # Get target customer features
    target_features = scaled_features.loc[target_id].values.reshape(1, -1)

    # Calculate distances to all other customers
    distances = cdist(target_features, scaled_features, metric='euclidean')[0]

    # Create DataFrame with distances
    distance_df = pd.DataFrame({
        'CustomerID': customer_features.index,
        'Distance': distances
    })

    # Sort by distance and get top N (excluding the target customer)
    similar_customers = distance_df[distance_df['CustomerID'] != target_id].nsmallest(
        n_recommendations, 'Distance'
    )

    # Convert distance to similarity score (inversely proportional to distance)
    similar_customers['SimilarityScore'] = 1 / (1 + similar_customers['Distance'])

    # Normalize similarity scores to 0-1 range
    similar_customers['SimilarityScore'] = (similar_customers['SimilarityScore'] -
                                          similar_customers['SimilarityScore'].min()) / \
                                         (similar_customers['SimilarityScore'].max() -
                                          similar_customers['SimilarityScore'].min())

    return similar_customers[['CustomerID', 'SimilarityScore']]

# Main execution
def generate_lookalike_recommendations():
    # Load datasets
    customers_df = pd.read_csv('Customers.csv')
    products_df = pd.read_csv('Products.csv')
    transactions_df = pd.read_csv('Transactions.csv')

    # Create feature matrix
    customer_features = create_customer_features(customers_df, transactions_df, products_df)

    # Generate recommendations for customers C0001-C0020
    recommendations = {}
    for cust_id in [f'C{str(i).zfill(4)}' for i in range(1, 21)]:
        if cust_id in customer_features.index:
            similar_customers = find_lookalikes(cust_id, customer_features)
            recommendations[cust_id] = similar_customers.to_dict('records')

    # Create output DataFrame
    output_rows = []
    for target_id, similars in recommendations.items():
        row = {
            'target_customer': target_id,
            'similar_customer1': similars[0]['CustomerID'],
            'score1': round(similars[0]['SimilarityScore'], 3),
            'similar_customer2': similars[1]['CustomerID'],
            'score2': round(similars[1]['SimilarityScore'], 3),
            'similar_customer3': similars[2]['CustomerID'],
            'score3': round(similars[2]['SimilarityScore'], 3)
        }
        output_rows.append(row)

    output_df = pd.DataFrame(output_rows)

    # Save to CSV
    output_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)

    return output_df

# Run the model
lookalike_results = generate_lookalike_recommendations()
print("\nSample of lookalike recommendations:")
print(lookalike_results.head())


Sample of lookalike recommendations:
  target_customer similar_customer1  score1 similar_customer2  score2  \
0           C0001             C0181     1.0             C0192   0.627   
1           C0002             C0159     1.0             C0106   0.495   
2           C0003             C0031     1.0             C0195   0.709   
3           C0004             C0113     1.0             C0104   0.306   
4           C0005             C0007     1.0             C0146   0.008   

  similar_customer3  score3  
0             C0120     0.0  
1             C0178     0.0  
2             C0091     0.0  
3             C0012     0.0  
4             C0186     0.0  
