In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import warnings
# NOTE(review): with no category/module filter this silences every warning
# raised for the rest of the kernel session, including deprecation notices
# from pandas / scikit-learn. Consider narrowing the filter.
warnings.filterwarnings('ignore')

In [10]:
def create_customer_features(customers_df, transactions_df, products_df,
                             include_time_features=False):
    """Build one row of numeric features per customer for similarity matching.

    The inputs are never modified; renamed copies are used internally.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Must provide ``CustomerID`` and ``Region``.
    transactions_df : pandas.DataFrame
        Must provide ``TransactionID``, ``CustomerID``, ``ProductID``,
        ``TotalValue``, ``Quantity`` and ``Price``. ``TransactionDate`` is
        only read when ``include_time_features`` is true.
    products_df : pandas.DataFrame
        Must provide ``ProductID``, ``Category`` and ``Price``.
    include_time_features : bool, default False
        When true, add ``days_active`` (whole days between a customer's first
        and last transaction). Off by default so the feature set matches what
        this function has always returned.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``CustomerID``. Holds purchase aggregates, per-category
        spend shares (``cat_pct_*``), one-hot region flags (``region_*``),
        optionally ``days_active``, and ``avg_items_per_transaction``.
        Customers missing from any feature group get 0 for that group.

    Side effects
    ------------
    Prints a short summary of the generated features to stdout.
    """
    # Both tables carry a 'Price' column; rename so the merge keeps them apart.
    transactions = transactions_df.rename(columns={'Price': 'TransactionPrice'})
    products = products_df.rename(columns={'Price': 'BasePrice'})
    # Inner merge: transactions whose ProductID is absent from products drop out.
    trans_prod = transactions.merge(products, on='ProductID')

    # 1. Purchase behaviour. Named aggregation ties each output column to its
    #    source explicitly instead of relying on positional renaming.
    purchase_features = trans_prod.groupby('CustomerID').agg(
        num_transactions=('TransactionID', 'count'),
        total_spend=('TotalValue', 'sum'),
        avg_transaction=('TotalValue', 'mean'),
        total_quantity=('Quantity', 'sum'),
        avg_quantity=('Quantity', 'mean'),
        avg_price=('TransactionPrice', 'mean'),
        price_std=('TransactionPrice', 'std'),
    ).fillna(0)  # std is NaN for single-transaction customers

    # Price sensitivity = coefficient of variation; a zero mean is swapped
    # for 1 so the division cannot produce inf/NaN.
    purchase_features['price_sensitivity'] = (
        purchase_features['price_std'] / purchase_features['avg_price'].replace(0, 1)
    )

    # 2. Category preferences as a share of each customer's total spend
    category_spend = pd.pivot_table(
        trans_prod,
        values='TotalValue',
        index='CustomerID',
        columns='Category',
        aggfunc='sum',
        fill_value=0
    )
    category_percentages = category_spend.div(
        category_spend.sum(axis=1), axis=0
    ).fillna(0)  # 0/0 when a customer's total spend is zero
    # str() keeps the renaming safe for non-string category labels.
    category_percentages.columns = [
        f'cat_pct_{str(col).lower().replace(" ", "_")}'
        for col in category_percentages.columns
    ]

    # 3. Region (one-hot). Forcing float keeps the combined frame numeric even
    #    when the outer concat below introduces missing rows for bool dummies.
    region_features = pd.get_dummies(
        customers_df.set_index('CustomerID')['Region'],
        prefix='region',
        dtype=float
    )

    feature_blocks = [purchase_features, category_percentages, region_features]

    # 4. Time-based features (opt-in). Uses every transaction, not only those
    #    that matched a product in the merge above.
    if include_time_features:
        purchase_dates = pd.to_datetime(transactions['TransactionDate'])
        date_span = purchase_dates.groupby(transactions['CustomerID']).agg(['min', 'max'])
        days_active = (date_span['max'] - date_span['min']).dt.days
        feature_blocks.append(days_active.to_frame('days_active'))

    # Outer-align on CustomerID; customers absent from a block get zeros.
    customer_features = pd.concat(feature_blocks, axis=1).fillna(0)

    # Derived metric; zero transaction counts are swapped for 1 before dividing.
    # NOTE(review): this equals avg_quantity whenever Quantity and
    # TransactionID contain no missing values - confirm it is wanted twice.
    customer_features['avg_items_per_transaction'] = (
        customer_features['total_quantity']
        / customer_features['num_transactions'].replace(0, 1)
    )

    # feature summary
    print("\nFeature Set Summary:")
    print("\nPurchase Features:")
    print(purchase_features.describe().round(2))
    print("\nCategory Preferences (sample):")
    print(category_percentages.head().round(2))
    print(f"\nTotal Features: {customer_features.shape[1]}")

    return customer_features

def calculate_similarity_scores(customer_features):
    """Return a customer-by-customer cosine similarity table.

    The feature frame is passed through ``StandardScaler`` first, then
    ``cosine_similarity`` is evaluated on the scaled rows. The result is a
    square DataFrame whose rows and columns both reuse the index of
    ``customer_features``.
    """
    customer_ids = customer_features.index

    # Scale before comparing so every feature enters on a comparable footing.
    standardized = StandardScaler().fit_transform(customer_features)

    return pd.DataFrame(
        cosine_similarity(standardized),
        index=customer_ids,
        columns=customer_ids,
    )

def get_top_lookalikes(customer_id, similarity_df, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    Reads the ``customer_id`` column of ``similarity_df``, ranks it from
    highest to lowest score, drops the customer's own entry and keeps the
    first ``n`` rows. The result is a DataFrame with the columns
    ``similar_customer`` and ``similarity_score``.
    """
    # Rank first, then remove the self-match, so ordering among the
    # remaining customers is exactly the ranked order.
    ranked = similarity_df[customer_id].sort_values(ascending=False)
    others = ranked.loc[ranked.index != customer_id]
    best = others.iloc[:n]

    result = pd.DataFrame({
        'similar_customer': best.index,
        'similarity_score': best.values,
    })
    return result

def create_lookalike_recommendations(customers_df, transactions_df, products_df, 
                                   start_id='C0001', end_id='C0020'):
    """Run the full lookalike pipeline for a range of customer IDs.

    Builds the customer feature table, derives the similarity table from it,
    and collects the top lookalikes for every ``CustomerID`` in
    ``customers_df`` that falls between ``start_id`` and ``end_id``
    (both ends included; the comparison is done on the ID values as stored).

    Returns a dict mapping each selected customer ID to the DataFrame
    produced by ``get_top_lookalikes``. Any exception is reported on stdout
    and then re-raised unchanged.
    """
    try:
        print("Creating customer features...")
        features = create_customer_features(customers_df, transactions_df, products_df)

        print("\nCalculating similarity scores...")
        similarities = calculate_similarity_scores(features)

        print("\nGenerating recommendations...")
        all_ids = customers_df['CustomerID']
        # between() is inclusive on both ends, i.e. >= start_id and <= end_id.
        selected_ids = all_ids[all_ids.between(start_id, end_id)]

        return {
            cid: get_top_lookalikes(cid, similarities)
            for cid in selected_ids
        }

    except Exception as e:
        print(f"Error in recommendation generation: {str(e)}")
        raise

def save_recommendations_to_csv(recommendations, output_file='Lookalike.csv'):
    """Flatten the recommendations and write them to a CSV file.

    Parameters
    ----------
    recommendations : dict
        Maps a customer ID to a DataFrame with the columns
        ``similar_customer`` and ``similarity_score``.
    output_file : str, default 'Lookalike.csv'
        Destination path for the CSV.

    Returns
    -------
    pandas.DataFrame
        One row per customer with the columns ``customer_id`` and
        ``similar_customers``. The latter is a ``|``-joined list of
        ``<id>:<score>`` entries with the score formatted to four decimals.
        The same frame is what gets written to ``output_file``.

    Notes
    -----
    The column set is fixed explicitly, so an empty ``recommendations`` dict
    still yields a CSV with a header row instead of a header-less file.
    """
    output_columns = ['customer_id', 'similar_customers']

    rows = [
        {
            'customer_id': cust_id,
            # Walk the two columns in parallel rather than using iterrows().
            'similar_customers': '|'.join(
                f"{other_id}:{score:.4f}"
                for other_id, score in zip(
                    similar_customers['similar_customer'],
                    similar_customers['similarity_score'],
                )
            ),
        }
        for cust_id, similar_customers in recommendations.items()
    ]

    recommendations_df = pd.DataFrame(rows, columns=output_columns)
    recommendations_df.to_csv(output_file, index=False)
    print(f"\nRecommendations saved to {output_file}")
    return recommendations_df

In [11]:
print("Loading data...")
# Read the three input tables in a fixed order: customers, products, transactions.
customers_df, products_df, transactions_df = map(
    pd.read_csv,
    ('data/Customers.csv', 'data/Products.csv', 'data/Transactions.csv'),
)

# Build the lookalike tables using the function's default customer ID range
recommendations = create_lookalike_recommendations(
    customers_df=customers_df,
    transactions_df=transactions_df,
    products_df=products_df,
)

# Write the CSV, then preview the first rows of what was saved
recommendations_df = save_recommendations_to_csv(recommendations)
print("\nSample Recommendations:")
print(recommendations_df.head())

Loading data...
Creating customer features...

Feature Set Summary:

Purchase Features:
       num_transactions  total_spend  avg_transaction  total_quantity  \
count            199.00       199.00           199.00          199.00   
mean               5.03      3467.31           687.58           12.75   
std                2.20      1832.68           237.94            6.15   
min                1.00        82.36            82.36            1.00   
25%                3.00      2162.04           542.94            8.50   
50%                5.00      3137.66           677.21           12.00   
75%                6.00      4770.23           828.62           17.00   
max               11.00     10673.87          1323.13           32.00   

       avg_quantity  avg_price  price_std  price_sensitivity  
count        199.00     199.00     199.00             199.00  
mean           2.53     271.75     128.59               0.51  
std            0.56      69.68      55.10               0.26  
mi