In [23]:
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [24]:
def load_and_prepare_data(data_dir='.'):
    """Read the three input CSV files and parse their date columns.

    Parameters
    ----------
    data_dir : str or path-like, default '.'
        Directory containing ``Customers.csv``, ``Products.csv`` and
        ``Transactions.csv``. The default keeps the previous behaviour of
        reading from the current working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(customers, products, transactions)`` where
        ``customers['SignupDate']`` and ``transactions['TransactionDate']``
        have been converted with ``pd.to_datetime``.
    """
    base_path = Path(data_dir)

    # Load datasets
    customers = pd.read_csv(base_path / 'Customers.csv')
    products = pd.read_csv(base_path / 'Products.csv')
    transactions = pd.read_csv(base_path / 'Transactions.csv')

    # Convert dates to datetime so that date arithmetic works downstream
    customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
    transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

    return customers, products, transactions

In [25]:
def create_customer_features(customers, transactions, products, reference_date=None):
    """Build one numeric feature row per customer for similarity search.

    Parameters
    ----------
    customers : pandas.DataFrame
        Must provide ``CustomerID``, ``Region`` and a datetime ``SignupDate``.
    transactions : pandas.DataFrame
        Must provide ``TransactionID``, ``CustomerID``, ``ProductID``,
        ``TotalValue``, ``Quantity`` and a datetime ``TransactionDate``.
    products : pandas.DataFrame
        Must provide ``ProductID`` and ``Category``.
    reference_date : datetime-like, optional
        Date from which ``signup_age_days`` is measured. Defaults to
        ``2024-01-27``, the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``CustomerID``, the five transaction aggregates,
        one purchase-count column per product category,
        ``account_age_days``, one ``Region_*`` dummy per region, and
        ``signup_age_days``.

    Notes
    -----
    All merges are inner joins, so customers without any transaction (or
    whose transactions reference no known product) are absent from the
    result.
    """
    if reference_date is None:
        reference_date = pd.Timestamp('2024-01-27')
    else:
        reference_date = pd.Timestamp(reference_date)

    by_customer = transactions.groupby('CustomerID')

    # Customer-level transaction metrics; named aggregation yields flat
    # column names directly instead of a MultiIndex that must be renamed.
    transaction_metrics = by_customer.agg(
        num_transactions=('TransactionID', 'count'),
        total_spend=('TotalValue', 'sum'),
        avg_transaction_value=('TotalValue', 'mean'),
        total_quantity=('Quantity', 'sum'),
        avg_quantity=('Quantity', 'mean'),
    ).reset_index()

    # Category preferences: number of transactions per (customer, category),
    # pivoted so each category becomes a column (0 where never purchased).
    category_preferences = (
        transactions
        .merge(products[['ProductID', 'Category']], on='ProductID')
        .groupby(['CustomerID', 'Category'])
        .size()
        .unstack(fill_value=0)
    )

    # Days between a customer's first and last transaction. The column keeps
    # its historical name ``account_age_days`` although it measures the span
    # of purchase activity, not time since signup.
    transaction_dates = by_customer['TransactionDate']
    activity_span = (
        (transaction_dates.max() - transaction_dates.min())
        .dt.days
        .reset_index(name='account_age_days')
    )

    # Combine all features (inner joins on CustomerID)
    customer_features = (
        customers[['CustomerID', 'Region', 'SignupDate']]
        .merge(transaction_metrics, on='CustomerID')
        .merge(category_preferences, left_on='CustomerID', right_index=True)
        .merge(activity_span, on='CustomerID')
    )

    # Convert Region to dummy variables
    customer_features = pd.get_dummies(customer_features, columns=['Region'])

    # Days elapsed between signup and the reference date
    customer_features['signup_age_days'] = (
        reference_date - customer_features['SignupDate']
    ).dt.days

    # The raw timestamp is not a usable numeric feature; return a new frame
    # rather than mutating in place.
    return customer_features.drop(columns='SignupDate')


In [26]:
def find_lookalikes(customer_features, target_customer_id, n_recommendations=3):
    """Return the customers most similar to ``target_customer_id``.

    Features (every column except ``CustomerID``) are standardised with
    ``StandardScaler`` and compared with cosine similarity.

    Parameters
    ----------
    customer_features : pandas.DataFrame
        One row per customer with a ``CustomerID`` column; all remaining
        columns are treated as features.
    target_customer_id : hashable
        Value to look up in ``customer_features['CustomerID']``.
    n_recommendations : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``similar_customer_id`` and ``similarity_score``, ordered
        from most to least similar. The target customer itself is never
        included.

    Raises
    ------
    IndexError
        If ``target_customer_id`` does not occur in ``customer_features``.
    """
    customer_ids = customer_features['CustomerID'].to_numpy()

    # Locate the target by *position*, not index label: the scaled feature
    # array is positional, so a label would be wrong for any frame whose
    # index is not the default 0..n-1 range.
    matches = np.flatnonzero(customer_ids == target_customer_id)
    if matches.size == 0:
        raise IndexError(
            f"Customer {target_customer_id!r} not found in customer_features"
        )
    target_pos = int(matches[0])

    # Standardize features
    feature_columns = customer_features.columns.drop('CustomerID')
    scaled_features = StandardScaler().fit_transform(customer_features[feature_columns])

    # Only the target's row of the similarity matrix is needed, so compare
    # that single row against all rows instead of building the n x n matrix.
    similarities = cosine_similarity(
        scaled_features[target_pos:target_pos + 1], scaled_features
    )[0]

    # Rank by descending similarity and drop the target explicitly. Relying
    # on "self is always first" fails when another customer ties at 1.0 or
    # floating-point noise reorders the top entries.
    ranked = np.argsort(-similarities, kind='stable')
    ranked = ranked[ranked != target_pos][:max(n_recommendations, 0)]

    # Create recommendations dataframe
    return pd.DataFrame({
        'similar_customer_id': customer_ids[ranked],
        'similarity_score': similarities[ranked],
    })



In [27]:
def generate_lookalike_recommendations(customer_ids):
    """Compute lookalike customers for every ID in ``customer_ids``.

    The input data is loaded with ``load_and_prepare_data`` and turned into a
    feature table by ``create_customer_features``; ``find_lookalikes`` is then
    called once per requested customer with its default number of
    recommendations.

    Parameters
    ----------
    customer_ids : iterable
        Customer IDs to generate recommendations for.

    Returns
    -------
    dict
        Maps each customer ID to a list of record dicts (the
        ``to_dict('records')`` form of the frame returned by
        ``find_lookalikes``). Exceptions raised by ``find_lookalikes``
        are not caught here.
    """
    customers, products, transactions = load_and_prepare_data()
    feature_table = create_customer_features(customers, transactions, products)

    # One entry per requested customer, in iteration order.
    return {
        cid: find_lookalikes(feature_table, cid).to_dict('records')
        for cid in customer_ids
    }


In [28]:
def save_recommendations_to_csv(recommendations, output_file):
    """Write lookalike recommendations to a two-column CSV file.

    Parameters
    ----------
    recommendations : dict
        Maps a customer ID to a list of dicts, each holding
        ``'similar_customer_id'`` and a numeric ``'similarity_score'``.
    output_file : str or path-like
        Destination passed to ``DataFrame.to_csv``.

    Notes
    -----
    Each output row has ``customer_id`` and ``lookalikes``; the latter joins
    the customer's recommendations as ``<id>:<score to 4 decimals>`` separated
    by ``|``. The header row is written even when ``recommendations`` is
    empty, so the file always has the same schema.
    """
    rows = [
        {
            'customer_id': customer_id,
            'lookalikes': '|'.join(
                f"{rec['similar_customer_id']}:{rec['similarity_score']:.4f}"
                for rec in lookalikes
            ),
        }
        for customer_id, lookalikes in recommendations.items()
    ]

    # Pin the columns explicitly: a frame built from an empty list would
    # otherwise have no columns and produce a header-less file.
    pd.DataFrame(rows, columns=['customer_id', 'lookalikes']).to_csv(
        output_file, index=False
    )

In [29]:
# Build lookalike recommendations for customer IDs C0001 through C0020
# (numeric part zero-padded to four digits).
target_customers = ['C' + f'{number:04d}' for number in range(1, 21)]
recommendations = generate_lookalike_recommendations(target_customers)

# Persist the results to disk
save_recommendations_to_csv(recommendations, 'Nityam_Nityam_Lookalike.csv')