# In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

# Read the three raw input tables from the working directory.
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Parse the date columns into datetime64 so date arithmetic works downstream.
customers = customers.assign(SignupDate=pd.to_datetime(customers['SignupDate']))
transactions = transactions.assign(
    TransactionDate=pd.to_datetime(transactions['TransactionDate'])
)

# Attach customer attributes, then product attributes, to every transaction.
# Both joins use the default inner join, so a transaction without a matching
# customer or product row is dropped from the combined table.
combined_data = pd.merge(transactions, customers, on='CustomerID')
combined_data = pd.merge(combined_data, products, on='ProductID')

# Per-customer purchase summary: mean transaction value, total units bought,
# and number of transactions.
customer_metrics = (
    combined_data
    .groupby('CustomerID')
    .agg(
        AvgTransactionValue=('TotalValue', 'mean'),
        TotalQuantity=('Quantity', 'sum'),
        NumTransactions=('TransactionID', 'count'),
    )
    .reset_index()
)

# Signup offset: whole days between each customer's signup and the earliest
# signup date found in the customers table.
first_signup_date = customers['SignupDate'].min()
signup_by_customer = customers.set_index('CustomerID')['SignupDate']
signup_offsets = customer_metrics['CustomerID'].map(signup_by_customer) - first_signup_date
customer_metrics['SignupDays'] = signup_offsets.dt.days

# Category preferences: count purchases per (customer, category) and spread the
# categories into columns, with 0 where a customer never bought from a category.
category_pivot = (
    combined_data
    .groupby(['CustomerID', 'Category'])
    .size()
    .unstack(fill_value=0)
)

# Divide each row by its total so every customer's category shares sum to 1,
# then bring CustomerID back as the first regular column.
purchases_per_customer = category_pivot.sum(axis=1)
category_pivot = category_pivot.div(purchases_per_customer, axis=0).reset_index()

# Assemble the feature table: metrics + category shares (missing values -> 0),
# then the customer's Region.
customer_data = pd.merge(customer_metrics, category_pivot, on='CustomerID', how='left').fillna(0)
customer_data = pd.merge(
    customer_data,
    customers[['CustomerID', 'Region']],
    on='CustomerID',
    how='left',
)

# Feature groups consumed by the preprocessing step.
numerical_features = ['SignupDays', 'AvgTransactionValue', 'TotalQuantity', 'NumTransactions']
categorical_features = ['Region']
category_features = category_pivot.columns[1:].tolist()  # skip the leading 'CustomerID' column
all_features = [*numerical_features, *categorical_features, *category_features]

# Preprocessor: standardize the numeric metrics and one-hot encode Region.
# Every other selected column (the per-category purchase shares) is passed
# through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='passthrough',  # Keep category features
    # OneHotEncoder produces sparse output by default, and ColumnTransformer
    # returns a scipy sparse matrix whenever the stacked result's density is
    # below sparse_threshold (default 0.3). pd.DataFrame cannot wrap that with
    # named columns, so always request a dense array.
    sparse_threshold=0,
)

# Apply preprocessing
processed_features = preprocessor.fit_transform(customer_data[all_features])

# Wrap the transformed array in a DataFrame keyed by CustomerID, using the
# transformer-generated output column names.
processed_features_df = pd.DataFrame(
    processed_features,
    columns=preprocessor.get_feature_names_out(),
    index=customer_data['CustomerID'],
)

# Pairwise cosine similarity between all customers; rows and columns are both
# labelled by CustomerID so entries can be looked up by ID.
similarity_matrix = cosine_similarity(processed_features_df)
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=processed_features_df.index,
    columns=processed_features_df.index,
)


def get_top_lookalikes(customer_id, similarity_df, top_n=3):
    """
    Return the top N lookalike customers for a given customer ID.

    The customer's own entry is removed from the candidates before ranking, so
    a customer is never reported as its own lookalike. ``similarity_df`` is
    not modified.

    Args:
        customer_id (str): The customer ID for which to find lookalikes.
        similarity_df (pd.DataFrame): Similarity matrix whose index and
            columns are customer IDs.
        top_n (int): Maximum number of lookalikes to return.

    Returns:
        List[Tuple[str, float]]: (customer ID, similarity score) pairs ordered
        from most to least similar. Contains fewer than ``top_n`` pairs when
        there are not enough other customers.

    Raises:
        ValueError: If ``customer_id`` is not in the index of ``similarity_df``.
    """
    if customer_id not in similarity_df.index:
        raise ValueError(f"Customer ID {customer_id} not found in similarity matrix.")

    # Drop the self-similarity entry from a derived Series instead of writing a
    # sentinel into the caller's matrix; drop() returns a new object.
    candidate_scores = similarity_df.loc[customer_id].drop(labels=customer_id, errors='ignore')

    # Get top N most similar customers
    top_customers = candidate_scores.nlargest(top_n)

    # Convert scores to built-in floats so that str() of the result shows plain
    # numbers rather than NumPy scalar reprs.
    return [(cust_id, float(score)) for cust_id, score in top_customers.items()]


# Build the lookalike map for the first 20 rows of the customer feature table.
# A failure for one customer is reported and skipped so the rest still run.
lookalike_map = {}
for customer in customer_data['CustomerID'].iloc[:20]:
    try:
        lookalike_map[customer] = get_top_lookalikes(customer, similarity_df)
    except Exception as e:
        print(f"Error processing customer {customer}: {e}")

# One output row per customer; the list of (id, score) pairs is stored as its
# string representation in the 'Lookalikes' column.
lookalike_rows = []
for cust_id, lookalikes in lookalike_map.items():
    lookalike_rows.append({'CustomerID': cust_id, 'Lookalikes': str(lookalikes)})

# Write the results to CSV without the DataFrame's positional index.
lookalike_df = pd.DataFrame(lookalike_rows)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike map saved as 'Lookalike.csv'.")


# Output: Lookalike map saved as 'Lookalike.csv'.
