In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the three input tables from the current working directory.
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

# Attach customer and product attributes to every transaction. Both merges use
# the default inner join, so transactions whose CustomerID or ProductID is
# missing from the lookup tables are dropped here.
transactions_enriched = transactions.merge(customers, on='CustomerID')
transactions_enriched = transactions_enriched.merge(products, on='ProductID')

# Customer x product interaction matrix: one row per customer, one column per
# product, each cell holding the summed Quantity (0 where there was no purchase).
customer_product_matrix = pd.pivot_table(
    transactions_enriched,
    index='CustomerID',
    columns='ProductID',
    values='Quantity',
    aggfunc='sum',
    fill_value=0,
)

# Standardize each product column so that high-volume products do not
# dominate the similarity computation.
scaler = StandardScaler()
normalized_matrix = scaler.fit_transform(customer_product_matrix)

In [3]:
# Pairwise cosine similarity between the customer rows of the scaled matrix.
similarity_matrix = cosine_similarity(normalized_matrix)

# Label both axes with CustomerID so scores can be looked up by customer
# rather than by row position.
customer_ids = customer_product_matrix.index
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [4]:
# Function to get top N similar customers
def get_top_similar(customers_df, similarity_df, customer_id, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Args:
        customers_df: Not used by this function; kept in the signature so
            existing positional calls keep working.
        similarity_df: DataFrame of pairwise similarity scores. The column
            labelled ``customer_id`` is read, and its index labels are the
            candidate customer ids.
        customer_id: Label of the customer to find lookalikes for.
        top_n: Maximum number of lookalikes to return. Values below 1 yield
            an empty list.

    Returns:
        A list of ``(similar_customer_id, score)`` tuples ordered from the
        highest score to the lowest. The list is shorter than ``top_n`` when
        fewer other customers are available.

    Raises:
        KeyError: If ``customer_id`` is not a column of ``similarity_df``.
    """
    scores = similarity_df[customer_id]

    # Exclude the customer by label instead of assuming it sorts first: with
    # tied scores (or a self-score that is not the maximum) a positional skip
    # would discard a genuine neighbour and return the customer itself.
    other_scores = scores.drop(labels=customer_id, errors='ignore')

    similar_customers = (
        other_scores
        .sort_values(ascending=False)
        .iloc[:max(top_n, 0)]
    )
    return list(zip(similar_customers.index, similar_customers.values))

In [5]:
# Generate lookalikes for the first 20 customers
lookalike_map = {}
for customer_id in customers['CustomerID'].head(20):
    if customer_id in similarity_df.columns:
        lookalike_map[customer_id] = get_top_similar(customers, similarity_df, customer_id)
    else:
        # A customer with no transactions never enters the interaction matrix,
        # so there is no similarity column to read. Record an empty result
        # instead of failing with a KeyError.
        lookalike_map[customer_id] = []

# Create Lookalike.csv: one row per (customer, lookalike) pair
lookalike_data = [
    {"cust_id": cust_id, "similar_cust_id": similar_cust_id, "score": score}
    for cust_id, lookalikes in lookalike_map.items()
    for similar_cust_id, score in lookalikes
]

# Explicit columns keep the CSV header intact even if no pairs were produced.
lookalike_df = pd.DataFrame(lookalike_data, columns=["cust_id", "similar_cust_id", "score"])
lookalike_df.to_csv("FirstName_LastName_Lookalike.csv", index=False)