In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the three source tables from the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Parse the date columns into pandas datetime values.
customers = customers.assign(SignupDate=pd.to_datetime(customers["SignupDate"]))
transactions = transactions.assign(
    TransactionDate=pd.to_datetime(transactions["TransactionDate"])
)

In [3]:
# Customer x product matrix: each cell is the summed Quantity for that
# (CustomerID, ProductID) pair; pairs with no transactions become 0.
interaction_matrix = pd.pivot_table(
    transactions,
    index="CustomerID",
    columns="ProductID",
    values="Quantity",
    aggfunc="sum",
).fillna(0)

# Standardize the matrix column by column (one column per product).
# The result is a plain NumPy array, so the customer labels live only on
# interaction_matrix.index from here on.
scaler = StandardScaler().fit(interaction_matrix)
interaction_matrix_scaled = scaler.transform(interaction_matrix)

In [4]:
# Pairwise cosine similarity between the rows (customers) of the scaled matrix.
similarity_matrix = cosine_similarity(interaction_matrix_scaled)

# Re-attach the customer IDs to both axes so similarities can be looked up
# by CustomerID.
customer_similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=interaction_matrix.index,
    columns=interaction_matrix.index,
)

In [5]:
# Function to get top 3 similar customers for a given customer
def get_top_3_similar(customers_df, customer_id, top_n=3):
    """Return the customers most similar to ``customer_id``.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Square similarity table whose index and columns are both customer IDs.
    customer_id : hashable
        Column label of the customer to find lookalikes for.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    tuple[list, list]
        ``(ids, scores)`` ordered from most to least similar. Both lists hold
        fewer than ``top_n`` entries when not enough other customers exist.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a column of ``customers_df``.
    """
    # Remove the customer's own entry by label rather than assuming it sorts
    # first: round-off or another customer with an identical vector can push
    # the self-similarity out of position 0.
    candidate_scores = customers_df[customer_id].drop(labels=customer_id, errors="ignore")

    # Stable sort keeps tied scores in their original index order, so the
    # result is reproducible between runs.
    top_matches = candidate_scores.sort_values(ascending=False, kind="mergesort").head(top_n)
    return list(top_matches.index), list(top_matches.values)

# Build the lookalike table for the first 20 rows of the customers table
# (presumably C0001 - C0020; this relies on the row order of Customers.csv).
lookalike_data = []

for customer_id in customers["CustomerID"].head(20):
    if customer_id in customer_similarity_df.columns:
        lookalikes, scores = get_top_3_similar(customer_similarity_df, customer_id)
    else:
        # The customer has no column in the similarity table (for example,
        # no transactions), so there is nothing to rank; emit an empty row
        # instead of raising KeyError.
        lookalikes, scores = [], []

    # Flatten to [cust_id, id1, score1, id2, score2, id3, score3], padding
    # with None when fewer than three lookalikes are available.
    row = [customer_id]
    for rank in range(3):
        if rank < len(lookalikes):
            row.extend([lookalikes[rank], scores[rank]])
        else:
            row.extend([None, None])
    lookalike_data.append(row)

# Convert to DataFrame
lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=["cust_id", "lookalike1", "score1", "lookalike2", "score2", "lookalike3", "score3"],
)

# Save as CSV (without the DataFrame index column)
lookalike_df.to_csv("FirstName_LastName_Lookalike.csv", index=False)