In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the three raw tables (customers, products, transactions), in that order.
df_customers, df_products, df_transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/data-zeotap/Customers.csv",
        "/kaggle/input/data-zeotap/Products.csv",
        "/kaggle/input/data-zeotap/Transactions.csv",
    )
)

In [4]:
# Enrich every transaction row with the matching customer and product columns.
# Both joins are left joins, so each transaction row is kept even when its
# CustomerID or ProductID has no match in the lookup table.
df_merged = (
    df_transactions
    .merge(df_customers, how='left', on='CustomerID')
    .merge(df_products, how='left', on='ProductID')
)

In [5]:
# One row per CustomerID with two behavioural features:
#   total_spend        - sum of TotalValue over the customer's rows
#   transaction_count  - number of non-null TransactionID values
# Only customers that appear in df_merged get a row here.
customer_features = df_merged.groupby("CustomerID").agg(
    total_spend=("TotalValue", "sum"),
    transaction_count=("TransactionID", "count"),
)

In [6]:
# One-hot encode each customer's Region (columns named "region_<value>"),
# keyed by CustomerID so it lines up with the feature table's index.
region_dummies = pd.get_dummies(
    df_customers.set_index("CustomerID").loc[:, "Region"],
    prefix="region",
)

# Left join: the feature table keeps exactly its existing rows.
customer_features = customer_features.join(region_dummies, how="left")

In [7]:
# Parse signup dates; values that cannot be parsed become NaT (errors='coerce').
df_customers = df_customers.assign(
    SignupDate=lambda frame: pd.to_datetime(frame['SignupDate'], errors='coerce'),
)

# Tenure is measured against the latest signup date found in the data,
# so the most recently signed-up customer gets 0 days.
max_signup_date = df_customers['SignupDate'].max()
df_customers = df_customers.assign(
    days_since_signup=lambda frame: (max_signup_date - frame['SignupDate']).dt.days,
)

# Attach the tenure to the feature table; values are aligned on CustomerID.
customer_features = customer_features.assign(
    days_since_signup=df_customers.set_index("CustomerID")['days_since_signup'],
)


In [8]:
# Replace every remaining missing value with 0 so the scaler receives no NaNs.
# Note: for days_since_signup a 0 is the same value the most recent signup has.
customer_features = customer_features.fillna(0)

In [9]:
# Every column of the feature table is used as a model input.
feature_cols = customer_features.columns

# Standardise the features with StandardScaler so that no single column
# dominates the similarity computation purely because of its scale.
scaler = StandardScaler()
X = scaler.fit_transform(
    customer_features[feature_cols]
)

In [10]:
# Pairwise cosine similarity between all rows of X (Y=None compares X with itself).
similarity_matrix = cosine_similarity(X, Y=None)

In [12]:
# Label both axes of the similarity matrix with the customer IDs so scores
# can be looked up as sim_df.loc[customer_a, customer_b].
customer_ids = customer_features.index.to_list()
sim_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)


In [13]:
# Maps a customer ID to its list of (lookalike_customer_id, similarity_score) pairs.
lookalike_dict = dict()

In [14]:
for cust_id in customer_ids:
    # Only the first 20 customers (C0001 to C0020) get lookalikes. This is a
    # plain string comparison, so it assumes fixed-width, zero-padded IDs.
    if not ("C0001" <= cust_id <= "C0020"):
        continue

    # Similarity of this customer to every other customer, best match first;
    # the customer's own entry is dropped so it cannot be its own lookalike.
    sim_sorted = (
        sim_df.loc[cust_id]
        .drop(labels=[cust_id])
        .sort_values(ascending=False)
    )

    # Keep the three best matches as (customer_id, similarity_score) pairs.
    # Scores are converted to built-in floats: NumPy >= 2 renders its scalars
    # as "np.float64(...)" in repr(), which would otherwise end up in any
    # text produced by str()/repr() of this list.
    lookalike_dict[cust_id] = [
        (other_id, float(score))
        for other_id, score in sim_sorted.head(3).items()
    ]

# 6. SAVE TO CSV
# One output row per customer; the lookalike list is stored as its str() form
# in a single "lookalikes" column.
output_rows = [
    {"customer_id": cust_id, "lookalikes": str(lookalikes)}
    for cust_id, lookalikes in lookalike_dict.items()
]

df_lookalikes = pd.DataFrame(output_rows)
df_lookalikes.to_csv("FirstName_LastName_Lookalike.csv", index=False)
print("Lookalike CSV saved as FirstName_LastName_Lookalike.csv")

Lookalike CSV saved as FirstName_LastName_Lookalike.csv
