In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Build a customer "lookalike" table: for each of the first
# N_TARGET_CUSTOMERS customers, find the N_LOOKALIKES most similar other
# customers based on aggregated spending behaviour and region.
#
# Expects `customers_df` (columns used: CustomerID, Region) and
# `transactions_df` (columns used: CustomerID, TransactionID, TotalValue)
# to be defined by an earlier cell, and pandas to be available as `pd`.

NUMERIC_FEATURES = ["total_spent", "total_transactions", "avg_transaction_value"]
N_TARGET_CUSTOMERS = 20  # only the first N customers get a lookalike row
N_LOOKALIKES = 3         # neighbours reported per customer

# Encode categorical variables.
# The integer code is kept on customers_df / customer_features as before, but
# it is NOT fed to the similarity computation directly (see one-hot step below).
le_region = LabelEncoder()
customers_df["RegionEncoded"] = le_region.fit_transform(customers_df["Region"])

# Merge transactions with customer data
customer_transactions = transactions_df.merge(customers_df, on="CustomerID", how="left")

# Aggregate transaction features per customer
customer_features = (
    customer_transactions.groupby("CustomerID")
    .agg(
        total_spent=("TotalValue", "sum"),
        total_transactions=("TransactionID", "count"),
        avg_transaction_value=("TotalValue", "mean"),
        region=("RegionEncoded", "first"),
    )
    .reset_index()
)

# Normalize numerical features to [0, 1] (min-max scaling).
customer_features_norm = customer_features.copy()
numeric_values = customer_features_norm[NUMERIC_FEATURES]
column_min = numeric_values.min()
# A constant column has max == min; use a range of 1 so it scales to 0
# instead of producing NaN (which would make cosine_similarity raise).
column_range = (numeric_values.max() - column_min).replace(0, 1)
customer_features_norm[NUMERIC_FEATURES] = (numeric_values - column_min) / column_range

# Region is a nominal category: its integer code has no magnitude meaning, and
# left unscaled it would dominate the [0, 1] numeric features. One-hot encode it
# so that "same region" simply adds shared direction to the feature vector.
region_one_hot = pd.get_dummies(
    customer_features_norm["region"], prefix="region", dtype=float
)
similarity_features = pd.concat(
    [customer_features_norm[NUMERIC_FEATURES], region_one_hot], axis=1
)

# Compute similarity matrix (row/column order follows customer_features)
customer_similarity_matrix = cosine_similarity(similarity_features)

# Create a mapping of customer similarity scores
customer_ids = customer_features["CustomerID"].tolist()
lookalike_dict = {}

for idx, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    scores = customer_similarity_matrix[idx]
    # Stable descending sort: ties keep their original customer order.
    ranked = np.argsort(-scores, kind="stable")
    # Drop the customer itself by index rather than assuming it sorts first;
    # ties at 1.0 or rounding can place another customer ahead of it.
    nearest = ranked[ranked != idx][:N_LOOKALIKES]
    # Plain floats keep the stringified output independent of NumPy's repr.
    lookalike_dict[customer_id] = [
        (customer_ids[j], float(scores[j])) for j in nearest
    ]

# Convert lookalike dictionary to DataFrame
lookalike_df = pd.DataFrame(
    [
        {"CustomerID": cust, "Lookalikes": str(lookalikes)}
        for cust, lookalikes in lookalike_dict.items()
    ]
)

# Save the Lookalike model results
lookalike_df.to_csv("/mnt/data/Vaishwik_Vishwakarma_Lookalike.csv", index=False)

# Display the Lookalike results
# NOTE(review): ace_tools is not a standard package; presumably it is supplied
# by the hosting notebook environment - confirm before running elsewhere.
import ace_tools as tools
tools.display_dataframe_to_user(name="Lookalike Model Results", dataframe=lookalike_df)
