In [9]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Read the three source tables (customers, transactions, products) in that order.
customers, transactions, products = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Transactions.csv", "Products.csv")
)

# Attach product details to every transaction, then attach customer details.
# Both joins are inner joins, so rows without a match on the key are dropped.
transactions_products = pd.merge(transactions, products, how="inner", on="ProductID")
customer_data = pd.merge(transactions_products, customers, how="inner", on="CustomerID")

# Feature Engineering
# Aggregate transaction data per customer: one row per CustomerID.
customer_features = customer_data.groupby("CustomerID").agg(
    TotalSpend=("TotalValue", "sum"),
    TotalTransactions=("TransactionID", "count"),
    AvgTransactionValue=("TotalValue", "mean"),
    # Most frequent category; Series.mode() returns the modes sorted, so ties
    # resolve to the first label in that order.
    FavoriteCategory=("Category", lambda x: x.mode()[0]),
    UniqueProducts=("ProductID", "nunique")
).reset_index()

# Add encoded region and signup date (kept on the frame for reference/inspection).
le_region = LabelEncoder()
customers["RegionEncoded"] = le_region.fit_transform(customers["Region"])
customer_features = customer_features.merge(
    customers[["CustomerID", "RegionEncoded", "SignupDate"]], on="CustomerID"
)

# Integer code for the favourite category (kept on the frame for reference/inspection).
le_category = LabelEncoder()
customer_features["FavoriteCategoryEncoded"] = le_category.fit_transform(
    customer_features["FavoriteCategory"]
)

# Build the matrix used for the similarity calculation.
#
# Cosine similarity is sensitive to feature magnitude: with raw values the
# spend columns swamp every other feature. Standardise the
# numeric aggregates so each one contributes on a comparable scale.
behaviour_columns = ["TotalSpend", "TotalTransactions", "AvgTransactionValue", "UniqueProducts"]
behaviour = customer_features[behaviour_columns].astype(float)
# A constant column has zero spread; divide by 1 instead to avoid 0/0 -> NaN.
behaviour_spread = behaviour.std(ddof=0).replace(0, 1.0)
behaviour_scaled = (behaviour - behaviour.mean()) / behaviour_spread

# Region and favourite category are nominal, so the integer codes above carry an
# arbitrary order that would be read as distance. One-hot encode them for
# the similarity matrix instead.
region_dummies = pd.get_dummies(customer_features["RegionEncoded"], prefix="Region", dtype=float)
category_dummies = pd.get_dummies(
    customer_features["FavoriteCategory"], prefix="FavoriteCategory", dtype=float
)

# Row order (and index) matches customer_features, so row i of this matrix
# describes the customer in row i of customer_features.
customer_features_numeric = pd.concat(
    [behaviour_scaled, region_dummies, category_dummies], axis=1
)



In [11]:
import json

# Ensure every join/lookup key is treated as a string, on both sides of each join.
customers["CustomerID"] = customers["CustomerID"].astype(str)
transactions["CustomerID"] = transactions["CustomerID"].astype(str)
transactions["ProductID"] = transactions["ProductID"].astype(str)
products["ProductID"] = products["ProductID"].astype(str)
# customer_features was aggregated before the casts above ran, and it is the
# table the similarity lookup filters on, so its key needs the same treatment
# for string IDs such as "C0001" to match.
customer_features["CustomerID"] = customer_features["CustomerID"].astype(str)

# Updated recommend_similar_customers function
def recommend_similar_customers(input_customer_id, top_n=3):
    """Return the customers most similar to ``input_customer_id``.

    Similarity is the cosine similarity between the target customer's row of
    ``customer_features_numeric`` and every other row. Both module-level frames
    are read but not modified.

    Parameters
    ----------
    input_customer_id
        Value to look up in ``customer_features["CustomerID"]``.
    top_n : int, default 3
        Number of lookalikes to return.

    Returns
    -------
    list of dict
        Up to ``top_n`` records of the form
        ``{"CustomerID": ..., "SimilarityScore": ...}``, highest score first.
        The target customer itself is never included.

    Raises
    ------
    ValueError
        If ``input_customer_id`` is not present in ``customer_features``.
    """
    customer_ids = customer_features["CustomerID"]
    is_target = (customer_ids == input_customer_id).to_numpy()
    if not is_target.any():
        raise ValueError(
            f"Customer {input_customer_id!r} has no row in customer_features; "
            "cannot compute lookalikes."
        )

    feature_matrix = customer_features_numeric.to_numpy()
    # Use the first matching row only, so the result is always one score per customer
    # even if the ID were to appear more than once.
    target_vector = feature_matrix[is_target][:1]
    scores = cosine_similarity(target_vector, feature_matrix).ravel()

    # Score into a local frame instead of writing a column onto the shared
    # customer_features table, so repeated calls leave global state untouched.
    ranked = pd.DataFrame({
        "CustomerID": customer_ids.to_numpy(),
        "SimilarityScore": scores,
    })
    recommendations = ranked.loc[~is_target].nlargest(top_n, "SimilarityScore")
    return recommendations[["CustomerID", "SimilarityScore"]].to_dict(orient="records")

# Generate lookalikes for the first 20 customers (C0001 to C0020)
target_ids = [f"C{i:04}" for i in range(1, 21)]

# A customer with no row in the feature table cannot be scored. Record an empty
# lookalike list for them instead of aborting the whole export.
known_ids = set(customer_features["CustomerID"])

# One record per target customer: its ID and its list of lookalike records.
lookalike_data = []
for cust_id in target_ids:
    top_lookalikes = recommend_similar_customers(cust_id) if cust_id in known_ids else []
    lookalike_data.append({
        "cust_id": cust_id,
        "lookalikes": top_lookalikes
    })

# Convert the list to a DataFrame (the "lookalikes" column holds Python lists)
lookalike_df = pd.DataFrame(lookalike_data)

# Save as a CSV file. The list column is serialised as JSON at export time so
# each cell can be read back with json.loads rather than being written as a
# Python repr.
lookalike_df.assign(
    lookalikes=lookalike_df["lookalikes"].map(json.dumps)
).to_csv("Roshni_Seth_Lookalike.csv", index=False)
