In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
import numpy as np

# Read the three input tables from the current working directory.
customers_df = pd.read_csv("Customers.csv")
products_df = pd.read_csv("Products.csv")
transactions_df = pd.read_csv("Transactions.csv")

# Join customer attributes, then product attributes, onto each transaction.
customer_transactions = pd.merge(transactions_df, customers_df, on="CustomerID")
merged_data = pd.merge(customer_transactions, products_df, on="ProductID")

# A merge suffixes overlapping column names; expose "Price_y" as plain "Price"
# (presumably the product-table price -- confirm against the CSV schemas).
merged_data = merged_data.rename(columns={"Price_y": "Price"})

# Collapse the transactions into one profile row per customer.
customer_profiles = (
    merged_data.groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),  # Total spending
        Quantity=("Quantity", "sum"),      # Total quantity purchased
        Price=("Price", "mean"),           # Average price of products
        Region=("Region", "first"),        # Customer's region
        # Most frequent category (first mode when several tie)
        Category=("Category", lambda purchases: purchases.mode()[0]),
    )
    .reset_index()
)

# One-hot encode the categorical profile columns (Region and Category).
categorical_columns = customer_profiles[["Region", "Category"]]
encoder = OneHotEncoder()
encoder.fit(categorical_columns)
encoded_features = encoder.transform(categorical_columns).toarray()

# Rescale the numerical profile columns with a min-max scaler.
numerical_features = customer_profiles[["TotalValue", "Quantity", "Price"]].to_numpy()
scaler = MinMaxScaler()
scaler.fit(numerical_features)
scaled_numerical = scaler.transform(numerical_features)

# Place the scaled numeric columns and the one-hot columns side by side,
# giving one feature row per customer profile.
customer_features = np.concatenate([scaled_numerical, encoded_features], axis=1)

# Pairwise cosine similarity between every pair of customer feature rows.
similarity_matrix = cosine_similarity(customer_features)

# Extract the top 3 most similar customers for each target customer.
# The targets are the first 20 rows of customer_profiles; they correspond to
# C0001-C0020 only if every one of those customers has a profile row.
target_customers = customer_profiles["CustomerID"][:20].values
lookalike_data = {}

for target_customer in target_customers:
    # Position of the target customer in customer_profiles, which is also its
    # row in the similarity matrix.
    customer_idx = np.where(customer_profiles["CustomerID"] == target_customer)[0][0]
    similarities = similarity_matrix[customer_idx]

    # Rank all customers by descending similarity, then remove the target by
    # index. Dropping the last argsort entry instead is unsafe: a customer
    # with an identical feature vector (or floating-point rounding) can tie
    # with or outrank the self-similarity, leaving the target in its own
    # lookalike list. The stable sort keeps tie order deterministic.
    ranked_indices = np.argsort(-similarities, kind="stable")
    similar_indices = ranked_indices[ranked_indices != customer_idx][:3]
    similar_customers = customer_profiles.iloc[similar_indices]["CustomerID"].values
    similar_scores = similarities[similar_indices]

    # Store plain Python floats: NumPy scalars stringify as "np.float64(...)"
    # under NumPy 2, which would leak into the exported CSV.
    lookalike_data[target_customer] = [
        (cust_id, float(score))
        for cust_id, score in zip(similar_customers, similar_scores)
    ]

# Build one output row per target customer, then write the table to CSV.
lookalike_rows = []
for cust_id, matches in lookalike_data.items():
    lookalike_rows.append({"cust_id": cust_id, "lookalikes": matches})

lookalike_df = pd.DataFrame(lookalike_rows)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike.csv generated successfully!")


Lookalike.csv generated successfully!
