In [17]:
# Import necessary Libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [18]:
# Read the three input tables. The paths are relative, so the CSV files are
# looked up in the notebook's current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [19]:
# Build one wide table for feature engineering: start from the transactions
# and attach the matching customer row (by CustomerID) and product row
# (by ProductID). Both joins are left joins, so every transaction row is kept
# even when no matching customer or product exists; the attached columns are
# NaN in that case.
merged_data = (
    transactions
    .merge(customers, on="CustomerID", how="left")
    .merge(products, on="ProductID", how="left")
)

In [20]:
# Feature Engineering
# Collapse the merged transaction table to one profile row per CustomerID.
# `reset_index()` turns the CustomerID group key back into a regular column.
customer_profiles = merged_data.groupby("CustomerID").agg({
    # All purchased categories joined into one comma-separated string (used
    # later as a text document). Missing categories are dropped first: the
    # left join on ProductID can leave NaN here, and str.join raises TypeError
    # on non-string items. `astype(str)` keeps the join safe for non-string
    # category labels as well.
    "Category": lambda x: ",".join(x.dropna().astype(str)),
    # Mean of the `Price_y` column — presumably the product-side price after
    # pandas' merge suffixing; confirm against the input schemas.
    "Price_y": "mean",
    "TotalValue": "sum",  # Total spending
    "Quantity": "sum",  # Total quantity purchased
}).reset_index()

In [21]:
# Rescale the numeric profile columns with min-max scaling (MinMaxScaler's
# default output range is [0, 1]) and write the scaled values back over the
# original columns of `customer_profiles`.
numeric_columns = ["Price_y", "TotalValue", "Quantity"]
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(customer_profiles[numeric_columns])
customer_profiles[numeric_columns] = scaled_values

In [22]:
# Turn each customer's comma-joined category string into a TF-IDF vector.
# The result is a sparse matrix with one row per row of `customer_profiles`.
# NOTE(review): with default settings the vectorizer lowercases and splits on
# word boundaries, so a multi-word category would become several tokens —
# confirm that this is intended.
tfidf_vectorizer = TfidfVectorizer()
category_documents = customer_profiles["Category"]
category_matrix = tfidf_vectorizer.fit_transform(category_documents)

In [23]:
# Assemble the final per-customer feature matrix: the densified TF-IDF
# category columns first, followed by the three numeric profile columns.
numerical_features = customer_profiles[["Price_y", "TotalValue", "Quantity"]].to_numpy()
combined_features = np.concatenate(
    (category_matrix.toarray(), numerical_features),
    axis=1,  # place the two blocks side by side (column-wise)
)

In [24]:
# Pairwise cosine similarity between all rows of the combined feature matrix:
# entry [i, j] is the similarity between customer row i and customer row j.
similarity_matrix = cosine_similarity(X=combined_features)

In [25]:
# Find top 3 similar customers for each customer.
#
# `lookalike_data` maps every CustomerID to a list of up to TOP_N
# (other CustomerID, similarity score) tuples, most similar first.
TOP_N = 3

# Plain Python list of ids in row order; row i of `similarity_matrix` belongs
# to customer_ids[i]. A list lookup replaces the per-element
# `customer_profiles.iloc[...]` access, which builds a full row Series each time.
customer_ids = customer_profiles["CustomerID"].tolist()

lookalike_data = {}
for i, customer_id in enumerate(customer_ids):
    # Stable argsort of the negated scores gives descending similarity with
    # ties kept in row order — the same ordering as a stable reverse sort.
    ranked = np.argsort(-similarity_matrix[i], kind="stable")
    # Drop the customer's own row. CustomerID is the groupby key of
    # `customer_profiles`, so each id occupies exactly one row.
    neighbours = ranked[ranked != i][:TOP_N]
    # Cast scores to built-in float so that string/CSV serialization of the
    # tuples prints plain numbers instead of NumPy scalar reprs.
    lookalike_data[customer_id] = [
        (customer_ids[j], float(similarity_matrix[i, j])) for j in neighbours
    ]

In [26]:
# Build the records for Lookalike.csv: one entry for each of the first 20
# customers (in `customer_profiles` row order), carrying that customer's
# lookalikes as a list of {"CustomerID", "Score"} dicts.
lookalike_output = [
    {
        "CustomerID": customer_id,
        "Lookalikes": [
            {"CustomerID": other_id, "Score": score}
            for other_id, score in lookalike_data[customer_id]
        ],
    }
    for customer_id in customer_profiles["CustomerID"].head(20)
]

In [27]:
# Convert the list of records into a two-column DataFrame. Each column is
# built by pulling the same key out of every record, in record order.
lookalike_df = pd.DataFrame({
    column: [record[column] for record in lookalike_output]
    for column in ("CustomerID", "Lookalikes")
})

In [28]:
# Write the lookalike table to Lookalike.csv in the working directory,
# leaving out the DataFrame's row index.
lookalike_df.to_csv(path_or_buf="Lookalike.csv", index=False)