<a href="https://colab.research.google.com/github/Shaikumar2005/ZeoTap-Data-Science-Intern/blob/main/Shai_Kumar_R_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Read the three input tables from CSV files in the current working directory.
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Enrich every transaction row with its customer's columns, then with its
# product's columns. Both joins use the pandas default (inner join), so a
# transaction is kept only when its CustomerID and its ProductID both have a
# match in the corresponding table.
merged = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")

# Feature engineering: collapse the merged transaction rows into exactly one
# row per customer.
per_customer = merged.groupby("CustomerID")

customer_features = per_customer.agg(
    # Sum of TotalValue over all of the customer's transactions.
    total_spent=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
    # Mean Quantity per transaction.
    avg_quantity=pd.NamedAgg(column="Quantity", aggfunc="mean"),
    # Number of transaction rows for the customer.
    purchase_count=pd.NamedAgg(column="TransactionID", aggfunc="count"),
    # Most common product category; when several categories tie, the first
    # value returned by mode() is taken.
    most_frequent_category=pd.NamedAgg(
        column="Category", aggfunc=lambda values: values.mode()[0]
    ),
    # Region and SignupDate come from the customer table, so the first value
    # in each group is used as the representative one.
    region=pd.NamedAgg(column="Region", aggfunc="first"),
    signup_date=pd.NamedAgg(column="SignupDate", aggfunc="first"),
)

# Turn the CustomerID group key back into an ordinary column.
customer_features = customer_features.reset_index()

# One-hot encode the two categorical columns and convert the encoder's output
# to a dense array so it can be stacked next to the numeric block.
categorical_columns = ["most_frequent_category", "region"]
encoder = OneHotEncoder()
categorical_features = encoder.fit_transform(
    customer_features[categorical_columns]
).toarray()

# Standardise the numeric columns (zero mean, unit variance per column) so
# that no single column dominates purely because of its scale.
numeric_columns = ["total_spent", "avg_quantity", "purchase_count"]
scaler = StandardScaler()
numerical_features = scaler.fit_transform(customer_features[numeric_columns])

# Build the final feature matrix: numeric columns first, one-hot columns
# after them, one row per customer in customer_features order.
features = np.concatenate((numerical_features, categorical_features), axis=1)

# Pairwise cosine similarity between all customer rows; entry [i, j] compares
# row i of the feature matrix with row j.
similarity_matrix = cosine_similarity(features)

# Find lookalikes for each customer.
# lookalike_map maps a customer ID to a list of (other_customer_id, score)
# tuples, best match first. Row i of similarity_matrix belongs to
# customer_ids[i], because both follow the row order of customer_features.
lookalike_map = {}
customer_ids = customer_features["CustomerID"].tolist()

for idx, customer_id in enumerate(customer_ids[:20]):  # Limit to first 20 customers
    similarities = similarity_matrix[idx]

    # Rank every customer from most to least similar.
    ranked_indices = np.argsort(similarities)[::-1]

    # Drop the customer's own row explicitly instead of assuming it is ranked
    # first: another customer with an identical feature vector ties with the
    # self-similarity (and floating-point rounding can even push the
    # self-similarity slightly below 1.0), in which case skipping position 0
    # would discard a real neighbour and report the customer as its own
    # lookalike.
    similar_indices = ranked_indices[ranked_indices != idx][:3]  # Top 3 similar customers

    lookalikes = [(customer_ids[i], similarities[i]) for i in similar_indices]
    lookalike_map[customer_id] = lookalikes

# Create Lookalike.csv: one output row per customer, holding the three best
# matches and their similarity scores in rank order.
def _lookalike_row(cust_id, neighbours):
    """Flatten one customer's top-3 (customer_id, score) pairs into a CSV row.

    `neighbours` must contain at least three (customer_id, score) pairs,
    ordered best match first; only the first three are used.
    """
    first, second, third = neighbours[0], neighbours[1], neighbours[2]
    return {
        "cust_id": cust_id,
        "lookalike_cust_id_1": first[0],
        "score_1": first[1],
        "lookalike_cust_id_2": second[0],
        "score_2": second[1],
        "lookalike_cust_id_3": third[0],
        "score_3": third[1],
    }


lookalike_df = pd.DataFrame(
    [_lookalike_row(cust_id, neighbours) for cust_id, neighbours in lookalike_map.items()]
)

# Write the table without the DataFrame index column.
# NOTE(review): the file written here is "Shai_Kumar R_Lookalike.csv" (with a
# space), while the message below mentions "Lookalike.csv" — confirm which
# name is intended.
lookalike_df.to_csv("Shai_Kumar R_Lookalike.csv", index=False)

print("Lookalike.csv created successfully!")


Lookalike.csv created successfully!
