<a href="https://colab.research.google.com/github/SandeepGandham4/Data-Science-Intern-Assignment-Zeotap/blob/main/Sandeep_Gandham_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Task 2: Lookalike Model

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load Data: the three CSVs are read by relative path (replace with the
# actual file locations if they live elsewhere).
customers, transactions, products = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Transactions.csv", "Products.csv")
)

# Merge Data: left joins keep every transaction row; product columns are
# attached first, then the customer columns, giving one wide table in `data`.
transactions = transactions.merge(right=products, how="left", on="ProductID")
data = transactions.merge(right=customers, how="left", on="CustomerID")

# Feature Engineering: one row per customer with total spend, mean
# transaction value and number of transactions.
per_customer = data.groupby("CustomerID")
customer_features = per_customer.agg(
    total_spending=pd.NamedAgg(column="TotalValue", aggfunc="sum"),
    avg_transaction_value=pd.NamedAgg(column="TotalValue", aggfunc="mean"),
    total_transactions=pd.NamedAgg(column="TransactionID", aggfunc="count"),
).reset_index()

# Favorite category: count rows per (customer, category), order each
# customer's categories by descending count, and keep the first row per
# customer.
category_counts = (
    data.groupby(["CustomerID", "Category"]).size().reset_index(name="count")
)
ranked_counts = category_counts.sort_values(
    by=["CustomerID", "count"], ascending=[True, False]
)
favorite_category = ranked_counts.drop_duplicates(subset=["CustomerID"], keep="first")

# Attach the favorite category to the per-customer feature table.
favorite_lookup = favorite_category[["CustomerID", "Category"]].rename(
    columns={"Category": "favorite_category"}
)
customer_features = customer_features.merge(
    favorite_lookup, how="left", on="CustomerID"
)

# One-hot encode the favorite category.
# NOTE(review): drop_first=True omits the indicator column for the first
# category level, so that level is represented by all-zero indicators --
# confirm this is intended for a similarity model.
customer_features = pd.get_dummies(
    customer_features, columns=["favorite_category"], drop_first=True
)

# Standardize Features: every column after the first (CustomerID) is scaled
# so that no single feature dominates the similarity computation.
feature_matrix = customer_features.iloc[:, 1:]
scaler = StandardScaler()
scaler.fit(feature_matrix)
scaled_features = scaler.transform(feature_matrix)

# Compute Similarities: pairwise cosine similarity between all customers;
# row i / column j holds the score between customer i and customer j.
similarity_matrix = cosine_similarity(scaled_features)

# Generate Recommendations
# For every customer, keep the three most similar *other* customers together
# with their cosine-similarity score rounded to three decimals.
# Result: lookalike_map[customer_id] -> [(other_customer_id, score), ...]
lookalike_map = {}
customer_ids = customer_features["CustomerID"].values
for idx, customer_id in enumerate(customer_ids):
    similarity_scores = similarity_matrix[idx]
    # Positions ordered from most to least similar.
    ranked_indices = similarity_scores.argsort()[::-1]
    # Remove the customer's own row explicitly instead of assuming it is
    # ranked first: when another customer ties for the top score, the
    # customer's own position in the ranking is not guaranteed to be 0, and
    # skipping position 0 could drop a real match while keeping the customer
    # as their own lookalike.
    top_indices = ranked_indices[ranked_indices != idx][:3]
    lookalike_map[customer_id] = [
        # Convert the rounded NumPy scalar to a built-in float so that the
        # stringified output reads "0.987" rather than "np.float64(0.987)"
        # under NumPy >= 2.
        (customer_ids[i], float(round(similarity_scores[i], 3)))
        for i in top_indices
    ]

# Save Output: one row per customer; the list of (lookalike, score) pairs is
# stored as its string representation in the "Lookalikes" column.
output_columns = {
    "CustomerID": list(lookalike_map),
    "Lookalikes": list(map(str, lookalike_map.values())),
}
lookalike_df = pd.DataFrame(output_columns)
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model completed. Output saved to Lookalike.csv.")


Lookalike model completed. Output saved to Lookalike.csv.
