# Task 2: Lookalike Model

# Data set
Products.csv :
https://drive.google.com/file/d/1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0/view?usp=sharing

# Files Description:
1. Customers.csv
○ CustomerID: Unique identifier for each customer.
○ CustomerName: Name of the customer.
○ Region: Continent where the customer resides.
○ SignupDate: Date when the customer signed up.

2. Products.csv
○ ProductID: Unique identifier for each product.
○ ProductName: Name of the product.
○ Category: Product category.
○ Price: Product price in USD.

# Importing libraries

In [3]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Read the product and customer tables from their CSV exports.
# NOTE(review): both paths are absolute and specific to one machine; they
# must be adjusted before this cell can run anywhere else.
products = pd.read_csv("C:/Users/ARCHANA/Desktop/Zeotap/Products.csv")
customers = pd.read_csv(
    "C:/Users/ARCHANA/Desktop/Zeotap/Customers.csv",
    parse_dates=['SignupDate'],  # load SignupDate as datetime, not text
)

In [5]:
# Mean product price per category, stored as a {category: mean price} dict
category_avg = (
    products['Price']
    .groupby(products['Category'])
    .mean()
    .to_dict()
)

In [6]:
# Split the signup timestamp into numeric year and month feature columns
signup_dt = customers['SignupDate'].dt
customers['SignupYear'] = signup_dt.year
customers['SignupMonth'] = signup_dt.month

In [7]:
# Attach one 'Avg_<category>_Price' column per product category.
# Each column is assigned a single scalar, so every customer row holds the
# same value.
# NOTE(review): a column that is identical for all customers cannot help
# tell customers apart — confirm whether per-customer values were intended.
for category, avg_price in category_avg.items():
    customers[f'Avg_{category}_Price'] = avg_price


In [8]:
# One-hot encode Region: the original column is replaced by one
# 'Region_<value>' indicator column per region value present in the data
customers = pd.get_dummies(
    customers,
    prefix='Region',
    columns=['Region'],
)

In [9]:
# Create feature matrix
# Region indicator columns are read from the frame instead of a hard-coded
# list: one-hot encoding only creates a column for regions that actually
# occur in the data, so a fixed list raises KeyError when a region is absent
# and silently ignores any region that is not listed.
region_features = [col for col in customers.columns
                   if col.startswith('Region_')]

features = ['SignupYear', 'SignupMonth'] + \
           region_features + \
           [f'Avg_{cat}_Price' for cat in category_avg]


# X keeps the row order of `customers`, one row per customer
X = customers[features]

In [10]:
# Standardize the feature columns: learn the per-column scaling parameters
# from X, then apply them to the same data
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [11]:
# Compute cosine similarity matrix
# Pairwise cosine similarity between all rows of X_scaled: entry [i][j] is
# the similarity between row i and row j, so the result is square with one
# row and one column per row of X_scaled.
similarity_matrix = cosine_similarity(X_scaled)

In [12]:
# Create recommendations dictionary
# Maps each of the first 20 customers (by row order) to their 3 most similar
# customers as (CustomerID, similarity rounded to 3 decimals) pairs.
# IDs come from the CustomerID column rather than being rebuilt from the row
# position, so the labels stay correct even if the file is not sorted or the
# IDs are not contiguous.
customer_ids = customers['CustomerID'].tolist()  # row position -> real ID

lookalikes = {}
for idx in range(min(20, len(customer_ids))):  # tolerate fewer than 20 rows
    # Remove the customer's own entry by position. Dropping "the top-ranked
    # entry" is not reliable: a customer with an identical feature vector
    # ties on score, and the stable sort then places whichever has the lower
    # row index first — which may not be the customer itself.
    candidates = [
        (other, score)
        for other, score in enumerate(similarity_matrix[idx])
        if other != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    lookalikes[customer_ids[idx]] = [
        (customer_ids[other], round(score, 3))
        for other, score in candidates[:3]
    ]

In [18]:
# Build the export table: one row per customer, with a "Lookalikes" cell
# holding the list of "<CustomerID>:<score>" strings for that customer
output_data = [
    {
        "CustomerID": cust_id,
        "Lookalikes": [f"{sim_id}:{score}" for sim_id, score in similarities],
    }
    for cust_id, similarities in lookalikes.items()
]

# Write the table to disk without the DataFrame index column
output = pd.DataFrame(output_data)
output.to_csv('Lookalike.csv', index=False)