<a href="https://colab.research.google.com/github/VinayBorate/Zeotap_assignment/blob/main/Task2_Lookalike_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [2]:
# Read the CSV files
customers = pd.read_csv("/content/Customers.csv")
products = pd.read_csv("/content/Products.csv")
transactions = pd.read_csv("/content/Transactions.csv")

In [3]:
merged = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

In [4]:
print(merged.columns)

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


In [5]:
if 'Price_x' in transactions.columns and 'Price_y' in transactions.columns:
    print("Price_x and Price_y Columns:\n")
    print(transactions[['Price_x', 'Price_y']].head())
else:
    print("Not")

Not


In [6]:
# Feature Engineering
customer_profiles = merged.groupby('CustomerID').agg({
    'TotalValue': 'mean',
    'Quantity': 'sum',
    'Price_y': 'mean',
    'Category': lambda x: x.mode()[0]  # Most frequent category
}).reset_index()

In [7]:
customer_profiles_encoded = pd.get_dummies(customer_profiles, columns=['Category'], drop_first=True)

In [8]:
# Normalize data
scaler = StandardScaler()
profile_features = scaler.fit_transform(customer_profiles_encoded.drop('CustomerID', axis=1))

In [9]:
# Compute similarity
similarity_matrix = cosine_similarity(profile_features)

In [10]:
# Generate Lookalike Recommendations
lookalike_results = {}
for idx, customer in enumerate(customer_profiles['CustomerID'][:20]):
    similar_indices = np.argsort(-similarity_matrix[idx])[1:4]  # Top 3 excluding itself
    lookalike_results[customer] = [
        (customer_profiles['CustomerID'][i], similarity_matrix[idx][i]) for i in similar_indices
    ]

In [11]:
# Save Lookalike.csv
lookalike_df = pd.DataFrame({
    'CustomerID': lookalike_results.keys(),
    'Lookalikes': lookalike_results.values()
})
lookalike_df.to_csv('FirstName_LastName_Lookalike.csv', index=False)