#### *Task 2: Lookalike Model*

In [2]:
# Importing Libraries

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load datasets
#
# All three CSVs are read from one folder. Keeping that folder in a single
# constant means the notebook can be pointed at another location by editing
# one line instead of three.
# NOTE: the value below is machine-specific; change it to wherever the CSVs live.
DATA_DIR = "C:/Users/SHUBHAM/Downloads"

customers = pd.read_csv(f"{DATA_DIR}/Customers.csv")
products = pd.read_csv(f"{DATA_DIR}/Products.csv")
transactions = pd.read_csv(f"{DATA_DIR}/Transactions.csv")

In [4]:
# Build one wide table: every transaction row enriched with its customer
# attributes and its product attributes.
# Both joins use pandas' default inner join, so a transaction whose
# CustomerID or ProductID has no match is dropped.
transactions_with_customers = pd.merge(transactions, customers, on="CustomerID")
merged_data = pd.merge(transactions_with_customers, products, on="ProductID")

In [5]:
# Feature engineering: collapse the transaction-level table to one row per customer.
#   Region     - first region value seen for the customer
#   Category   - space-separated string of the category of every purchased item
#   TotalValue - total spending
#   Quantity   - total number of units purchased
per_customer = merged_data.groupby('CustomerID')

customer_profiles = per_customer.agg(
    Region=('Region', 'first'),
    Category=('Category', lambda categories: ' '.join(categories)),
    TotalValue=('TotalValue', 'sum'),
    Quantity=('Quantity', 'sum'),
).reset_index()

In [6]:
# Build the text that gets vectorised: "<Region> <Category Category ...>".
# Only the region and the purchased categories go into this text; the numeric
# TotalValue / Quantity columns are not part of it.
customer_profiles['ProfileText'] = customer_profiles['Region'].str.cat(
    customer_profiles['Category'], sep=' '
)

In [7]:
# Turn each customer's profile text into a TF-IDF vector
# (one row per customer, one column per vocabulary term).
profile_texts = customer_profiles['ProfileText']

tfidf = TfidfVectorizer()
tfidf.fit(profile_texts)                          # learn vocabulary + IDF weights
profile_vectors = tfidf.transform(profile_texts)  # encode the same texts

In [8]:
# Pairwise cosine similarity between every pair of customer profile vectors.
# Entry [i, j] is the similarity between the customers at positions i and j
# of customer_profiles.
similarity_matrix = cosine_similarity(profile_vectors, profile_vectors)

In [9]:
# Find the top 3 lookalike customers for each of the first 20 customer profiles

N_TARGET_CUSTOMERS = 20   # how many leading profiles get recommendations
TOP_K = 3                 # lookalikes kept per customer

# Extract the IDs once instead of rebuilding a row Series for every (i, j) pair.
customer_ids = customer_profiles['CustomerID'].tolist()

# Never index past the end when the data holds fewer than 20 customers.
n_targets = min(N_TARGET_CUSTOMERS, len(customer_ids))

# Maps CustomerID -> list of (lookalike CustomerID, similarity score),
# ordered from most to least similar.
# NOTE(review): rows are taken by position. They equal C0001..C0020 only if
# each of those customers has at least one transaction - confirm against the data.
lookalike_map = {}

for i in range(n_targets):
    scores = similarity_matrix[i]

    # Descending order by score. The stable sort keeps tied customers in their
    # original positional order, matching a stable sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind='stable')

    # Drop the customer itself, then keep the best TOP_K.
    ranked = ranked[ranked != i][:TOP_K]

    lookalike_map[customer_ids[i]] = [(customer_ids[j], scores[j]) for j in ranked]


In [10]:
# Save lookalike results to CSV

# One record per customer. 'Lookalikes' is a list of
# (lookalike customer ID, similarity score rounded to 4 decimal places).
# The score is cast to a built-in float first: round() on a NumPy scalar
# returns a NumPy scalar, and on NumPy >= 2 its repr inside the list would be
# written to the CSV as "np.float64(0.1234)" instead of "0.1234".
lookalike_data = [
    {
        'CustomerID': cust_id,
        'Lookalikes': [(l_id, round(float(score), 4)) for l_id, score in lookalikes],
    }
    for cust_id, lookalikes in lookalike_map.items()
]

# Explicit column list keeps the CSV header intact even if there are no rows.
lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'Lookalikes'])

# Write the results to 'Lookalike.csv' in the current working directory.
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike Model Completed. Results saved to Lookalike.csv.")

Lookalike Model Completed. Results saved to Lookalike.csv.
