In [1]:
# Load the mergerd.csv file to begin EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Read the merged CSV into a DataFrame. The 'UTF-8-SIG' codec decodes UTF-8
# and drops a leading byte-order mark if the file starts with one, so the
# first column name is not polluted by it.
data = pd.read_csv(
    'mergerd.csv',
    encoding='UTF-8-SIG',
)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

import csv

# Purpose: for each of the first 20 customers, find the 3 most similar other
# customers (cosine similarity on TotalValue and Quantity) and write the
# mapping to Lookalike.csv.

top_n = 3  # number of lookalikes reported per customer
first_20_ids = [f"C{n:04d}" for n in range(1, 21)]  # "C0001" .. "C0020"

customer_features = data[['CustomerID', 'Region', 'Category', 'TotalValue', 'Quantity']]
# Rows belonging to the first 20 customers, selected by their Customer IDs
first_20_customers = customer_features[customer_features['CustomerID'].isin(first_20_ids)]

# Row-level numeric features. Kept under its original name and shape so that
# any other cell that reads it keeps working; it is NOT what the similarity
# below is computed on.
feature_matrix = first_20_customers[['TotalValue', 'Quantity']]

# Collapse to exactly one row per customer before comparing customers.
# NOTE(review): this assumes `data` may hold several rows per CustomerID (e.g.
# one per transaction) -- confirm. Comparing raw rows would then compare
# transactions rather than customers, and a customer's own other rows could be
# reported as their "lookalikes". If every customer already has a single row,
# the sum leaves the values unchanged. sort=False keeps first-appearance order.
customer_profiles = (
    first_20_customers
    .groupby('CustomerID', sort=False)[['TotalValue', 'Quantity']]
    .sum()
)
profile_ids = customer_profiles.index.to_list()

# Pairwise cosine similarity between customer profiles (square, one row/column
# per customer, in the order of `profile_ids`)
similarity_matrix = cosine_similarity(customer_profiles)

# Map each customer to their top-N most similar customers, excluding themselves
lookalike_map = {}
for idx, customer_id in enumerate(profile_ids):
    # Descending order of similarity; the stable sort breaks ties by position.
    ranked = np.argsort(-similarity_matrix[idx], kind='stable')
    # Drop the customer by index rather than assuming they sort first: another
    # customer can tie at (or, through rounding, exceed) the self-similarity.
    neighbours = [j for j in ranked if j != idx][:top_n]
    # float() keeps the CSV text free of NumPy scalar reprs such as
    # "np.float64(0.99)" that newer NumPy versions produce.
    lookalike_map[customer_id] = [
        (profile_ids[j], float(similarity_matrix[idx, j])) for j in neighbours
    ]

# Export the lookalike mapping to a CSV file: one row per customer, with the
# list of (CustomerID, score) pairs written as its Python list text.
with open('Lookalike.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['CustomerID', 'Lookalikes'])
    for cust_id, lookalikes in lookalike_map.items():
        writer.writerow([cust_id, lookalikes])

print("Lookalike.csv has been created with the top 3 similar customers for the first 20 customers.")

Lookalike.csv has been created with the top 3 similar customers for the first 20 customers.


In [9]:
from sklearn.preprocessing import MinMaxScaler

# Purpose: repeat the lookalike computation for the first 20 customers on
# min-max normalised features and write the result to Improved_Lookalike.csv.

# Imported here so the cell does not depend on an earlier cell having
# imported csv.
import csv

top_n = 3  # number of lookalikes reported per customer

# One row per customer, so customers (not individual rows) are compared.
# NOTE(review): assumes `first_20_customers` may hold several rows per
# CustomerID -- confirm. With one row per customer the sum is a no-op.
# sort=False keeps the customers in first-appearance order.
improved_profiles = (
    first_20_customers
    .groupby('CustomerID', sort=False)[['TotalValue', 'Quantity']]
    .sum()
)
improved_ids = improved_profiles.index.to_list()

# Normalize each feature to [0, 1] so neither feature dominates by scale
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(improved_profiles)

# Recalculate cosine similarity using the normalized features
improved_similarity_matrix = cosine_similarity(normalized_features)

# Map each of the first 20 customers to their top-N most similar customers
# (with scores), excluding themselves
improved_lookalike_map = {}
for idx, customer_id in enumerate(improved_ids):
    # Descending order of similarity; the stable sort breaks ties by position.
    ranked = np.argsort(-improved_similarity_matrix[idx], kind='stable')
    # Exclude the customer by index, not by assuming they rank first: min-max
    # scaling maps a customer who is lowest on both features to the zero
    # vector, whose cosine similarity with everything (itself included) is 0.
    neighbours = [j for j in ranked if j != idx][:top_n]
    # float() keeps NumPy scalar reprs out of the CSV text.
    improved_lookalike_map[customer_id] = [
        (improved_ids[j], float(improved_similarity_matrix[idx, j])) for j in neighbours
    ]

# Save the improved lookalike mapping to a new CSV file
with open('Improved_Lookalike.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['CustomerID', 'Lookalikes'])
    for cust_id, lookalikes in improved_lookalike_map.items():
        writer.writerow([cust_id, lookalikes])

# Confirm the new file creation (only the first 20 customers are processed)
print("Improved_Lookalike.csv has been created with the top 3 similar customers for the first 20 customers.")

Improved_Lookalike.csv has been created with the top 3 similar customers for all customers.
