# Lookalike Model Notebook

This notebook implements the lookalike model to identify similar customers.

In [1]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")

# Merge datasets
merged = pd.merge(customers, transactions, on="CustomerID")

# Feature Engineering
customer_features = merged.groupby("CustomerID").agg({
    "TotalValue": "sum",
    "Quantity": "sum",
    "Region": "first",
    "ProductID": lambda x: x.mode()[0]
}).reset_index()

# Encode categorical features
le = LabelEncoder()
customer_features["Region"] = le.fit_transform(customer_features["Region"])
customer_features["ProductID"] = le.fit_transform(customer_features["ProductID"])

# Normalize numeric features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.iloc[:, 1:])

# Compute cosine similarity
similarity_matrix = cosine_similarity(scaled_features)

# Find top 3 similar customers for the first 20 customers
lookalike_data = {}
for idx in range(20):
    similar_indices = similarity_matrix[idx].argsort()[::-1][1:4]
    similar_scores = similarity_matrix[idx][similar_indices]
    lookalike_data[customer_features.iloc[idx, 0]] = [
        (customer_features.iloc[i, 0], round(similar_scores[pos], 2))
        for pos, i in enumerate(similar_indices)
    ]

# Create Lookalike.csv
lookalike_df = pd.DataFrame([
    {"CustomerID": k, "Recommendations": v} for k, v in lookalike_data.items()
])
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model completed and Lookalike.csv generated.")


Lookalike model completed and Lookalike.csv generated.


In [2]:
from sklearn.metrics.pairwise import cosine_similarity

# Feature engineering
customer_transactions = transactions.groupby('CustomerID').agg({'TotalValue': 'sum'}).reset_index()

# Merging customer data
customer_data = pd.merge(customers, customer_transactions, on="CustomerID")

# Calculate similarity
similarity_matrix = cosine_similarity(customer_data[['TotalValue']])
print(similarity_matrix[:5])

[[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")

# Merge datasets
merged = pd.merge(customers, transactions, on="CustomerID")

# Feature Engineering
customer_features = merged.groupby("CustomerID").agg({
    "TotalValue": "sum",
    "Quantity": "sum",
    "Region": "first",
    "ProductID": lambda x: x.mode()[0]  # Most frequently purchased product
}).reset_index()

# Encode categorical features
le = LabelEncoder()
customer_features["Region"] = le.fit_transform(customer_features["Region"])
customer_features["ProductID"] = le.fit_transform(customer_features["ProductID"])

# Normalize numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.iloc[:, 1:])

# Compute cosine similarity
similarity_matrix = cosine_similarity(scaled_features)

# Find top 3 similar customers for the first 20 customers
lookalike_data = {}
for idx in range(20):
    similar_indices = similarity_matrix[idx].argsort()[::-1][1:4]
    similar_scores = similarity_matrix[idx][similar_indices]
    lookalike_data[customer_features.iloc[idx, 0]] = [
        (customer_features.iloc[i, 0], round(similar_scores[pos], 2))
        for pos, i in enumerate(similar_indices)
    ]

# Create Lookalike.csv
lookalike_df = pd.DataFrame([
    {"CustomerID": k, "Recommendations": v} for k, v in lookalike_data.items()
])
lookalike_df.to_csv("Lookalike.csv", index=False)

print("Lookalike model completed and Lookalike.csv generated.")

Lookalike model completed and Lookalike.csv generated.
