In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

# Step 1: Load data
# Read the three source tables. The date columns arrive as text, so they are
# converted to datetime as part of loading each frame.
customers = pd.read_csv("Customers.csv").assign(
    SignupDate=lambda frame: pd.to_datetime(frame["SignupDate"])
)
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv").assign(
    TransactionDate=lambda frame: pd.to_datetime(frame["TransactionDate"])
)

# Build one wide table: every transaction row enriched with the matching
# customer attributes (joined on CustomerID) and product attributes (joined on
# ProductID). Both merges use pandas' default join type.
data = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)

In [2]:
# Step 2: Feature Engineering
# Collapse each customer's transactions into one row of numeric features.
feature_columns = ["total_spend", "avg_order_value", "total_quantity", "unique_categories"]
customer_features = data.groupby("CustomerID").agg(
    total_spend=("TotalValue", "sum"),
    avg_order_value=("TotalValue", "mean"),
    total_quantity=("Quantity", "sum"),
    unique_categories=("Category", "nunique")
)

# Customers without any transaction never appear in `data`, so they would be
# missing here and later surface as NaN (rendered as the text "nan") in the
# profile description. Give them explicit zero-valued features *before*
# scaling so every customer in `customers` has a well-defined profile.
all_customer_ids = pd.Index(customers["CustomerID"].unique(), name="CustomerID")
customer_features = (
    customer_features
    .reindex(all_customer_ids, fill_value=0)
    .reset_index()
)

# Normalize customer features to the [0, 1] range, column by column.
scaler = MinMaxScaler()
normalized_features = pd.DataFrame(
    scaler.fit_transform(customer_features[feature_columns]),
    columns=feature_columns,
    index=customer_features["CustomerID"]
)

# Combine normalized features with customer information (one row per customer).
customer_data = customers.set_index("CustomerID").join(normalized_features)

# Generate textual descriptions for embedding.
# A missing Region would turn the whole concatenated string into NaN, which
# the embedding model cannot encode, so substitute a placeholder label.
customer_data['profile_text'] = (
    "Region: " + customer_data['Region'].fillna("Unknown").astype(str) +
    ". Total spend: " + customer_data['total_spend'].astype(str) +
    ". Avg order value: " + customer_data['avg_order_value'].astype(str) +
    ". Total quantity: " + customer_data['total_quantity'].astype(str) +
    ". Unique categories: " + customer_data['unique_categories'].astype(str)
)

In [3]:
# Step 3: Generate Embeddings
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encode all profile texts in one batched call instead of one model call per
# row; the result is a 2-D array with one embedding per customer, in the same
# order as `customer_data`.
embeddings_matrix = np.asarray(model.encode(customer_data['profile_text'].tolist()))

# Keep the per-customer embedding column available for later inspection.
customer_data['embeddings'] = list(embeddings_matrix)

# Step 4: Dimensionality Reduction
# PCA cannot extract more components than min(n_samples, n_features), so cap
# the target of 50 to avoid a ValueError on small customer tables.
n_components = min(50, *embeddings_matrix.shape)
pca = PCA(n_components=n_components, random_state=42)  # Reduce dimensions for faster similarity calculations
reduced_embeddings = pca.fit_transform(embeddings_matrix)

# Step 5: Similarity Computation
# Row i / column j holds the cosine similarity between customers i and j
# (positions follow the row order of `customer_data`).
similarity_matrix = cosine_similarity(reduced_embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Step 6: Recommendation System
# For each of the first 20 customers, keep the 3 most similar *other*
# customers together with their similarity score.
top_lookalikes = {}
for i, cust_id in enumerate(customer_data.index[:20]):  # First 20 customers
    # Drop the customer's own entry by position rather than assuming it sorts
    # first: ties (identical profiles) or floating-point rounding can place
    # another customer ahead of self, which would leak self into the results.
    candidates = [
        (j, score) for j, score in enumerate(similarity_matrix[i]) if j != i
    ]
    best_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:3]
    # Convert to a plain Python float before rounding so the exported text
    # shows e.g. 0.987 instead of a numpy scalar repr.
    top_lookalikes[cust_id] = [
        (customer_data.index[j], round(float(score), 3)) for j, score in best_matches
    ]

# Step 7: Export Results
# One row per customer; the Lookalikes column holds the list of
# (CustomerID, score) pairs.
lookalike_df = pd.DataFrame([
    {"CustomerID": cust_id, "Lookalikes": lookalikes}
    for cust_id, lookalikes in top_lookalikes.items()
])
lookalike_df.to_csv("FirstName_LastName_Lookalike.csv", index=False)