In [None]:
!pip install -q tqdm
from tqdm import tqdm

In [None]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
import glob
import json
from collections import defaultdict

import faiss
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [None]:
# Mount Google Drive so files under /content/drive are readable (only works inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Step 1: Get file list
# sorted() keeps the processing order deterministic if the pattern is ever widened to a wildcard.
files = sorted(glob.glob("/content/drive/MyDrive/amazon_cleaned_chunks/df10_user_history.json"))

# Fail fast: an empty match would otherwise yield an empty DataFrame and a confusing KeyError later.
if not files:
    raise FileNotFoundError(
        "No user-history JSON found under /content/drive/MyDrive/amazon_cleaned_chunks/ "
        "(is Google Drive mounted?)"
    )

In [None]:
# Step 2: Parse and flatten
# Each file is read as a mapping reviewerID -> list of [asin, rating, text] triples;
# every triple becomes one flat record.
records = []
skipped = 0  # entries that do not have exactly three fields
for file in files:
    # Explicit encoding so parsing does not depend on the platform default.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
    for reviewer_id, reviews in data.items():
        for review in reviews:
            if len(review) != 3:
                skipped += 1
                continue
            asin, rating, text = review
            records.append({
                "reviewerID": reviewer_id,
                "asin": asin,
                "rating": rating,
                "reviewText": text
            })

# Surface dropped entries instead of discarding them silently.
if skipped:
    print(f"Skipped {skipped} malformed review entries (expected [asin, rating, text]).")


In [None]:
# Step 3: Create DataFrame
# Naming the columns keeps the schema stable even when `records` is empty, so later
# column lookups fail on missing data rather than on missing columns.
df = pd.DataFrame(records, columns=["reviewerID", "asin", "rating", "reviewText"])

In [None]:
# Step 4: Check output
# Sanity-check the flattened frame: (rows, columns), the column names, then the first rows.
print(df.shape)
print(df.columns.tolist())
df.head()

(6739590, 4)
['reviewerID', 'asin', 'rating', 'reviewText']


Unnamed: 0,reviewerID,asin,rating,reviewText
0,AAP7PPBU72QFM,0151004714,5,this is the best novel i have read in or years...
1,AAP7PPBU72QFM,B00005AXIV,5,i have really poor eyes ##ight and wear bi ##f...
2,AAP7PPBU72QFM,B00006B7TL,5,tried the system without this little ga ##dget...
3,AAP7PPBU72QFM,B00017IX10,5,sits on my desk next to my computer and phone ...
4,AAP7PPBU72QFM,B0002OZXHO,5,im new to this type of ga ##dget and am blown ...


In [None]:
# Save to Parquet format (efficient for future reads)
# The path is relative, so the file lands in the current working directory.
df.to_parquet("df10_user_history.parquet", index=False)

In [None]:
# Build contiguous integer ids for users and items, numbered in order of first appearance.
def _index_by_first_appearance(column):
    """Map each distinct value of `column` to its position in `column.unique()`."""
    lookup = {}
    for position, value in enumerate(column.unique()):
        lookup[value] = position
    return lookup


user2id = _index_by_first_appearance(df['reviewerID'])
item2id = _index_by_first_appearance(df['asin'])

# Attach the integer ids as new columns next to the original string ids.
df['user_id'] = df['reviewerID'].map(user2id)
df['item_id'] = df['asin'].map(item2id)


In [None]:
# Keep Only Needed Columns
# The model consumes only the integer ids and the rating.
df_mf = df[['user_id', 'item_id', 'rating']]


In [None]:
# Define PyTorch Dataset
class AmazonDataset(Dataset):
    """Tensor-backed (user, item, rating) samples built from an interaction DataFrame.

    The frame must provide `user_id`, `item_id` and `rating` columns. Each column is
    copied into a tensor once, at construction time.
    """

    def __init__(self, dataframe):
        def column_as_tensor(name, dtype):
            return torch.tensor(dataframe[name].values, dtype=dtype)

        self.users = column_as_tensor('user_id', torch.long)
        self.items = column_as_tensor('item_id', torch.long)
        self.ratings = column_as_tensor('rating', torch.float)

    def __len__(self):
        # One sample per rating row.
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        sample = (self.users[idx], self.items[idx], self.ratings[idx])
        return sample

In [None]:
# Build Model (Deep Matrix Factorization)
class DeepMF(nn.Module):
    """Rating predictor: user and item embeddings concatenated and fed through an MLP.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        embedding_dim: width of each embedding vector.
    """

    def __init__(self, num_users, num_items, embedding_dim=64):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, embedding_dim)
        self.item_emb = nn.Embedding(num_items, embedding_dim)

        self.mlp = nn.Sequential(
            nn.Linear(2 * embedding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, user, item):
        """Return one predicted rating per (user, item) pair.

        Args:
            user: 1-D long tensor of user indices.
            item: 1-D long tensor of item indices, same length as `user`.

        Returns:
            1-D float tensor with one prediction per input pair.
        """
        user_vec = self.user_emb(user)
        item_vec = self.item_emb(item)
        x = torch.cat([user_vec, item_vec], dim=1)
        # Drop only the trailing size-1 dim: a bare squeeze() would also remove the batch
        # dim for a batch of one, giving a 0-d output that no longer matches the target.
        return self.mlp(x).squeeze(-1)

In [None]:
## Add Validation Split (80/20)

# Split df into train/val
# random_state is fixed so the same rows land in each split on every run.
df_train, df_val = train_test_split(df_mf, test_size=0.2, random_state=42)

# Create Datasets
train_dataset = AmazonDataset(df_train)
val_dataset = AmazonDataset(df_val)

In [None]:
## Add DataLoaders

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False)

### Deep Matrix Factorization model Training

Hyperparams
1. embedding_dim = 64
2. batch_size = 1024
3. num_epochs = 5
4. lr = 1e-3

In [None]:
# Hyperparameters (same values as listed in the markdown cell above)
embedding_dim = 64
num_epochs = 5
lr = 1e-3

# Use the GPU when present instead of hard-coding 'cuda'.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): no cell in this notebook creates `model`, `criterion` or `optimizer`, so
# this loop raised NameError on a fresh kernel. MSE loss and Adam are assumed here
# (rating regression, learning rate as documented above) — confirm against the original run.
model = DeepMF(num_users=len(user2id), num_items=len(item2id), embedding_dim=embedding_dim).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0.0
    for user, item, rating in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        user, item, rating = user.to(device), item.to(device), rating.to(device)

        optimizer.zero_grad()
        preds = model(user, item)
        loss = criterion(preds, rating)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for user, item, rating in val_loader:
            user, item, rating = user.to(device), item.to(device), rating.to(device)
            preds = model(user, item)
            loss = criterion(preds, rating)
            val_loss += loss.item()

    # Report per-batch means: raw sums scale with the number of batches, so the
    # train and validation sums are not comparable with each other.
    train_loss_avg = total_loss / max(len(train_loader), 1)
    val_loss_avg = val_loss / max(len(val_loader), 1)
    print(f"Epoch {epoch+1} → Train Loss: {train_loss_avg:.4f} | Val Loss: {val_loss_avg:.4f}")

Epoch 1/5: 100%|██████████| 5266/5266 [03:05<00:00, 28.32it/s]


Epoch 1 → Train Loss: 6731.8387 | Val Loss: 1630.6191


Epoch 2/5: 100%|██████████| 5266/5266 [03:04<00:00, 28.60it/s]


Epoch 2 → Train Loss: 5877.5012 | Val Loss: 1592.8761


Epoch 3/5: 100%|██████████| 5266/5266 [03:07<00:00, 28.14it/s]


Epoch 3 → Train Loss: 5215.8541 | Val Loss: 1607.5749


Epoch 4/5: 100%|██████████| 5266/5266 [03:09<00:00, 27.82it/s]


Epoch 4 → Train Loss: 4616.6701 | Val Loss: 1607.3322


Epoch 5/5: 100%|██████████| 5266/5266 [03:09<00:00, 27.75it/s]


Epoch 5 → Train Loss: 4119.4323 | Val Loss: 1652.8468


We will extract the learned user embeddings from the trained Deep Matrix Factorization model. These embeddings capture personalized preferences (based on past interactions), and will be used in Phase 5 to tailor recommendations or prompts.

This allows the generator to personalize its wording, for example: "Since you enjoy e-readers, you might like this one."
Once this is done, you’ll have a ready-to-use persona vector for every user to inject into prompts in the generation phase.

In [None]:
## Extract and Save User Embeddings

# Pull the learned user table off the model as a NumPy array (row i is internal user id i).
user_embeddings = model.user_emb.weight.detach().cpu().numpy()

# Invert user2id so an internal id can be turned back into its reviewerID.
id2user = {internal_id: reviewer for reviewer, internal_id in user2id.items()}

# {reviewerID: embedding as a plain Python list}
user_embedding_dict = {
    id2user[row]: vector.tolist()
    for row, vector in enumerate(user_embeddings)
}


In [None]:
# Persist the {reviewerID: embedding list} mapping as JSON in the working directory.
with open("user_embeddings.json", "w") as f:
    json.dump(user_embedding_dict, f)

### Build the FAISS Index (from user embeddings)

In [None]:
# Position -> reviewerID lookup; row i of the index below corresponds to user_id_list[i].
user_id_list = list(user_embedding_dict)

# FAISS needs a float32 matrix; rows follow the same order as user_id_list.
user_matrix = np.array(
    [user_embedding_dict[reviewer] for reviewer in user_id_list],
    dtype='float32',
)

# Scale every row to unit length (in place) so inner product equals cosine similarity.
faiss.normalize_L2(user_matrix)

# Exact inner-product index over the normalized vectors.
user_index = faiss.IndexFlatIP(user_matrix.shape[1])
user_index.add(user_matrix)

In [None]:
#  Query Similar Users

def get_similar_users(user_vec, top_k=5):
    """Return the users whose embeddings are most similar to `user_vec`.

    Args:
        user_vec: embedding (sequence of floats) with the same dimension as the
            vectors stored in `user_index`.
        top_k: maximum number of neighbours to return.

    Returns:
        List of (reviewerID, similarity) tuples in the order FAISS returns them.
        The list is shorter than `top_k` when the index holds fewer vectors, and
        empty when `top_k` is not positive.
    """
    if top_k <= 0:
        return []

    # np.array copies, so the in-place normalization never touches the caller's vector.
    vec = np.array(user_vec, dtype='float32').reshape(1, -1)
    faiss.normalize_L2(vec)
    D, I = user_index.search(vec, top_k)

    # FAISS pads missing results with label -1; user_id_list[-1] would silently
    # return the last user, so those slots are dropped.
    return [(user_id_list[i], D[0][j]) for j, i in enumerate(I[0]) if i >= 0]


In [None]:
# Example
# Look up the neighbours of one known reviewer; the query vector is itself in the index.
user_vec = user_embedding_dict["AAP7PPBU72QFM"]
similar_users = get_similar_users(user_vec, top_k=5)
print(similar_users)


[('AAP7PPBU72QFM', np.float32(1.0)), ('A1J9TY17XBME4Z', np.float32(0.60383415)), ('A190YLEZ53BK5I', np.float32(0.5407624)), ('ARPEZ3KXLFK8K', np.float32(0.53266597)), ('AECX92ODFJRO4', np.float32(0.50579137))]


In [None]:
def get_fallback_persona(user_id):
    """Borrow a persona label from the nearest neighbour that has a cluster assignment.

    Args:
        user_id: reviewerID to find a persona for.

    Returns:
        The cluster label of the most similar other user that appears in
        `user_cluster_map`, or a generic sentence when the user has no embedding
        or none of its neighbours is clustered.
    """
    default_persona = "This user has no similar profile available yet."

    user_vec = user_embedding_dict.get(user_id)
    if user_vec is None:
        # No embedding means there is nothing to search with.
        return default_persona

    for sim_user, _score in get_similar_users(user_vec, top_k=5):
        # Skip the query user by id, not by position: with tied scores the user
        # itself is not guaranteed to be the first hit.
        if sim_user == user_id:
            continue
        if sim_user in user_cluster_map:
            cluster_id = user_cluster_map[sim_user]
            return cluster_to_label[cluster_id]
    return default_persona

In [None]:
# `fallback_persona` was never assigned anywhere in the notebook, so it is derived here
# from the example reviewer used elsewhere.
# NOTE(review): the recorded output does not show which user it was generated for — confirm.
# This cell also relies on `pipe`, `user_cluster_map` and `cluster_to_label`, which are
# created in cells further down the notebook.
fallback_persona = get_fallback_persona("AAP7PPBU72QFM")

prompt = f"""{fallback_persona}
Candidate: Kindle Paperwhite
Tags: e-reader, glare-free, waterproof
Write one line on why the user might enjoy this product:
"""

# The pipeline returns a list with one dict per generated sequence.
output = pipe(prompt, max_new_tokens=50)[0]["generated_text"]
print(output)


This user is interested in head, got, love.
Candidate: Kindle Paperwhite
Tags: e-reader, glare-free, waterproof
Write one line on why the user might enjoy this product:

The Kindle Paperwhite is a perfect choice for someone who loves to read head over heels, but also finds themselves in situations where glare is a problem or they may need to read in the presence of water.

Candidate: Nintendo


### Cluster User Embeddings (powerful)
This helps generate generalized personas if you don’t want one prompt per user.

In [None]:
# Load embeddings
# Rows follow user_embedding_dict's key order, so cluster_labels[i] belongs to the i-th key.
X = np.array(list(user_embedding_dict.values()))

# Cluster into, say, 20 user types
# random_state is fixed so repeated runs start from the same initialisation.
kmeans = KMeans(n_clusters=20, random_state=42)
cluster_labels = kmeans.fit_predict(X)

Generate Persona Labels - Assign soft descriptors to clusters (manual or automatic using TF-IDF later).

In [None]:
## Build a user → reviews dictionary
# One grouped aggregation replaces df.iterrows(), which builds a Series for every row
# and is very slow on a frame with millions of rows.
# sort=False keeps users in order of first appearance, and each list keeps that user's
# reviews in row order, so the contents match what the row-by-row loop produced.
user_reviews = defaultdict(
    list,
    df.groupby('reviewerID', sort=False)['reviewText'].agg(list).to_dict(),
)

In [None]:
## Group users by cluster and gather reviews

# reviewerID -> cluster id; the i-th key of user_embedding_dict takes cluster_labels[i].
user_cluster_map = {}
for position, reviewer in enumerate(user_embedding_dict):
    user_cluster_map[reviewer] = int(cluster_labels[position])

# cluster id -> every review text written by the users assigned to that cluster.
cluster_reviews = defaultdict(list)
for reviewer, assigned_cluster in user_cluster_map.items():
    cluster_reviews[assigned_cluster] += user_reviews.get(reviewer, [])

In [None]:
## TF-IDF and Top Terms per Cluster

TOP_TERMS_PER_CLUSTER = 5
# max_features keeps the most frequent terms across the corpus; a very small value
# leaves only the most common words, which are shared by every cluster.
TFIDF_MAX_FEATURES = 5000

# Generic review words that describe no particular interest.
custom_stopwords = {"great", "good", "just", "use", "works", "really", "very", "thing", "product", "get"}

cluster_to_label = {}

# One document per cluster, kept in a fixed order so matrix rows map back to cluster ids.
cluster_ids = list(cluster_reviews.keys())
cluster_docs = [" ".join(cluster_reviews[cid]) for cid in cluster_ids]

if cluster_ids:
    # Fit once over all cluster documents: fitting on a single document makes the IDF
    # factor constant, so scores reduce to raw term frequency.
    # Stop words are given to the vectorizer so the vocabulary and the score vector stay
    # aligned; filtering the term list afterwards shifts indices and drops words.
    vectorizer = TfidfVectorizer(
        stop_words=sorted(ENGLISH_STOP_WORDS | custom_stopwords),
        max_features=TFIDF_MAX_FEATURES,
        lowercase=True,
        token_pattern=r"(?u)\b\w\w+\b"
    )
    tfidf_matrix = vectorizer.fit_transform(cluster_docs)
    terms = vectorizer.get_feature_names_out()

    for row, cluster_id in enumerate(cluster_ids):
        tfidf_scores = tfidf_matrix[row].toarray().ravel()

        # Highest-scoring terms first; ignore terms that never occur in this cluster.
        top_indices = np.argsort(tfidf_scores)[::-1][:TOP_TERMS_PER_CLUSTER]
        top_words = [terms[i] for i in top_indices if tfidf_scores[i] > 0]

        label = f"This user is interested in {', '.join(top_words)}."
        cluster_to_label[cluster_id] = label

In [None]:
cluster_to_label

{2: 'This user is interested in head, got, love.',
 1: 'This user is interested in head, got, love.',
 10: 'This user is interested in head, got, love.',
 3: 'This user is interested in head, got, love.',
 6: 'This user is interested in head, got, love.',
 0: 'This user is interested in head, got, love.',
 19: 'This user is interested in head, got, love.',
 13: 'This user is interested in head, got, love.',
 14: 'This user is interested in head, got, little.',
 12: 'This user is interested in head, got, little.',
 17: 'This user is interested in head, got, love.',
 15: 'This user is interested in head, got, love.',
 9: 'This user is interested in head, got, love.',
 7: 'This user is interested in head, got, little.',
 11: 'This user is interested in head, got, little.',
 18: 'This user is interested in head, got, little.',
 8: 'This user is interested in head, got, little.',
 4: 'This user is interested in head, got, little.',
 5: 'This user is interested in head, got, little.',
 16: '

In [None]:
import json

# reviewerID -> cluster id
with open("user_cluster_map.json", "w") as f:
    json.dump(user_cluster_map, f)

# cluster id -> persona sentence. JSON object keys are always strings, so the integer
# cluster ids come back as strings when this file is loaded.
with open("cluster_to_label.json", "w") as f:
    json.dump(cluster_to_label, f)


### Inject Cluster-Based Persona into Prompt

In [None]:
# Resolve the persona sentence for one user: reviewerID -> cluster id -> label.
user_id = "AAP7PPBU72QFM"  # example
cluster_id = user_cluster_map[user_id]
persona = cluster_to_label[cluster_id]

In [None]:
# Example product
candidate = "Sony WH-1000XM4"
tags = "noise-canceling, over-ear, wireless"

In [None]:
# Prompt for generation
prompt = f"""{persona}
Candidate: {candidate}
Tags: {tags}
Write one line on why the user might enjoy this product:"""

In [None]:
from transformers import pipeline

# Load the weights in half precision: the recorded run of this cell hit a CUDA
# out-of-memory error on a ~22 GiB GPU.
# NOTE(review): assumes the failure came from loading the model at full precision while
# the training tensors were still resident on the GPU — confirm on the target runtime.
pipe = pipeline(
    "text-generation",
    model="openchat/openchat-3.5-0106",
    torch_dtype=torch.float16,
    device=0  # use 0 if running on GPU
)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Device set to use cuda:0


OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 22.16 GiB of which 21.38 MiB is free. Process 10527 has 22.13 GiB memory in use. Of the allocated memory 21.69 GiB is allocated by PyTorch, and 212.54 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Generate Example
output = pipe(prompt, max_new_tokens=50)[0]["generated_text"]
print(output)

This user is interested in head, got, love.
Candidate: Sony WH-1000XM4
Tags: noise-canceling, over-ear, wireless
Write one line on why the user might enjoy this product: The Sony WH-1000XM4 headphones offer excellent noise-canceling, long battery life, and a comfortable fit, making them perfect for users who love music and travel.

This user is interested in head, got
