In [1]:
%pip install polars==1.33.1
%pip install numpy==2.3.3
%pip install pyarrow==17.0.0
%pip install pandas==2.2.3
%pip install transformers==4.56.1
%pip install sentencepiece==0.2.1

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --up

In [2]:
from collections import defaultdict

import json
import numpy as np
import pandas as pd
import pickle
import polars as pl

from transformers import LlamaModel, LlamaTokenizer

import torch
from torch.utils.data import DataLoader

from tqdm import tqdm as tqdm



In [3]:
# Inputs: the per-line review dump and the item metadata dump.
interactions_dataset_path = "../data/Beauty/Beauty_5.json"
metadata_path = "../data/Beauty/metadata.json"

# Outputs: per-user item sequences (JSON) and item content embeddings (pickle).
interactions_output_path = "../data/Beauty/inter_new.json"
embeddings_output_path = "../data/Beauty/content_embeddings.pkl"

In [4]:
# Parse the review dump (one JSON object per line) into three parallel columns.
df = defaultdict(list)

# JSON text is UTF-8 by specification, so do not depend on the platform default.
with open(interactions_dataset_path, 'r', encoding='utf-8') as f:
    # Iterate the file lazily instead of materialising every line via readlines().
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        review = json.loads(line)
        df['user_id'].append(review['reviewerID'])
        df['item_id'].append(review['asin'])
        df['timestamp'].append(review['unixReviewTime'])

print(f'Number of events: {len(df["user_id"])}')

# From here on `df` is a polars DataFrame with columns user_id, item_id, timestamp.
df = pl.from_dict(df)

Number of events: 198502


In [5]:
# Preview the first rows of the parsed interactions.
df.head()

user_id,item_id,timestamp
str,str,i64
"""A1YJEY40YUW4SE""","""7806397051""",1391040000
"""A60XNB876KYML""","""7806397051""",1397779200
"""A3G6XNM240RMWA""","""7806397051""",1378425600
"""A1PQFP6SAJ6D80""","""7806397051""",1386460800
"""A38FVHZTNQ271F""","""7806397051""",1382140800


In [6]:
# Start the filtering pipeline from a clone of `df`; `df` itself stays bound to the unfiltered frame.
filtered_df = df.clone()

In [7]:
# Processing dataset to get core-5 state in case full dataset is provided:
# repeatedly keep only users and items with at least `threshold` interactions
# until a pass removes nothing (dropping rows for one side can push the other
# side below the threshold, hence the loop).
is_changed = True
threshold = 5

while is_changed:
    # Interaction counts are taken from the same snapshot for both sides.
    user_counts = filtered_df.group_by('user_id').agg(
        pl.len().alias('user_count'),
    )
    item_counts = filtered_df.group_by('item_id').agg(
        pl.len().alias('item_count'),
    )

    good_users = user_counts.filter(pl.col('user_count') >= threshold).select(
        'user_id',
    )
    good_items = item_counts.filter(pl.col('item_count') >= threshold).select(
        'item_id',
    )

    old_size = len(filtered_df)

    # maintain_order='left' keeps the surviving rows in their original order.
    # Without it the join output order is unspecified, which would make any
    # later step that depends on row order (e.g. assigning ids by first
    # appearance) non-reproducible between runs.
    new_df = filtered_df.join(
        good_users, on='user_id', how='inner', maintain_order='left',
    )
    new_df = new_df.join(
        good_items, on='item_id', how='inner', maintain_order='left',
    )

    new_size = len(new_df)

    filtered_df = new_df
    # Stop once a full pass leaves the row count unchanged.
    is_changed = old_size != new_size


In [8]:
def _enumerate_first_seen(values):
    """Map each distinct value of a polars Series to its first-appearance rank (0-based)."""
    ordered = values.unique(maintain_order=True).to_list()
    return dict(zip(ordered, range(len(ordered))))


# Replace raw user ids with consecutive integers.
user_ids_mapping = _enumerate_first_seen(filtered_df["user_id"])
filtered_df = filtered_df.with_columns(
    pl.col("user_id").replace_strict(user_ids_mapping)
)

# Same treatment for item ids.
item_ids_mapping = _enumerate_first_seen(filtered_df["item_id"])
filtered_df = filtered_df.with_columns(
    pl.col("item_id").replace_strict(item_ids_mapping)
)

filtered_df.head()

user_id,item_id,timestamp
i64,i64,i64
0,0,1391040000
1,0,1397779200
2,0,1378425600
3,0,1386460800
4,0,1382140800


In [9]:
# Tabular form of the item id mapping: raw id -> re-indexed id.
old_item_ids = list(item_ids_mapping)
new_item_ids = list(item_ids_mapping.values())

item_ids_mapping_df = pl.from_dict(
    {'old_item_id': old_item_ids, 'new_item_id': new_item_ids}
)
item_ids_mapping_df.head()

old_item_id,new_item_id
str,i64
"""7806397051""",0
"""9759091062""",1
"""9788072216""",2
"""9790790961""",3
"""9790794231""",4


In [10]:
# Preview the interactions after both id columns were re-indexed.
filtered_df.head()

user_id,item_id,timestamp
i64,i64,i64
0,0,1391040000
1,0,1397779200
2,0,1378425600
3,0,1386460800
4,0,1382140800


In [11]:
# Order every user's events chronologically. maintain_order=True makes the sort
# stable: events that share a timestamp keep their existing relative order
# instead of an unspecified one, so the per-user item sequences are reproducible.
filtered_df = filtered_df.sort(["user_id", "timestamp"], maintain_order=True)

# Collapse to one row per user; every other column becomes a per-user list.
grouped_filtered_df = filtered_df.group_by("user_id", maintain_order=True).agg(pl.all())

In [12]:
# Preview the raw-id -> new-id lookup table for items.
item_ids_mapping_df.head()

old_item_id,new_item_id
str,i64
"""7806397051""",0
"""9759091062""",1
"""9788072216""",2
"""9790790961""",3
"""9790794231""",4


In [13]:
# Preview the per-user aggregated lists of item ids and timestamps.
grouped_filtered_df.head()

user_id,item_id,timestamp
i64,list[i64],list[i64]
0,"[6845, 7872, … 0]","[1318896000, 1318896000, … 1391040000]"
1,"[815, 10405, … 232]","[1392422400, 1396224000, … 1397779200]"
2,"[6049, 0, … 6608]","[1378425600, 1378425600, … 1400284800]"
3,"[5521, 5160, … 0]","[1379116800, 1380931200, … 1386460800]"
4,"[0, 10469, … 11389]","[1382140800, 1383523200, … 1388966400]"


In [14]:
# Summary statistics of the filtered dataset.
users_count = filtered_df['user_id'].n_unique()
items_count = filtered_df['item_id'].n_unique()
history_lengths = grouped_filtered_df['item_id'].list.len().to_list()

print('Users count:', users_count)
print('Items count:', items_count)
print('Actions count:', filtered_df.height)
print('Avg user history len:', np.mean(history_lengths))

Users count: 22363
Items count: 12101
Actions count: 198502
Avg user history len: 8.876358270357287


In [15]:
# One entry per user: user id -> item id sequence. The timestamp lists are not exported.
json_data = {
    user_id: item_ids
    for user_id, item_ids, _timestamps in grouped_filtered_df.iter_rows()
}

with open(interactions_output_path, 'w') as out_file:
    json.dump(json_data, out_file, indent=2)

## Content embedding creation

In [None]:
def getDF(path):
    """Load a dump with one Python-literal dict per line into a pandas DataFrame.

    Args:
        path: Path of a text file in which every non-blank line is the literal
            representation of a dict.

    Returns:
        A DataFrame with one row per parsed line, indexed 0..n-1 in file order.
        Columns are the union of the dict keys.

    Raises:
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    # literal_eval only accepts literals (dicts, lists, strings, numbers, ...),
    # so a crafted line cannot execute code the way a bare eval() would.
    import ast

    records = {}
    with open(path, 'r') as f:
        # Stream the file instead of loading every line up front.
        for line in f:
            if not line.strip():
                continue  # ignore blank lines such as a trailing newline
            records[len(records)] = ast.literal_eval(line)

    return pd.DataFrame.from_dict(records, orient="index")

# Load the item metadata into pandas.
# NOTE: this rebinds `df`, which earlier in the notebook held the interactions frame.
df = getDF(metadata_path)
df.head()

IOStream.flush timed out


Unnamed: 0,asin,salesRank,imUrl,categories,title,description,price,related,brand
0,1048791,{'Books': 6334800},http://ecx.images-amazon.com/images/I/51MKP0T4...,[[Books]],"The Crucible: Performed by Stuart Pankin, Jero...",,,,
1,143561,{'Movies & TV': 376041},http://g-ecx.images-amazon.com/images/G/01/x-s...,"[[Movies & TV, Movies]]","Everyday Italian (with Giada de Laurentiis), V...","3Pack DVD set - Italian Classics, Parties and ...",12.99,"{'also_viewed': ['B0036FO6SI', 'B000KL8ODE', '...",
2,37214,{'Clothing': 1233557},http://ecx.images-amazon.com/images/I/31mCncNu...,"[[Clothing, Shoes & Jewelry, Girls], [Clothing...",Purple Sequin Tiny Dancer Tutu Ballet Dance Fa...,,6.99,"{'also_viewed': ['B00JO8II76', 'B00DGN4R1Q', '...",Big Dreams
3,32069,,http://ecx.images-amazon.com/images/I/51EzU6qu...,"[[Sports & Outdoors, Other Sports, Dance, Clot...",Adult Ballet Tutu Cheetah Pink,,7.89,"{'also_bought': ['0000032050', 'B00D0DJAEG', '...",BubuBibi
4,31909,{'Toys & Games': 201847},http://ecx.images-amazon.com/images/I/41xBoP0F...,"[[Sports & Outdoors, Other Sports, Dance]]",Girls Ballet Tutu Neon Pink,High quality 3 layer ballet tutu. 12 inches in...,7.0,"{'also_bought': ['B002BZX8Z6', 'B00JHONN1S', '...",Unknown


In [None]:
def preprocess(row: pd.Series):
    """Build the text prompt describing one item.

    Args:
        row: Series with 'title', 'categories' and 'description' entries.
            'categories' is expected to be a sequence of category paths, of
            which only the first path is used.

    Returns:
        A string "Title: ... Categories: ... Description: ...." in which missing
        values are rendered as "None".
    """
    row = row.fillna("None")

    categories = row['categories']
    if isinstance(categories, str):
        # Missing categories were replaced by the "None" placeholder above;
        # joining categories[0] would emit only its first character ("N").
        categories_text = categories
    elif len(categories) == 0:
        # No category path at all: use the same placeholder instead of raising IndexError.
        categories_text = "None"
    else:
        categories_text = ', '.join(categories[0])

    return f"Title: {row['title']}. Categories: {categories_text}. Description: {row['description']}."


def get_data(metadata_df, item_ids_mapping_df):
    """Attach the re-indexed item ids to the metadata and build each item's text.

    Args:
        metadata_df: polars frame with at least 'asin', 'title', 'description'
            and 'categories' columns.
        item_ids_mapping_df: polars frame with 'old_item_id' and 'new_item_id'.

    Returns:
        A pandas DataFrame with columns new_item_id, title, description,
        categories and combined_text, restricted to items present in the mapping.
    """
    kept_columns = ['new_item_id', 'title', 'description', 'categories']

    # The inner join drops metadata rows whose asin has no entry in the mapping.
    matched = metadata_df.join(
        item_ids_mapping_df,
        left_on="asin",
        right_on='old_item_id',
        how="inner",
    )

    items = matched.select(kept_columns).to_pandas()
    items["combined_text"] = items.apply(preprocess, axis=1)
    return items


In [None]:
%pip list

Package                  Version
------------------------ -----------
absl-py                  2.3.1
boto3                    1.28.82
botocore                 1.31.85
certifi                  2025.8.3
charset-normalizer       3.4.3
cloudpickle              3.1.1
Cython                   3.1.4
debugpy                  1.8.17
filelock                 3.20.0
fsspec                   2025.9.0
grpcio                   1.75.1
hf-xet                   1.1.10
huggingface-hub          0.35.3
idna                     3.10
Jinja2                   3.1.6
jmespath                 1.0.1
Markdown                 3.9
MarkupSafe               3.0.3
matplotlib-inline        0.1.7
ml-kernel                0.1.0
mpmath                   1.3.0
murmurhash               1.0.13
networkx                 3.5
numpy                    2.3.3
nvidia-cublas-cu12       12.1.3.1
nvidia-cuda-cupti-cu12   12.1.105
nvidia-cuda-nvrtc-cu12   12.1.105
nvidia-cuda-runtime-cu12 12.1.105
nvidia-cudnn-cu12        9.1.0.70
nvidi

In [None]:
# Convert the pandas metadata to polars, join it with the item id mapping and build the text of every item.
data = get_data(pl.from_pandas(df), item_ids_mapping_df)

In [23]:
model_name = "huggyllama/llama-7b"
device = torch.device('cuda:6')

tokenizer = LlamaTokenizer.from_pretrained(model_name)
# Padding needs a pad token; reuse EOS when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the weights, move them to the target device and switch to eval mode.
model = LlamaModel.from_pretrained(model_name).to(device).eval()

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 8f4e8ca7-751e-49d6-93ce-39f449356f4f)')' thrown while requesting HEAD https://huggingface.co/huggyllama/llama-7b/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


OSError: huggyllama/llama-7b does not appear to have a file named model-00001-of-00002.safetensors. Checkout 'https://huggingface.co/huggyllama/llama-7b/tree/main' for available files.

In [None]:
class MyDataset:
    """Map-style dataset that tokenizes one item text per index.

    Args:
        data: pandas DataFrame with 'new_item_id' and 'combined_text' columns.
        tokenizer: Optional callable used to encode the text. When omitted, the
            module-level `tokenizer` is looked up at access time.
        max_length: Length every sequence is truncated / padded to.
    """

    def __init__(self, data, tokenizer=None, max_length=1024):
        # Read only the two needed columns; DataFrame.to_dict() would convert
        # every column of the frame (and was previously called twice).
        self._data = list(zip(data['new_item_id'].tolist(), data['combined_text'].tolist()))
        self._tokenizer = tokenizer
        self._max_length = max_length

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        item_id, text = self._data[idx]
        encode = self._tokenizer if self._tokenizer is not None else tokenizer
        # Padding to a fixed length gives every sample the same shape, so samples can be batched.
        inputs = encode(
            text,
            return_tensors="pt",
            max_length=self._max_length,
            truncation=True,
            padding="max_length",
        )
        return {
            'item_id': item_id,
            # The tokenizer returns a batch of size one; take its single row.
            'input_ids': inputs['input_ids'][0],
            'attention_mask': inputs['attention_mask'][0]
        }
    

# Wrap the item texts and iterate them in their stored order, eight at a time,
# keeping the final partial batch.
dataset = MyDataset(data)
loader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=False,
    drop_last=False,
    num_workers=10,
)


# Accumulators: item ids and their pooled embeddings, in loader order.
new_df = {
    'item_id': [],
    'embedding': []
}

for batch in tqdm(loader):
    attention_mask = batch["attention_mask"].to(device)  # (bs, sl)

    with torch.inference_mode():
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=attention_mask
        )
        token_embeddings = outputs.last_hidden_state  # (bs, sl, ed)

        # Zero the padded positions so they do not contribute to the sum.
        token_embeddings[~attention_mask.bool()] = 0.  # (bs, sl, ed)

        # Masked mean: divide by the number of real tokens per sample. A plain
        # mean(dim=1) divides by the padded length, which scales every vector
        # by (real tokens / padded length) and ties its norm to text length.
        token_counts = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)  # (bs, 1)
        embeddings = token_embeddings.sum(dim=1) / token_counts  # (bs, ed)

    new_df['item_id'] += batch['item_id'].tolist()
    new_df['embedding'] += embeddings.tolist()  # (bs, ed)


# Persist the collected item ids and embeddings as one pickled dict.
with open(embeddings_output_path, 'wb') as out_file:
    pickle.dump(new_df, out_file)
