In [1]:
import ast
import json
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
import polars as pl
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm as tqdm
from transformers import LlamaModel, LlamaTokenizer

In [2]:
# Outputs written by this notebook.
embeddings_output_path = "../data/Beauty/content_embeddings.pkl"
interactions_output_path = "../data/Beauty/inter_new.json"

# Inputs read by this notebook.
metadata_path = "../data/Beauty/metadata.json"
interactions_dataset_path = "../data/Beauty/Beauty_5.json"

In [3]:
# Read the review dump (one JSON object per line) and keep only the
# (user, item, timestamp) triple of every review.
records = defaultdict(list)

# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(interactions_dataset_path, 'r', encoding='utf-8') as f:
    # Iterate the file handle directly so lines are streamed instead of being
    # materialised all at once by `readlines()`.
    for line in f:
        if not line.strip():
            # A blank line (e.g. a trailing empty line) would make json.loads raise.
            continue
        review = json.loads(line)
        records['user_id'].append(review['reviewerID'])
        records['item_id'].append(review['asin'])
        records['timestamp'].append(review['unixReviewTime'])

print(f'Number of events: {len(records["user_id"])}')

# `df` is only ever bound to the polars frame, never to the intermediate dict.
df = pl.from_dict(records)

Number of events: 198502


In [4]:
# Preview the first rows of the raw interactions frame.
df.head()

user_id,item_id,timestamp
str,str,i64
"""A1YJEY40YUW4SE""","""7806397051""",1391040000
"""A60XNB876KYML""","""7806397051""",1397779200
"""A3G6XNM240RMWA""","""7806397051""",1378425600
"""A1PQFP6SAJ6D80""","""7806397051""",1386460800
"""A38FVHZTNQ271F""","""7806397051""",1382140800


In [5]:
# Filter a copy so that `df` keeps the unfiltered interactions.
filtered_df = df.clone()

In [6]:
# Reduce the interactions to their 5-core in case the full dataset is provided:
# keep only rows whose user AND item each have at least `threshold`
# interactions, and repeat until a pass removes nothing (dropping rows can push
# other users/items below the threshold).
threshold = 5
is_changed = True

while is_changed:
    old_size = len(filtered_df)

    # Both counts are evaluated on the frame as it is at the start of the pass,
    # i.e. users and items are filtered simultaneously.  `filter` keeps the
    # surviving rows in their existing order, which the later id re-mapping
    # depends on (ids are numbered by first appearance); a join-based filter
    # should not be relied on for row order unless it is requested explicitly.
    filtered_df = filtered_df.filter(
        (pl.len().over('user_id') >= threshold)
        & (pl.len().over('item_id') >= threshold)
    )

    new_size = len(filtered_df)
    is_changed = old_size != new_size


In [7]:
# Re-index users with dense integer ids, numbered in order of first appearance.
user_ids_mapping = {
    raw_user_id: new_user_id
    for new_user_id, raw_user_id in enumerate(
        filtered_df["user_id"].unique(maintain_order=True).to_list()
    )
}
filtered_df = filtered_df.with_columns(
    pl.col("user_id").replace_strict(user_ids_mapping)
)

# Same re-indexing for items.
item_ids_mapping = {
    raw_item_id: new_item_id
    for new_item_id, raw_item_id in enumerate(
        filtered_df["item_id"].unique(maintain_order=True).to_list()
    )
}
filtered_df = filtered_df.with_columns(
    pl.col("item_id").replace_strict(item_ids_mapping)
)

filtered_df.head()

user_id,item_id,timestamp
i64,i64,i64
0,0,1391040000
1,0,1397779200
2,0,1378425600
3,0,1386460800
4,0,1382140800


In [8]:
# Tabular view of the item id mapping: original id next to its new integer id.
old_item_ids = list(item_ids_mapping)
item_ids_mapping_df = pl.from_dict({
    'old_item_id': old_item_ids,
    'new_item_id': [item_ids_mapping[old_id] for old_id in old_item_ids],
})
item_ids_mapping_df.head()

old_item_id,new_item_id
str,i64
"""7806397051""",0
"""9759091062""",1
"""9788072216""",2
"""9790790961""",3
"""9790794231""",4


In [20]:
# Order every user's interactions chronologically.  `maintain_order=True` makes
# the sort stable, so interactions that share a (user_id, timestamp) keep their
# current relative order instead of an unspecified one, and re-running the
# notebook yields the same item sequences.
filtered_df = filtered_df.sort(["user_id", "timestamp"], maintain_order=True)

# One row per user; the remaining columns are collected into lists that follow
# the sorted row order.
grouped_filtered_df = filtered_df.group_by("user_id", maintain_order=True).agg(pl.all())

In [21]:
# Display the old -> new item id mapping again (same preview as when it was built).
item_ids_mapping_df.head()

old_item_id,new_item_id
str,i64
"""7806397051""",0
"""9759091062""",1
"""9788072216""",2
"""9790790961""",3
"""9790794231""",4


In [22]:
# Preview the per-user frame: one row per user with list-valued columns.
grouped_filtered_df.head()

user_id,item_id,timestamp
i64,list[i64],list[i64]
0,"[6845, 7872, … 0]","[1318896000, 1318896000, … 1391040000]"
1,"[815, 10405, … 232]","[1392422400, 1396224000, … 1397779200]"
2,"[6049, 0, … 6608]","[1378425600, 1378425600, … 1400284800]"
3,"[5521, 5160, … 0]","[1379116800, 1380931200, … 1386460800]"
4,"[0, 10469, … 11389]","[1382140800, 1383523200, … 1388966400]"


In [23]:
# Number of interactions in each user's history.
history_lengths = grouped_filtered_df['item_id'].list.len().to_list()

print('Users count:', filtered_df['user_id'].n_unique())
print('Items count:', filtered_df['item_id'].n_unique())
print('Actions count:', filtered_df.height)
print('Avg user history len:', np.mean(history_lengths))

Users count: 22363
Items count: 12101
Actions count: 198502
Avg user history len: 8.876358270357287


In [None]:
# Map every user id to that user's item sequence; timestamps are not exported.
json_data = {
    user_id: item_ids
    for user_id, item_ids, _timestamps in grouped_filtered_df.iter_rows()
}

with open(interactions_output_path, 'w') as out_file:
    json.dump(json_data, out_file, indent=2)

## Content embedding creation

In [None]:
def getDF(path):
    """Load a line-delimited metadata dump into a pandas DataFrame.

    Every non-blank line of ``path`` must hold one Python literal (a dict per
    record); it becomes one row, indexed 0, 1, 2, ... in file order.

    Lines are parsed with ``ast.literal_eval`` instead of ``eval``: the file is
    external data, and ``eval`` would execute any expression found in it,
    whereas ``literal_eval`` only accepts literals.

    Args:
        path: Path of the text file to read.

    Returns:
        pandas.DataFrame with one row per parsed line.

    Raises:
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    records = {}
    with open(path, 'r') as f:
        # Stream the file instead of reading every line into memory first.
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank / trailing empty lines
            records[len(records)] = ast.literal_eval(stripped)

    return pd.DataFrame.from_dict(records, orient="index")

# Parse the item metadata dump.  Note that this rebinds `df`, which until this
# cell referred to the interactions frame.
df = getDF(metadata_path)
df.head()

In [None]:
def preprocess(row: pd.Series) -> str:
    """Render one metadata row as the text that gets embedded.

    Args:
        row: Series with ``title``, ``description`` and ``categories`` entries.
            ``categories`` is expected to be a sequence of category paths; only
            the first path is used.

    Returns:
        ``"Title: <title>. Categories: <c1, c2, ...>. Description: <description>."``
        where every missing field is rendered as ``None``.
    """
    row = row.fillna("None")

    categories = row['categories']
    if isinstance(categories, str):
        # A missing cell is the string "None" after fillna; indexing it with
        # [0] would keep only its first character ("N").
        categories_text = categories
    elif len(categories) == 0:
        # No category path at all: avoid an IndexError on [0].
        categories_text = "None"
    else:
        categories_text = ', '.join(categories[0])

    return f"Title: {row['title']}. Categories: {categories_text}. Description: {row['description']}."


def get_data(metadata_df, item_ids_mapping_df):
    """Build the per-item text table for items that have a re-mapped id.

    Args:
        metadata_df: polars frame with ``asin``, ``title``, ``description`` and
            ``categories`` columns.
        item_ids_mapping_df: polars frame with ``old_item_id`` and
            ``new_item_id`` columns.

    Returns:
        pandas DataFrame with ``new_item_id``, ``title``, ``description``,
        ``categories`` and a ``combined_text`` column produced by
        ``preprocess``.  Metadata rows without a mapping entry (and mapping
        entries without metadata) are dropped by the inner join.
    """
    matched = metadata_df.join(
        item_ids_mapping_df,
        how="inner",
        left_on="asin",
        right_on='old_item_id',
    )
    items = matched.select(
        'new_item_id', 'title', 'description', 'categories'
    ).to_pandas()

    # Row-wise rendering of the text that is fed to the language model.
    items["combined_text"] = items.apply(preprocess, axis=1)
    return items


In [None]:
# `get_data` joins the metadata against the (old_item_id, new_item_id) frame,
# so it needs `item_ids_mapping_df`; the plain `item_ids_mapping` dict cannot
# be joined.
data = get_data(pl.from_pandas(df), item_ids_mapping_df)

In [None]:
model_name = "huggyllama/llama-7b"

# NOTE(review): the GPU index is hard-coded; adjust it for the machine at hand.
device = torch.device('cuda:6')

tokenizer = LlamaTokenizer.from_pretrained(model_name)
# The dataset's tokenizer call pads its inputs, so reuse EOS as the pad token
# when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the weights, move them to the chosen device and switch to eval mode.
model = LlamaModel.from_pretrained(model_name).to(device).eval()


class MyDataset:
    """Indexable collection of (item id, tokenised item text) samples.

    Tokenisation is done lazily in ``__getitem__`` with the module-level
    ``tokenizer``.
    """

    def __init__(self, data):
        # `data` must provide 'new_item_id' and 'combined_text' columns.
        columns = data.to_dict()
        item_ids = columns['new_item_id'].values()
        texts = columns['combined_text'].values()
        self._data = [(item_id, text) for item_id, text in zip(item_ids, texts)]

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        item_id, text = self._data[idx]
        # Every text is truncated / padded to exactly 1024 tokens.
        encoded = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=1024,
            return_tensors="pt",
        )
        # The tokenizer returns a batch of one; [0] drops that leading axis.
        return {
            'item_id': item_id,
            'input_ids': encoded['input_ids'][0],
            'attention_mask': encoded['attention_mask'][0],
        }
    

dataset = MyDataset(data)
loader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=False,
    drop_last=False,
    num_workers=10,
)

# Accumulators filled by the embedding loop: item ids and their embeddings.
new_df = {'item_id': [], 'embedding': []}

# Embed every item text as the mean of the model's last hidden states over the
# real (non-padding) tokens.
for batch in tqdm(loader):
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    with torch.inference_mode():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        hidden = outputs.last_hidden_state                      # (bs, sl, ed)
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)    # (bs, sl, 1)

        # Divide by the number of real tokens, not by the padded length:
        # a plain `.mean(dim=1)` over zeroed padding positions would shrink
        # the embedding of short texts in proportion to their padding.
        token_counts = mask.sum(dim=1).clamp(min=1)             # (bs, 1)
        embeddings = (hidden * mask).sum(dim=1) / token_counts  # (bs, ed)

    new_df['item_id'] += batch['item_id'].tolist()
    new_df['embedding'] += embeddings.tolist()


# Persist the {'item_id': [...], 'embedding': [...]} accumulator as a pickle.
with open(embeddings_output_path, 'wb') as out_file:
    pickle.dump(new_df, out_file)
