In [7]:
# Directory that holds both the input metadata file and the output embeddings.
DATASET_DIR = '/Users/jakubmalczak/UNI/INŻ/SequentialRecommendation/recbole/data/dataset/Amazon_Sports_and_Outdoors'

# Input: item metadata, read line by line as JSON records.
JSONL_PATH = DATASET_DIR + '/meta_Sports_and_Outdoors_2023.jsonl'
# Output: parquet file the item embeddings are written to.
SAVE_PATH = DATASET_DIR + '/title_embeddings_2023.parquet'

### Reading JSONL file

In [2]:
import pandas as pd
import json

# Read the metadata file one JSON object per line and keep only the two fields
# needed downstream: the item identifier and its title.
data = []
with open(JSONL_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        # Blank or whitespace-only lines are not valid JSON and would make
        # json.loads raise; skip them instead of aborting the whole load.
        if not line.strip():
            continue
        record = json.loads(line)
        data.append(
            {
                # .get() yields None when a key is absent, so a record with a
                # missing field still produces a row.
                "item_id": record.get("parent_asin"),
                "title": record.get("title"),
            }
        )
# Build the frame once from the accumulated records.
df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,item_id,title
0,B01HDXC8AG,Sure-Grip Zombie Wheels Low 59mm 4 Pack
1,B07R5BQ4YD,USGI Wet Weather Bag (Fоur Paсk)
2,B003K8GZ7G,NHL San Jose Sharks Team Logo Post Earrings
3,B08GC4GBWB,Bont Skates - Prostar Purple Suede Professiona...
4,B07BYV947H,Team Golf Alamaba Crimson Tide Embroidered Tow...


### Generating embeddings

In [3]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Titles can be missing (None/NaN). Without fillna, astype('str') would turn
# them into the literal text 'None'/'nan', which would then be embedded as if
# it were a real product title. An empty string is used instead.
titles = df['title'].fillna('').astype('str').tolist()
model = SentenceTransformer('all-MiniLM-L6-v2')
batch_size = 512
# Collected as a flat list with one vector per title, in the same order as df,
# so it can later be assigned directly as a DataFrame column.
embeddings = []
for i in tqdm(range(0, len(titles), batch_size)):
    batch = titles[i:i + batch_size]
    # The outer tqdm loop already reports progress, so the per-call bar is off.
    emb = model.encode(batch, show_progress_bar=False)
    embeddings.extend(emb)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 3101/3101 [16:55<00:00,  3.05it/s]


### Reducing dimensions to 64

In [4]:
from sklearn.decomposition import PCA

# Project the title embeddings down to 64 components.
# random_state is fixed so the projection is reproducible across runs whenever
# scikit-learn selects a randomized SVD solver; it has no effect otherwise.
pca = PCA(n_components=64, random_state=42)
embeddings_64 = pca.fit_transform(embeddings)
# Fraction of the total variance retained by the 64 kept components.
print(pca.explained_variance_ratio_.sum())

0.6400746254701076


### Reducing dimensions to 128

In [5]:
# Project the title embeddings down to 128 components.
# NOTE: this rebinds `pca`, so any previously fitted PCA object under that name
# is no longer reachable after this cell runs.
# random_state is fixed so the projection is reproducible across runs whenever
# scikit-learn selects a randomized SVD solver; it has no effect otherwise.
pca = PCA(n_components=128, random_state=42)
embeddings_128 = pca.fit_transform(embeddings)
# Fraction of the total variance retained by the 128 kept components.
print(pca.explained_variance_ratio_.sum())

0.8245454584608822


### Saving embeddings to parquet

In [8]:
# Keep only the item identifier. .copy() makes this an independent frame, so
# the column assignments below do not act on a slice of the original frame
# (which triggers pandas' SettingWithCopyWarning).
df = df[['item_id']].copy()
# One embedding per row, in the same order the titles were encoded.
df['embedding_384'] = embeddings
# .tolist() turns each row of the 2-D PCA output into a Python list so that
# every cell of the column holds a single vector.
df['embedding_128'] = embeddings_128.tolist()
df['embedding_64'] = embeddings_64.tolist()
df.to_parquet(SAVE_PATH)