In [1]:
TSV_PATH = '/recbole/data/dataset/Amazon_Sports_and_Outdoors/Amazon_Sports_and_Outdoors_2014.item'
SAVE_PATH = '/Users/jakubmalczak/UNI/INŻ/SequentialRecommendation/recbole/data/dataset/Amazon_Sports_and_Outdoors/title_embeddings_2014.parquet'

### Reading TSV file

In [2]:
import pandas as pd

df = pd.read_csv(TSV_PATH, delimiter='\t')
df = df[['item_id:token', 'title:token']]
df.rename(columns={'item_id:token': 'item_id', 'title:token': 'title'}, inplace=True)
df.head(5)

Unnamed: 0,item_id,title
0,32069,Adult Ballet Tutu Cheetah Pink
1,31909,Girls Ballet Tutu Neon Pink
2,32034,Adult Ballet Tutu Yellow
3,31852,Girls Ballet Tutu Zebra Hot Pink
4,32050,Adult Ballet Tutu Purple


### Generate embeddings

In [3]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

titles = df['title'].astype('str').tolist()
model = SentenceTransformer('all-MiniLM-L6-v2')
batch_size = 512
embeddings = []
for i in tqdm(range(0, len(titles), batch_size)):
    batch = titles[i:i + batch_size]
    emb = model.encode(batch, show_progress_bar=False)
    embeddings.extend(emb)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 1040/1040 [04:24<00:00,  3.93it/s]


### Reducing dimensions to 64

In [4]:
from sklearn.decomposition import PCA

pca = PCA(n_components=64)
embeddings_64 = pca.fit_transform(embeddings)
print(pca.explained_variance_ratio_.sum())

0.6277926926421573


### Reducing dimensions to 128

In [5]:
pca = PCA(n_components=128)
embeddings_128 = pca.fit_transform(embeddings)
print(pca.explained_variance_ratio_.sum())

0.8215115151153491


### Saving embeddings to parquet

In [9]:
df = df[['item_id']]
df['embedding_384'] = embeddings
df['embedding_128'] = embeddings_128.tolist()
df['embedding_64'] = embeddings_64.tolist()
df.to_parquet(SAVE_PATH)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['embedding_384'] = embeddings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['embedding_128'] = embeddings_128.tolist()
