In [4]:
# Dataset directory and base file name shared by the input and output files.
_DATASET_DIR = '/Users/jakubmalczak/UNI/INŻ/SequentialRecommendation/recbole/data/dataset/Amazon_Sports_and_Outdoors'
_DATASET_NAME = 'Amazon_Sports_and_Outdoors'

# Input: the tab-separated .item file read by the loading cell.
TSV_PATH = f'{_DATASET_DIR}/{_DATASET_NAME}.item'
# Output: where the generated .ent file is written by the saving cell.
SAVE_PATH = f'{_DATASET_DIR}/{_DATASET_NAME}.ent'

### Reading TSV file

In [6]:
import pandas as pd

# Only the item id and the title are used by the later cells.
wanted_columns = ['item_id:token', 'title:token']

# The file at TSV_PATH is tab-separated; read it and narrow it to the wanted
# columns (in the order listed above) in a single expression.
df = pd.read_csv(TSV_PATH, sep='\t')[wanted_columns]
df.head(5)

Unnamed: 0,item_id:token,title:token
0,884509,Sure-Grip Zombie Wheels Low 59mm 4 Pack
1,561856,USGI Wet Weather Bag (Fоur Paсk)
2,239749,NHL San Jose Sharks Team Logo Post Earrings
3,55030,Bont Skates - Prostar Purple Suede Professiona...
4,1277121,Team Golf Alamaba Crimson Tide Embroidered Tow...


### Generating embeddings

In [7]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm  # not used in this cell any more; kept so the name stays defined for any other cell that relies on it

# Missing titles are NaN in pandas. Replace them with an empty string before
# the cast, otherwise they would be embedded as the literal text "nan".
titles = df['title:token'].fillna('').astype('str').tolist()

model = SentenceTransformer('all-MiniLM-L6-v2')
batch_size = 512

# encode() does its own batching and progress reporting, so no manual slicing
# loop is needed. With convert_to_numpy=True it returns a single 2-D array with
# one row per title, in the same order as `titles`.
embeddings = model.encode(
    titles,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 3101/3101 [17:01<00:00,  3.04it/s]


### Reducing dimensions to 128

In [8]:
from sklearn.decomposition import PCA

# Project the sentence embeddings down to 128 components.
# random_state is fixed because, depending on the input size and the
# scikit-learn version, PCA may select a randomized SVD solver; without a seed
# the reduced embeddings (and the saved file) would differ between runs.
pca = PCA(n_components=128, random_state=42)
embeddings_128 = pca.fit_transform(embeddings)

# Fraction of the original variance retained by the 128 kept components.
print(pca.explained_variance_ratio_.sum())

0.824569816402736


### Adding embeddings to pandas dataframe

In [9]:
# This cell rebinds `df`, so after its first execution the id column is already
# called 'ent_id:token'. Accept either name so that re-running the cell does
# not fail with a KeyError on 'item_id:token'.
id_column = 'item_id:token' if 'item_id:token' in df.columns else 'ent_id:token'

# Keep only the id column and rename it without mutating the frame in place.
df = df[[id_column]].rename(columns={id_column: 'ent_id:token'})

# Serialise each reduced embedding as one space-separated string of floats.
# Rows are matched to items by position; pandas raises ValueError if the number
# of embeddings differs from the number of rows.
df['ent_emb:float_seq'] = [' '.join(map(str, vec)) for vec in embeddings_128]
df.head(5)

Unnamed: 0,ent_id:token,ent_emb:float_seq
0,884509,-0.18853039269413344 0.0677280029458144 0.0227...
1,561856,-0.10877202087920926 -0.19504738371167724 -0.2...
2,239749,0.36511265353420264 0.13893028611081343 -0.135...
3,55030,-0.04588572487050822 -0.18943352195089447 0.13...
4,1277121,0.31423742583011693 -0.005561311444394548 -0.0...


### Saving .ent file

In [10]:
# Write the frame to SAVE_PATH as tab-separated text, leaving out the row index.
df.to_csv(path_or_buf=SAVE_PATH, index=False, sep='\t')