# **Training Data**

In [1]:
!pip install pandas sentence-transformers pandarallel faiss-cpu

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting pandarallel
  Downloading pandarallel-1.6.5.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting dill>=0.3.1 (from pandarallel)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64

In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Ten short review sentences, each paired with its sentiment label.
data = [
    [sentence, sentiment]
    for sentence, sentiment in (
        ('I love the ambiance of this place!', 'positive'),
        ('The service was terrible and slow.', 'negative'),
        ('The food was absolutely delicious!', 'positive'),
        ('I wouldn’t recommend this restaurant to anyone.', 'negative'),
        ('The staff were very friendly and helpful.', 'positive'),
        ('The product quality is not worth the price.', 'negative'),
        ('I’m extremely satisfied with my purchase!', 'positive'),
        ('The packaging was damaged when it arrived.', 'negative'),
        ('This is the best experience I’ve ever had!', 'positive'),
        ('The software keeps crashing and is very frustrating.', 'negative'),
    )
]

# Labelled frame: one row per sentence, columns `text` and `category`.
df_label = pd.DataFrame(data, columns=['text', 'category'])

In [4]:
# The same ten review sentences as in the labelled frame, this time without a
# sentiment label: each row is a one-element list holding only the text.
data = [
    [sentence]
    for sentence in (
        'I love the ambiance of this place!',
        'The service was terrible and slow.',
        'The food was absolutely delicious!',
        'I wouldn’t recommend this restaurant to anyone.',
        'The staff were very friendly and helpful.',
        'The product quality is not worth the price.',
        'I’m extremely satisfied with my purchase!',
        'The packaging was damaged when it arrived.',
        'This is the best experience I’ve ever had!',
        'The software keeps crashing and is very frustrating.',
    )
]

# Unlabelled frame with a single `text` column.
df = pd.DataFrame(data, columns=['text'])

In [5]:
# Render the unlabelled frame to confirm its contents (ten rows, one `text` column).
df

Unnamed: 0,text
0,I love the ambiance of this place!
1,The service was terrible and slow.
2,The food was absolutely delicious!
3,I wouldn’t recommend this restaurant to anyone.
4,The staff were very friendly and helpful.
5,The product quality is not worth the price.
6,I’m extremely satisfied with my purchase!
7,The packaging was damaged when it arrived.
8,This is the best experience I’ve ever had!
9,The software keeps crashing and is very frustr...


In [6]:
# Load the pretrained `all-MiniLM-L6-v2` sentence-embedding model. On first use
# its files are downloaded from the Hugging Face Hub (no token is required for
# this public model, per the notice printed below).
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [7]:
def generate_embeddings(query):
    """Encode ``query`` with the notebook-level ``model``.

    The argument is passed unchanged to ``model.encode`` and whatever that
    call returns is handed straight back to the caller.
    """
    return model.encode(query)

In [8]:
# Set up pandarallel so that pandas objects gain `parallel_apply`; the embeddings
# themselves are generated in the next cell.
# NOTE(review): the original note estimated "approx 3 min to complete" for the
# embedding step - presumably measured on a larger dataset; confirm.

from pandarallel import pandarallel

# Use 8 worker processes and show a progress bar while they run.
pandarallel.initialize(progress_bar=True, nb_workers=8)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [9]:

# Encode every row of `text` with generate_embeddings, spread across the
# pandarallel workers, and store the resulting vectors in a new
# `Text_Embedding` column (this adds a column to `df` in place).
df['Text_Embedding'] = df['text'].parallel_apply(generate_embeddings)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

  self.pid = os.fork()


In [10]:
# Peek at the first rows to confirm each text now has an embedding vector.
df.head()

Unnamed: 0,text,Text_Embedding
0,I love the ambiance of this place!,"[0.031904228, 0.03573211, 0.07044881, 0.020851..."
1,The service was terrible and slow.,"[0.040091746, 0.0659518, 0.030180737, -0.00408..."
2,The food was absolutely delicious!,"[-0.055612817, 0.0837042, 0.014392052, 0.04984..."
3,I wouldn’t recommend this restaurant to anyone.,"[-0.0009020012, 0.017153656, 0.032585483, 0.04..."
4,The staff were very friendly and helpful.,"[-0.026263343, 0.027818415, 0.010420542, 0.004..."


In [11]:
import faiss                   # make faiss available

In [12]:
# K-means settings.
ncentroids = 2   # number of clusters (presumably one per sentiment label - confirm)
niter = 30       # number of k-means iterations
verbose = True   # let FAISS print training progress

# Stack the per-row vectors into one contiguous float32 matrix of shape
# (n_texts, embedding_dim), which is the layout FAISS trains on.
embedding_matrix = np.ascontiguousarray(df['Text_Embedding'].to_list(), dtype='float32')

# Take the dimensionality from the data rather than hard-coding 384, so the cell
# keeps working if a different embedding model is loaded.
d = embedding_matrix.shape[1]

kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
# Last expression of the cell: `train` returns the final objective, which the
# notebook displays.
kmeans.train(embedding_matrix)

6.070149898529053

In [13]:
# Sanity check on the fitted model: the shape of the centroid matrix, then the
# full first centroid vector, each on its own line.
print(kmeans.centroids.shape, kmeans.centroids[0], sep='\n')

(2, 384)
[-4.84317681e-03  1.24052698e-02  2.90986057e-02 -2.03665365e-02
 -3.58103812e-02 -3.23633328e-02 -3.91662344e-02  9.72917490e-03
 -1.15435887e-02 -2.88396608e-03  1.01477467e-02  4.91911173e-02
  9.52033047e-03  1.55684203e-02 -2.66565010e-02 -2.51307357e-02
  8.28880519e-02 -4.25428823e-02  1.44768124e-02  1.99353416e-03
 -2.62457058e-02 -1.11181661e-02 -1.56391952e-02  3.27581502e-02
 -2.50892453e-02 -1.33982990e-02 -2.60737389e-02  3.44852451e-04
 -1.48956701e-02 -3.67165916e-02 -1.46787036e-02  2.20413674e-02
 -4.14152537e-03 -1.26406606e-02  2.41413284e-02 -9.80912801e-03
  3.33294943e-02 -2.36754045e-02 -2.62889080e-02  2.56552435e-02
  1.45991798e-03  2.33527049e-02 -1.37302754e-02 -1.69169847e-02
  1.95294134e-02 -5.37581109e-02 -3.02131381e-03 -2.71758456e-02
  8.97781998e-02 -3.76443379e-04 -1.66797712e-02 -3.44264880e-02
  2.38980129e-02 -4.92203683e-02  1.22691980e-02  6.24720473e-03
 -3.97338569e-02  3.73760760e-02  1.76548529e-02  7.05070645e-02
  1.65554918e-02

In [14]:
# For every embedding, look up its single nearest centroid (k=1) in the index
# held by the trained k-means object. D receives the distances and I the
# matching centroid ids, one row per text.
# NOTE(review): the distance metric depends on how the index was configured -
# presumably L2 with the defaults used here; confirm.
D, I = kmeans.index.search(np.array(df['Text_Embedding'].to_list()), 1)

In [15]:
# Show the distances and the assigned centroid ids together.
D,I

(array([[0.54259324],
        [0.53530073],
        [0.46643212],
        [0.5868243 ],
        [0.71821046],
        [0.54453194],
        [0.59245384],
        [0.7319902 ],
        [0.66975343],
        [0.6820594 ]], dtype=float32),
 array([[1],
        [0],
        [1],
        [0],
        [1],
        [0],
        [1],
        [1],
        [1],
        [0]]))

In [16]:
# Display the labelled frame - presumably to compare the true sentiment of each
# text with the centroid id it was assigned above.
df_label

Unnamed: 0,text,category
0,I love the ambiance of this place!,positive
1,The service was terrible and slow.,negative
2,The food was absolutely delicious!,positive
3,I wouldn’t recommend this restaurant to anyone.,negative
4,The staff were very friendly and helpful.,positive
5,The product quality is not worth the price.,negative
6,I’m extremely satisfied with my purchase!,positive
7,The packaging was damaged when it arrived.,negative
8,This is the best experience I’ve ever had!,positive
9,The software keeps crashing and is very frustr...,negative
