## Imports

In [1]:
! pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [2]:
import spacy
import torch
import tqdm
import re
import json
import nltk
import pandas as pd

from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import quantize_embeddings
from sentence_transformers.util import cos_sim

from scipy.spatial.distance import cosine

from nltk.corpus import wordnet as wn
from tqdm import tqdm

# Shared spaCy pipeline: the helper functions below call `nlp(text)` to tokenise
# sentences and read each token's `tag_`, `pos_` and `dep_` attributes.
nlp = spacy.load('en_core_web_sm')

2024-06-14 16:29:21.155488: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-14 16:29:21.155600: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-14 16:29:21.308525: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/ 

Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/wordnet/noun.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/verb.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/README  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.sense  
  inflating: /usr/share/nltk_data

## Helper Functions

In [4]:
def check_if_noun(sentence):
    """Collect the singular and plural nouns found in ``sentence``.

    The sentence is lower-cased, a fixed set of quantifier / positional
    expressions is removed, and whitespace is normalised before the text is
    passed to the shared spaCy pipeline ``nlp``.

    Args:
        sentence: Raw sentence text.

    Returns:
        A ``(singular, plural, txt)`` tuple: the texts of tokens tagged ``NN``,
        the texts of tokens tagged ``NNS`` (single-character tokens are skipped
        in both lists), and the cleaned sentence that was actually parsed.
    """
    singular = []
    plural = []
    # expressions containing nouns that should not be substituted
    exceptions = ['a group of', 'a couple of', 'group of', 'couple of', 'many',
                  'several', 'a lot of', 'lots of', 'others', 'other']
    expressions_to_pass = ['in front of', 'next to']
    txt = sentence.lower()
    for e in (exceptions + expressions_to_pass):
        # Remove whole-word occurrences only: a plain substring replace would
        # also mangle words such as "mother" ("other") or "germany" ("many").
        txt = re.sub(r'\b' + re.escape(e) + r'\b', '', txt)

    txt = ' '.join(txt.split())      # remove multiple whitespaces
    doc = nlp(txt)
    for token in doc:
        if len(token) > 1:
            if token.tag_ == 'NN':
                singular.append(token.text)
            elif token.tag_ == 'NNS':
                plural.append(token.text)
    return singular, plural, txt

In [5]:
def check_if_verb(txt):
    """Group the non-auxiliary verbs of ``txt`` by their fine-grained tag.

    The text is lower-cased and stripped before being parsed with the shared
    spaCy pipeline ``nlp``. Tokens whose dependency label is ``aux`` are
    ignored.

    Returns:
        ``(vbp, vbg, vb, txt)``: token texts tagged ``VBP``, ``VBG`` and ``VB``
        respectively, followed by the normalised text that was parsed.
    """
    txt = txt.lower().strip()
    by_tag = {'VBP': [], 'VBG': [], 'VB': []}
    for token in nlp(txt):
        if token.dep_ == 'aux':         # except auxiliary verbs
            continue
        bucket = by_tag.get(token.tag_)
        if bucket is not None:
            bucket.append(token.text)

    return by_tag['VBP'], by_tag['VBG'], by_tag['VB'], txt

In [6]:
def check_if_attribute(sentence):
    """Return the adjectives of ``sentence`` together with the parsed text.

    The sentence is lower-cased and its whitespace collapsed to single spaces
    before it is parsed with the shared spaCy pipeline ``nlp``.

    Returns:
        ``(attr_list, txt)``: the texts of all tokens whose coarse part of
        speech is ``ADJ``, and the normalised sentence.
    """
    txt = ' '.join(sentence.lower().split())      # remove multiple whitespaces
    attr_list = [token.text for token in nlp(txt) if token.pos_ == 'ADJ']
    return attr_list, txt

In [7]:
def get_synsets(syn, pos, return_index=False):
    """Look up the first WordNet synset of every word in ``syn``.

    Args:
        syn: Sequence of words to look up.
        pos: Part-of-speech name or ``None``; only its first character is
            passed to ``wn.synsets`` as the ``pos`` filter. ``None`` (or an
            empty string) applies no filter.
        return_index: When true, also return the positions in ``syn`` of the
            words for which a synset was found.

    Returns:
        ``(all_syn, d)`` or, with ``return_index=True``, ``(all_syn, d, indices)``.
        ``all_syn`` holds the first synset of each word that has one, ``d``
        maps each such synset to the word that produced it (the last word wins
        when several words share a synset).
    """
    all_syn = []
    indices = []
    d = dict()
    # `pos[0]` on an empty string would raise IndexError; treat it like None.
    p = pos[0] if pos else None
    for idx, word in enumerate(syn):
        # Query WordNet once per word and reuse the result.
        candidates = wn.synsets(word, pos=p)
        if candidates:
            s = candidates[0]
            all_syn.append(s)
            d[s] = word
            indices.append(idx)
    if return_index:
        return all_syn, d, indices
    else:
        return all_syn, d

In [8]:
def get_antonym(given_word):
    """Return the distinct antonyms WordNet lists for ``given_word``.

    For every lemma of every synset of the word, the lemma's first antonym is
    kept when it belongs to a synset with the same part of speech as the
    lemma's own synset.

    Returns:
        A list of antonym lemma names without duplicates, in the order in
        which they were first encountered (so the result is reproducible
        between runs).
    """
    antonyms = []
    for syn in wn.synsets(given_word):
        for lem in syn.lemmas():
            lemma_antonyms = lem.antonyms()   # look up once per lemma
            if not lemma_antonyms:
                continue
            first = lemma_antonyms[0]
            if first.synset().pos() == lem.synset().pos():
                antonyms.append(first.name())
    # dict.fromkeys removes duplicates while keeping first-seen order
    return list(dict.fromkeys(antonyms))

In [9]:
def get_antonym_list(words):
    """Concatenate, in input order, the antonyms ``get_antonym`` finds for each word."""
    return [antonym for word in words for antonym in get_antonym(word)]

In [10]:
def create_attributes_list(sentences):
    """Build the list of distinct adjectives found in ``sentences``.

    Each sentence is analysed with ``check_if_attribute``; literal ``\\n``
    sequences and stray backslashes are stripped from every adjective.

    Returns:
        The cleaned adjectives without duplicates, in order of first
        appearance. Cleaning happens before de-duplication, so two raw tokens
        that clean to the same word are reported once.
    """
    attributes = []
    seen = set()
    for s in sentences:
        found, _ = check_if_attribute(s)
        for word in found:
            word = word.replace('\\n', '').replace('\\', '')
            if word not in seen:
                seen.add(word)
                attributes.append(word)

    return attributes

In [11]:
def create_word_list(sentences):
    """Tokenise every sentence and return all token texts as one flat list.

    Each sentence is lower-cased and its whitespace collapsed to single spaces
    before it is passed to the shared spaCy pipeline ``nlp``. Duplicates are
    kept and tokens appear in reading order.
    """
    return [
        token.text
        for sentence in sentences
        for token in nlp(' '.join(sentence.lower().split()))   # remove multiple whitespaces
    ]
        

In [12]:
def create_verb_list(sentences):
    """Build the list of verbs found in ``sentences``.

    Each sentence is analysed with ``check_if_verb``. Literal ``\\n``
    sequences and stray backslashes are stripped from every verb, and
    duplicates are removed within each tag group.

    Returns:
        The cleaned ``VBP`` verbs, followed by the ``VBG`` verbs, followed by
        the ``VB`` verbs. Each group is free of duplicates and ordered by
        first appearance; a word may still occur in more than one group.
    """
    groups = ([], [], [])                # VBP, VBG, VB
    seen = (set(), set(), set())

    for s in sentences:
        vbp, vbg, vb, _ = check_if_verb(s)
        for found, group, group_seen in zip((vbp, vbg, vb), groups, seen):
            for word in found:
                # clean first, then de-duplicate, so that tokens which only
                # differ by a stripped backslash sequence collapse to one entry
                word = word.replace('\\n', '').replace('\\', '')
                if word not in group_seen:
                    group_seen.add(word)
                    group.append(word)

    return groups[0] + groups[1] + groups[2]

In [13]:
def create_singular_list(sentences):
    """Build the list of distinct singular nouns found in ``sentences``.

    Each sentence is analysed with ``check_if_noun``; only its singular-noun
    result is used. Literal ``\\n`` sequences and stray backslashes are
    stripped from every noun.

    Returns:
        The cleaned singular nouns without duplicates, in order of first
        appearance. Cleaning happens before de-duplication, so two raw tokens
        that clean to the same word are reported once.
    """
    singulars = []
    seen = set()

    for s in sentences:
        singular, _, _ = check_if_noun(s)
        for word in singular:
            word = word.replace('\\n', '').replace('\\', '')
            if word not in seen:
                seen.add(word)
                singulars.append(word)

    return singulars

In [14]:
def load_words(data_path, data_col, antonyms=False, row_limit=1000):
    """Collect the vocabulary of a CSV text column that WordNet knows about.

    Args:
        data_path: Path of the CSV file to read.
        data_col: Name of the column holding the sentences.
        antonyms: When true, the antonyms of the tokens (``get_antonym_list``)
            are added to the vocabulary as well.
        row_limit: Number of leading rows to use (``DataFrame.head`` semantics).

    Returns:
        A sorted list of distinct words: the values of the synset-to-word
        mappings that ``get_synsets`` builds for the tokens and, if requested,
        for their antonyms.
    """
    # Only parse as many rows as are needed instead of the whole file; a
    # negative limit keeps its `head` meaning, so everything is read then.
    nrows = row_limit if (row_limit is None or row_limit >= 0) else None
    df = pd.read_csv(data_path, nrows=nrows)[[data_col]].head(row_limit)

    # Empty cells are read as NaN (a float) and cannot be tokenised: skip them.
    sentences = df[data_col].dropna().tolist()

    tokens = create_word_list(sentences)

    # get_synsets maps each found synset to the word that produced it
    _, d0 = get_synsets(tokens, pos=None)
    words = set(d0.values())

    if antonyms:
        _, d1 = get_synsets(get_antonym_list(tokens), pos=None)
        words.update(d1.values())

    # sorted -> the same word order on every run, independent of string hashing
    return sorted(words)
    

In [18]:
def generate_encodings(model_path, word_list, embedding_dim=None):
    """Embed every word of ``word_list`` with a SentenceTransformer model.

    Args:
        model_path: Model identifier or path handed to ``SentenceTransformer``.
        word_list: Words to encode.
        embedding_dim: Forwarded as ``truncate_dim``.

    Returns:
        A dict mapping each word to its embedding converted with ``tolist()``.
    """
    # NOTE(review): trust_remote_code=True lets the model repository run its
    # own Python code while loading.
    encoder = SentenceTransformer(
        model_path,
        truncate_dim=embedding_dim,
        trust_remote_code=True,
    )

    # one embedding per input word, in input order
    vectors = encoder.encode(word_list)

    return {word: vectors[position].tolist() for position, word in enumerate(word_list)}
    

## Main Code

In [16]:
# initial parameters
# CSV files to process; data_cols[i] is the name of the text column that is
# read from data_paths[i] (both lists are indexed together by the driver loop).
data_paths = ['/kaggle/input/imdb-reviews/imdb_reviews.csv', '/kaggle/input/newsgroups/newsgroups.csv']
data_cols = ['Source_Sentences', 'text']
# (model identifier, embedding dimension) pairs; the second element is handed
# to generate_encodings as `embedding_dim`, which forwards it as `truncate_dim`.
models = [("mixedbread-ai/mxbai-embed-large-v1", 512), 
         ("avsolatorio/GIST-large-Embedding-v0", None),
         ("jinaai/jina-embeddings-v2-base-en", None),
         ("Labib11/MUG-B-1.6", None)
         ]

In [21]:
# generate and store embeddings using the different models on each dataset
for i in range(2):
    antonyms = not i
    words = load_words(data_paths[i], data_cols[i], antonyms=antonyms)
    
    for model, dim in models:
        # print general_info
        print(f"Model: {model}\nDataset: {data_paths[i]}\nAntonyms: {antonyms}\n")
        
        # get the encodings
        embed_map = generate_encodings(model, words, dim)
        
        # store them in appropirate json file
        json_filename = model.split('/')[-1].split('-')[0] + '_embeddings_' + ('imdb' if i==0 else 'newsgroups')
        with open(f'/kaggle/working/{json_filename}.json', 'w') as f:
            json.dump(embed_map, f)

Model: mixedbread-ai/mxbai-embed-large-v1
Dataset: /kaggle/input/imdb-reviews/imdb_reviews.csv
Antonyms: True



Batches:   0%|          | 0/315 [00:00<?, ?it/s]

Model: avsolatorio/GIST-large-Embedding-v0
Dataset: /kaggle/input/imdb-reviews/imdb_reviews.csv
Antonyms: True



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/68.1k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/754 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Batches:   0%|          | 0/315 [00:00<?, ?it/s]

Model: jinaai/jina-embeddings-v2-base-en
Dataset: /kaggle/input/imdb-reviews/imdb_reviews.csv
Antonyms: True



modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/71.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

configuration_bert.py:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_bert.py:   0%|          | 0.00/97.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- modeling_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/275M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/373 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/315 [00:00<?, ?it/s]

Model: Labib11/MUG-B-1.6
Dataset: /kaggle/input/imdb-reviews/imdb_reviews.csv
Antonyms: True



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/126k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Batches:   0%|          | 0/315 [00:00<?, ?it/s]

Model: mixedbread-ai/mxbai-embed-large-v1
Dataset: /kaggle/input/newsgroups/newsgroups.csv
Antonyms: False



Batches:   0%|          | 0/239 [00:00<?, ?it/s]

Model: avsolatorio/GIST-large-Embedding-v0
Dataset: /kaggle/input/newsgroups/newsgroups.csv
Antonyms: False



Batches:   0%|          | 0/239 [00:00<?, ?it/s]

Model: jinaai/jina-embeddings-v2-base-en
Dataset: /kaggle/input/newsgroups/newsgroups.csv
Antonyms: False



Batches:   0%|          | 0/239 [00:00<?, ?it/s]

Model: Labib11/MUG-B-1.6
Dataset: /kaggle/input/newsgroups/newsgroups.csv
Antonyms: False



Batches:   0%|          | 0/239 [00:00<?, ?it/s]