In [1]:
import pandas as pd
from tqdm.auto import tqdm,trange

In [2]:
def import_labelled_data(path="data/labelled/data.json", group_relevant=True):
    """Read the labelled dataset from a JSON file into a DataFrame.

    Args:
        path: Location of the JSON file; it is decoded as latin-1.
        group_relevant: When true, every value of the ``class`` column other
            than ``"irrelevant"`` is replaced by ``"relevant"``, turning the
            labels into a binary relevant/irrelevant scheme. The file must
            then contain a ``class`` column.

    Returns:
        The loaded DataFrame, with ``class`` collapsed when requested.
    """

    def _collapse(label):
        # Only the literal "irrelevant" label is preserved as-is.
        if label == "irrelevant":
            return label
        return "relevant"

    frame = pd.read_json(path, encoding="latin-1")
    if not group_relevant:
        return frame

    frame["class"] = frame["class"].apply(_collapse)
    return frame


# Fixed seed so the sampled subset (and everything derived from it) is the same
# on every Restart & Run All; SAMPLE_SIZE is the number of documents to keep.
SEED = 42
SAMPLE_SIZE = 250

print("Loading data...")

data = import_labelled_data(
    path="../../data/level-0.5/data.json", group_relevant=False
)

print("Data loaded.")

# `sample` already draws a uniform random subset without replacement, so a
# separate full shuffle beforehand adds nothing. The size is clamped so a file
# with fewer than SAMPLE_SIZE rows does not raise.
data = data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=SEED)

data.info()

Loading data...
Data loaded.
<class 'pandas.core.frame.DataFrame'>
Index: 250 entries, 5902 to 498
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           250 non-null    object
 1   text          250 non-null    object
 2   relevance     250 non-null    object
 3   multiclasses  250 non-null    object
dtypes: object(4)
memory usage: 9.8+ KB


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fit both bag-of-words vectorizers on the text of the sampled documents so
# their vocabularies are ready for the `transform` calls made later on.
corpus = data["text"]

count = CountVectorizer().fit(corpus)
tfidf = TfidfVectorizer().fit(corpus)
tfidf

In [None]:
from sentence_transformers import SentenceTransformer

# Model identifiers handed to SentenceTransformer. Only the active entry is
# loaded; the commented-out ones are disabled candidates kept for reference.
sources = [
    "jinaai/jina-embeddings-v2-small-en",
    # --- disabled candidates ---
    # "dunzhang/stella_en_400M_v5",
    # "avsolatorio/GIST-small-Embedding-v0",
    # "avsolatorio/GIST-Embedding-v0",
    # "Alibaba-NLP/gte-base-en-v1.5",
    # "allenai/longformer-base-4096",
]

In [None]:
# Load one SentenceTransformer per configured source and cap its sequence
# length at 4096 tokens.
embedders = []
for source in sources:
    print(f"Loading {source}...")
    # NOTE(review): trust_remote_code=True permits code shipped with the model
    # repository to run at load time -- only use with sources you trust.
    model = SentenceTransformer(source, trust_remote_code=True)
    embedders.append(model)

    # Report the model's own limit before it is capped below.
    native_length = model.max_seq_length
    print('Maximum sequence length:', native_length)
    model.max_seq_length = native_length if native_length < 4096 else 4096


Loading jinaai/jina-embeddings-v2-small-en...
Maximum sequence length: 8192


In [None]:
# Distinct (already capped) sequence lengths across the loaded embedders; the
# data is chunked once per distinct length rather than once per model.
sequence_lengths = {embedder.max_seq_length for embedder in embedders}

sequence_lengths

{4096}

In [None]:
from chunking import chunk_dataset_and_explode

# One chunked copy of the data per distinct sequence length, keyed by that
# length. Consecutive chunks overlap by 20% of the sequence length.
chunked_data = {
    length: chunk_dataset_and_explode(data, length, overlap=int(length * 0.2))
    for length in sequence_lengths
}

  0%|          | 0/250 [00:00<?, ?it/s]

In [None]:
from datasets import Dataset

# Wrap each chunked frame in a `datasets.Dataset`, keyed by sequence length.
# (Presumably each chunked_data value is a pandas DataFrame -- confirm against
# chunk_dataset_and_explode.)
datasets = {}

for sequence_length in sequence_lengths:
    chunk_frame = chunked_data[sequence_length]
    datasets[sequence_length] = Dataset.from_pandas(chunk_frame)



In [None]:
import torch

# Report the CUDA environment. The device queries are only made when CUDA is
# actually available, because they raise on a CPU-only machine.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    device_index = torch.cuda.current_device()
    print(device_index)
    # Name the device that is actually current rather than assuming index 0.
    print(torch.cuda.get_device_name(device_index))
# CUDA version PyTorch was built against (None for CPU-only builds).
print(torch.version.cuda)
# Release cached GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

True
0
NVIDIA GeForce RTX 3090
12.1


In [None]:
# Encode the chunked texts with every embedder. Results are stored in a dict
# keyed by the model object itself; `train_embeddings` is also left bound to
# the most recent result.
embeddings = {embedder: None for embedder in embedders}

for model in embedders:
    print(f"Embedding {model}")

    # Select the dataset that was chunked for this model's sequence length.
    chunk_texts = datasets[model.max_seq_length]["text"]
    encoded = model.encode(
        chunk_texts,
        show_progress_bar=True,
        batch_size=8,
        precision='float32',
    )
    embeddings[model] = encoded
    train_embeddings = encoded

Embedding SentenceTransformer(
  (0): Transformer({'max_seq_length': 4096, 'do_lower_case': False}) with Transformer model: JinaBertModel 
  (1): Pooling({'word_embedding_dimension': 512, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


Batches:   0%|          | 0/637 [00:00<?, ?it/s]

In [None]:
# Time how long each fitted vectorizer takes to transform the sampled texts.
# `%time` is an IPython line magic that reports CPU and wall-clock time for the
# single statement that follows it; the results stay bound to count_txt and
# tfidf_txt.
%time count_txt = count.transform(data["text"])
%time tfidf_txt = tfidf.transform(data["text"])

CPU times: user 3.74 s, sys: 80 ms, total: 3.82 s
Wall time: 3.79 s
CPU times: user 3.7 s, sys: 84.6 ms, total: 3.78 s
Wall time: 3.75 s
