# Embedding

In [5]:
import pandas as pd
import numpy as np  

from sentence_transformers import SentenceTransformer, util

# Both chunk files are read as JSON Lines (one record per line, lines=True).
specs_df = pd.read_json('../2-chunking/chunks_specs.jsonl', lines=True)
descs_df = pd.read_json('../2-chunking/chunks_description.jsonl', lines=True)

# Stack spec chunks first, then description chunks, under a fresh 0..n-1 index.
all_chunks = pd.concat([specs_df, descs_df], ignore_index=True)

# Report how many chunks each source contributed.
print(
    f"Specs: {len(specs_df)}",
    f"Descs: {len(descs_df)}",
    sep="\n",
)

Specs: 4208
Descs: 411


## Daten aufbereiten

In [9]:
# Boolean mask: True where the chunk text is longer than 3 characters.
no_empty_mask = all_chunks['document'].str.len().gt(3)

# Unfiltered chunk texts, in the original row order.
documents = all_chunks['document'].to_list()

# Filtered frame, re-indexed from 0 so that row i of the frame corresponds to
# entry i of `documents_clean`.
all_chunks_clean = all_chunks.loc[no_empty_mask].reset_index(drop=True)
documents_clean = all_chunks_clean['document'].to_list()

## Embeddings generieren

In [11]:
# Location of the cached embedding matrix written by this cell.
EMBEDDINGS_PATH = './embeddings_gbert.npy'

# NOTE(review): the captured run log for this cell reports that no
# sentence-transformers model exists under this name and that a new one is
# created with mean pooling -- confirm this pooling setup is what is wanted.
# The model is loaded unconditionally so `model` stays available to later cells.
model = SentenceTransformer('deepset/gbert-large')

# Encoding is the expensive step (roughly 16 minutes in the recorded run), so
# reuse a previously saved matrix when one is present.
try:
    embeddings = np.load(EMBEDDINGS_PATH)
except FileNotFoundError:
    embeddings = None

# The cache is only trusted when it has exactly one row per cleaned document.
# This check cannot detect changed text with an unchanged row count: delete the
# .npy file to force re-encoding after editing the chunks or swapping the model.
if embeddings is None or embeddings.shape[0] != len(documents_clean):
    embeddings = model.encode(
        documents_clean,
        batch_size=16,
        show_progress_bar=True,
        normalize_embeddings=True,
        convert_to_numpy=True
    )
    np.save(EMBEDDINGS_PATH, embeddings)

assert embeddings.shape[0] == len(documents_clean), "embeddings / documents row mismatch"

No sentence-transformers model found with name deepset/gbert-large. Creating a new one with mean pooling.
Batches: 100%|██████████| 289/289 [16:35<00:00,  3.45s/it] 


In [None]:
# Sanity check on the embedding matrix: dimensions and element type.
print(
    f"Shape: {embeddings.shape}",
    f"Dtype: {embeddings.dtype}",
    sep="\n",
)

Shape: (4618, 1024)
Dtype: float32
