# Embedding

In [24]:
import random
import pandas as pd
import numpy as np 

from sentence_transformers import SentenceTransformer, util

summs_df = pd.read_json('../1-normalisation+chunking/summs_chunks.jsonl', lines=True)
descs_df = pd.read_json('../1-normalisation+chunking/descs_chunks.jsonl', lines=True)
specs_df = pd.read_json('../1-normalisation+chunking/specs_chunks.jsonl', lines=True)

all_chunks = pd.concat([summs_df, descs_df, specs_df], ignore_index=True)

## Daten bereinigen und speichern

In [22]:
all_chunks['document'] = all_chunks['document'].str.strip()

no_empty_mask = all_chunks['document'].str.len() > 10
all_chunks_clean = all_chunks[no_empty_mask]

no_duplicates = all_chunks_clean.drop_duplicates(subset=['document'])
all_chunks_clean = no_duplicates.reset_index(drop=True)

documents_clean = all_chunks_clean['document'].tolist()
all_chunks_clean.to_json('./chunks_metadata.jsonl', orient='records', lines=True)

## Embeddings generieren

In [20]:
model = SentenceTransformer('deepset/gbert-large')

embeddings = model.encode(
    documents_clean,
    batch_size = 16,
    show_progress_bar = True,
    normalize_embeddings = True,
    convert_to_numpy = True,
)

np.save('./embeddings_gbert.npy', embeddings)

Batches: 100%|██████████| 113/113 [17:40<00:00,  9.39s/it]


## Daten validieren

In [None]:
# Längenergleich
assert len(embeddings) == len(documents_clean)

# Stichproben
sample_idx = random.sample(range(len(embeddings)), int(len(embeddings) * 0.01))
for idx in sample_idx:
    text = all_chunks_clean.iloc[idx]['document']
    embd = embeddings[idx]
    norm = np.linalg.norm(embd)
    test = model.encode([text], normalize_embeddings=True)[0]
    similarity = np.dot(embd, test)

    print(f"Index: {idx}")
    print(f"Text: {text[:60]}...")
    print(f"Shape: {embd}, Norm: {norm:.4f}")
    print(f"Similarity: {similarity:.8f}")

assert not np.any(np.isnan(embeddings))
assert not np.any(np.isinf(embeddings))

print(f"Shape: {embeddings.shape}")
print(f"Dtype: {embeddings.dtype}")

Index: 48
Text: Der LKv 3910 MediLine von Liebherr ist ein hochwertiger Labo...
Shape: [ 0.00279668  0.00016322 -0.00074209 ...  0.0040718  -0.00466464
 -0.0158316 ], Norm: 1.0000
Similarity: 0.99999964
Index: 1287
Text: Innenausstattung - Anzahl Ablageflächen: 6, Belastbarkeit Ab...
Shape: [-0.00252282 -0.00125176  0.00313226 ... -0.00065846 -0.010732
 -0.02203418], Norm: 1.0000
Similarity: 0.99999994
Index: 726
Text: Die Tiefkühltruhe bietet eine präzise Temperaturregelung von...
Shape: [ 0.00481904  0.00833065  0.00581945 ... -0.00121045  0.00394187
 -0.01110826], Norm: 1.0000
Similarity: 1.00000012
Index: 1323
Text: Energie - Normalverbrauch-kwh 24h: 3.18, Wärmeabgabe-watt: 4...
Shape: [ 0.03575942  0.00403371  0.00635387 ...  0.00344834  0.00082015
 -0.02038387], Norm: 1.0000
Similarity: 0.99999994
Index: 1632
Text: Innenausstattung - Anzahl Ablageflächen: 4, Belastbarkeit Ab...
Shape: [-1.1946378e-05 -2.2963989e-03  2.3680725e-03 ...  1.3620813e-03
 -8.4223058e-03 -2.1856196e-02]