In [1]:
import json
from pathlib import Path

def get_base_dir() -> Path:
    """Return the directory this code is considered to live in.

    When ``__file__`` is defined (the code runs as a script/module), the
    resolved parent directory of that file is returned. When it is not
    defined (looking it up raises ``NameError``), the current working
    directory is returned instead.
    """
    try:
        source_file = Path(__file__)
    except NameError:
        # No ``__file__`` in this execution context: fall back to the cwd.
        return Path.cwd()
    else:
        return source_file.resolve().parent

BASE_DIR = get_base_dir().parent  # go one directory up
OUTPUTS_DIR = BASE_DIR / "outputs"

# Collect every chunk file below outputs/chunks. The paths are sorted because
# rglob yields entries in filesystem order, which can differ between runs and
# machines; a stable order keeps everything derived from `pechas` reproducible.
pecha_files = sorted((OUTPUTS_DIR / "chunks").rglob("*.json"))

# Decode explicitly as UTF-8 rather than relying on the platform's default
# locale encoding, which may not be able to decode non-ASCII JSON content.
pechas = [
    json.loads(pecha_file.read_text(encoding="utf-8"))
    for pecha_file in pecha_files
]

print(f"Number of pechas : {len(pechas)}")


Number of pechas : 268


## Embedding

In [2]:
# Use the %pip magic (not `!pip`) so the packages are installed into the
# environment of the running kernel rather than whatever `pip` the shell finds.
# NOTE(review): versions are unpinned; pin them once the working set is known.
%pip install faiss-cpu transformers torch torchvision



In [3]:
import torch
from transformers import AutoTokenizer, AutoModel

# Checkpoint shared by the tokenizer and the encoder so the two cannot drift apart.
LABSE_CHECKPOINT = "sentence-transformers/LaBSE"

# Load the tokenizer and the encoder weights for the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(LABSE_CHECKPOINT)
model = AutoModel.from_pretrained(LABSE_CHECKPOINT)


def get_bert_embedding(text):
    """Embed ``text`` with the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input handed directly to the tokenizer. Truncation and padding
            are enabled, so over-long input is cut to the tokenizer's limit.

    Returns:
        The model's ``pooler_output`` as a ``torch.Tensor`` that carries no
        autograd history. Presumably shaped ``(batch, hidden)`` — TODO confirm
        against the model configuration.
    """
    # Tokenize input for BERT
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Inference only: disable gradient tracking so no autograd graph is built
    # (and kept in memory) for every chunk that is embedded.
    with torch.no_grad():
        outputs = model(**inputs)

    # The pooled output is the model's pooler head result, not the raw [CLS]
    # hidden state. ``detach()`` is kept so the return contract is unchanged.
    pooled_output = outputs.pooler_output
    return pooled_output.detach()


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import pickle
from tqdm import tqdm

# Generate BERT embeddings for every chunk of every pecha.
# `embeddings` and `metadata` are filled in lockstep: entry k of `metadata`
# describes the chunk whose embedding is entry k of `embeddings`.
embeddings = []
metadata = []

for pecha in tqdm(pechas, desc="Generating BERT Embeddings"):
    pecha_id = pecha["pecha_id"]
    pecha_chunks = pecha["chunks"]  # mapping: volume id -> sequence of chunks

    # Single-volume pechas are labelled with the bare pecha id; multi-volume
    # pechas get one "<pecha_id>_<volume_id>" label per volume.
    is_multi_volume = len(pecha_chunks) > 1

    # Iterating the mapping directly also covers a pecha with no volumes at
    # all: the loop body simply never runs instead of raising IndexError.
    for volume_id, volume_chunks in pecha_chunks.items():
        pecha_source = f"{pecha_id}_{volume_id}" if is_multi_volume else pecha_id

        for i, chunk in enumerate(volume_chunks):
            embeddings.append(get_bert_embedding(chunk))
            metadata.append({'pecha_source': pecha_source, 'chunk_index': i})

# torch.cat cannot concatenate an empty list; fail with a clear message instead.
if not embeddings:
    raise ValueError("No chunks were found in `pechas`; there is nothing to embed.")

bert_embeddings_tensor = torch.cat(embeddings, dim=0)

# Save embeddings and metadata to pickle file
embeddings_data = {
    'embeddings': bert_embeddings_tensor.cpu().numpy(),  # stored as a NumPy array
    'metadata': metadata,
    # Plain tuple (not torch.Size) so the pickle can be loaded without torch.
    'shape': tuple(bert_embeddings_tensor.shape),
    'total_chunks': len(metadata)
}

with open('bert_embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings_data, f)

# Information about the BERT embeddings
print(f"Total number of paragraphs with embeddings: {len(metadata)}")
print(f"Shape of concatenated embeddings tensor: {bert_embeddings_tensor.shape}")
