In [2]:
import os
import csv
import pickle
from pathlib import Path
from sentence_transformers import SentenceTransformer

# Paths
DATA_CSV = Path("data/processed/ljp_data.csv")
EMBED_DIR = Path("data/embeddings")
EMBED_DIR.mkdir(parents=True, exist_ok=True)

# Chunk size
CHUNK_SIZE = 500  # words per chunk

In [3]:
def chunk_text(text, chunk_size=CHUNK_SIZE):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[i:i+chunk_size]))
    return chunks

In [4]:
rows = []
with open(DATA_CSV, encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        case_id = row["case_id"]
        text = row["text"]
        label = row["label"]

        chunks = chunk_text(text)
        for idx, chunk in enumerate(chunks):
            rows.append({"case_id": f"{case_id}_{idx}", "text": chunk, "label": label})

print(f"Total chunks to embed: {len(rows)}")

Total chunks to embed: 577


In [5]:
embedder = SentenceTransformer("all-mpnet-base-v2")

texts = [r["text"] for r in rows]
embeddings = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)

print("Embeddings generated successfully!")

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Embeddings generated successfully!


In [6]:
with open(EMBED_DIR / "embeddings.pkl", "wb") as f:
    pickle.dump({"embeddings": embeddings, "metadata": rows}, f)

print(f"Saved embeddings and metadata to {EMBED_DIR}")

Saved embeddings and metadata to data/embeddings
