In [None]:
import os
import json
import math
import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm

import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
# CONFIG (adjust paths)
# Input CSV read with pandas; the loading cell uses its "Sentence" and "Label" columns.
DATASET_PATH = "../../data/En-Ba-Dataset(20k_4)/dataset.csv"
# Destination CSV for the integer-quantized embeddings.
OUTPUT_PATH = "embeddings.csv"
# Destination JSON for the whitespace-token frequency table.
VOCAB_PATH = "vocabulary.json"
# Model identifier handed to AutoTokenizer / AutoModel.from_pretrained.
MODEL_NAME = "bert-base-multilingual-cased"
# NOTE(review): presumably the bit width of each stored value; 8 bits matches
# the 1..255 integer range that the normalization cell writes -- confirm.
BITS_PER_NUMBER = 8
# Use the GPU when CUDA reports one, otherwise run on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of sentences sent through the model per forward pass.
BATCH_SIZE = 32
# Passed to the tokenizer as max_length together with truncation=True.
MAX_LENGTH = 512

print("Using device:", DEVICE)

In [None]:
# ---------------- VOCAB (optional inspection only) ----------------
def build_vocab(sentences):
    """Count whitespace-separated tokens and rank them by frequency.

    Args:
        sentences: Iterable of strings. Each one is split on runs of
            whitespace; no lowercasing or punctuation handling is applied.

    Returns:
        A dict mapping each distinct token to ``{"index": rank, "count": n}``,
        where rank 0 is the most frequent token. The dict is ordered by rank,
        and tokens with equal counts keep their first-seen order.
    """
    # str.split() with no separator already ignores leading/trailing whitespace.
    token_counts = Counter(
        token for sentence in sentences for token in sentence.split()
    )
    ranked = token_counts.most_common()
    return {
        token: {"index": rank, "count": freq}
        for rank, (token, freq) in enumerate(ranked)
    }

def save_vocab(vocab, filepath):
    """Write the vocabulary to ``filepath`` as JSON and print a short preview.

    Args:
        vocab: Mapping of token -> info dict; each previewed entry must have
            a ``"count"`` key.
        filepath: Destination path. The file is written as UTF-8 with
            non-ASCII characters kept as-is (not ``\\u`` escaped).

    Side effects:
        Creates or overwrites ``filepath`` and prints a summary line followed
        by the first 20 entries (in the mapping's own order) to stdout.
    """
    with open(filepath, mode="w", encoding="utf-8") as out_file:
        json.dump(vocab, out_file, ensure_ascii=False, indent=2)
    print(f"Saved vocab ({len(vocab)} words) to {filepath}")

    # Only the leading entries are shown to keep the cell output short.
    preview = list(vocab.items())[:20]
    for word, stats in preview:
        print(f"{word}: {stats['count']}")

In [None]:
# ---------------- BERT SETUP ----------------
# Load the pretrained tokenizer and encoder identified by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
# Move the weights onto the device chosen in the config cell.
model = model.to(DEVICE)
# Put the module in evaluation mode; this notebook only runs inference.
model.eval()

In [None]:
# ---------------- LOAD DATA ----------------
df = pd.read_csv(DATASET_PATH)

# Missing sentences are read as NaN, and astype(str) alone would turn them into
# the literal text "nan", which would then be counted in the vocabulary and
# embedded as if it were real input. Replace them with empty strings instead:
# every row keeps its position (so `sentences` stays aligned with `labels`)
# without injecting a made-up word.
missing_mask = df["Sentence"].isna()
if missing_mask.any():
    print("Warning:", int(missing_mask.sum()), "rows have no sentence; using empty strings")
sentences = df["Sentence"].fillna("").astype(str).tolist()
labels = df["Label"].tolist()
print("Loaded", len(sentences), "sentences")

# Build vocab (analysis only, doesn't affect embeddings)
vocab = build_vocab(sentences)
save_vocab(vocab, VOCAB_PATH)

In [None]:
# ---------------- EMBEDDING ----------------
def embed_batch(texts):
    """Embed a batch of texts with the module-level ``tokenizer`` and ``model``.

    Args:
        texts: List of strings to encode together. They are padded to the
            longest item in the batch and truncated to ``MAX_LENGTH`` tokens.

    Returns:
        A 2-D NumPy array with one row per input text, holding the final-layer
        hidden state at sequence position 0 (the original code names this the
        CLS embedding).
    """
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )
    # The model and its inputs must live on the same device.
    on_device = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**on_device)
        first_token = outputs.last_hidden_state[:, 0, :]
    return first_token.cpu().numpy()


# Walk the sentence list in BATCH_SIZE-sized slices; each slice is stripped of
# surrounding whitespace (no other preprocessing) and embedded in one pass.
batch_starts = range(0, len(sentences), BATCH_SIZE)
all_embs = [
    embed_batch([text.strip() for text in sentences[start : start + BATCH_SIZE]])
    for start in tqdm(batch_starts, desc="Embedding")
]

# Stack the per-batch arrays into one (num_sentences, hidden_size) matrix.
embedded_arr = np.vstack(all_embs)
print("Embedding shape:", embedded_arr.shape)

In [None]:
# ---------------- NORMALIZATION ----------------
# Quantize each embedding dimension independently onto the integers
# QUANT_MIN..QUANT_MAX. The upper bound comes from BITS_PER_NUMBER in the
# config cell (8 bits -> 255) instead of being hard-coded; 0 is never produced.
# Results are stored as int32, so BITS_PER_NUMBER must stay below 32.
QUANT_MIN = 1
QUANT_MAX = 2 ** BITS_PER_NUMBER - 1
quant_span = float(QUANT_MAX - QUANT_MIN)

col_min = embedded_arr.min(axis=0)
col_max = embedded_arr.max(axis=0)
col_range = col_max - col_min
# A constant column has zero range; dividing by 1 instead maps it to QUANT_MIN
# rather than producing NaN/inf.
scale = np.where(col_range == 0, 1, col_range)

# Per-column min-max scaling: column minimum -> QUANT_MIN, maximum -> QUANT_MAX.
normalized = QUANT_MIN + (embedded_arr - col_min) * (quant_span / scale)
normalized = np.round(normalized).astype(np.int32)

print("Numeric value range:", normalized.min(), normalized.max())
# Sanity check: also trips if non-finite embeddings slipped through.
assert QUANT_MIN <= normalized.min() and normalized.max() <= QUANT_MAX

# Columns are written with default integer headers and no index column.
num_df = pd.DataFrame(normalized)
num_df.to_csv(OUTPUT_PATH, index=False)
print(f"Saved numeric embeddings to {OUTPUT_PATH}")