In [1]:
import torch
def check_mps():
    if torch.backends.mps.is_available():
        print("MPS is available")
    else:
        print("MPS is not available")
        return False

    if torch.backends.mps.is_built():
        print("MPS is built")
    else:
        print("MPS is not built")
        return False
    return True

check_mps()


MPS is available
MPS is built


True

In [39]:
target_dictionary = "gene_chin" # or "stephen_li"
model_name = "all-MiniLM-L6-v2"

# Load model and move to GPU
if torch.cuda.is_available():
    device = "cuda"
elif check_mps():
    device = "mps"
else:
    print("No GPU available, using CPU")
    device = "cpu"


MPS is available
MPS is built


In [40]:
import json
from langchain_huggingface import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from tqdm import tqdm  # Import tqdm for progress bar

# Load JSON list
with open(f"../data/{target_dictionary}_dictionary.json", "r", encoding="utf-8") as f:
    data = json.load(f)  # list of dictionaries

# Create documents
docs = []
for entry in data:
    trad = entry.get("TRAD", "")
    simp = entry.get("SIMP", "")
    jyutping = entry.get("JYUTPING", "")
    penyim = entry.get("PENYIM", "")
    defn = entry.get("DEFN", "")

    content = f"Traditional: {trad}\nSimplified: {simp}\nJyutping: {jyutping}\nTaishanese (Penyim): {penyim}\nDefinition: {defn}"
    docs.append(Document(page_content=content, metadata=entry))

# Optionally split long docs
splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
split_docs = splitter.split_documents(docs)

# Your documents
documents = split_docs  # Assume this is already created

# Initialize HuggingFaceEmbeddings with the model name
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={'device': device} # Pass device as a model_kwarg
)

# Prepare text
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]

# Embed in batches (HuggingFaceEmbeddings handles batching internally)
all_embeddings = []

for i in tqdm(range(0, len(texts), 100), desc="Embedding"): # Use a reasonable batch size
    batch = texts[i:i + 100]
    embeddings = embedding_model.embed_documents(batch)
    all_embeddings.extend(embeddings)

Embedding: 100%|██████████| 819/819 [04:31<00:00,  3.01it/s]


In [42]:
# Rebuild docs (FAISS needs LangChain Documents)
reconstructed_docs = [Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts))]

# Reconstruct (text, embedding) tuples
text_embedding_pairs = list(zip(texts, all_embeddings))

# Create FAISS vector store
vectorstore = FAISS.from_embeddings(text_embedding_pairs, embedding_model)

# Save locally
vectorstore.save_local(f"{target_dictionary}_{model_name}_embeddings")

In [43]:
vectorstore = FAISS.load_local(f"{target_dictionary}_{model_name}_embeddings", embedding_model, allow_dangerous_deserialization=True)
# retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":5})

In [None]:
docs_and_scores = vectorstore.similarity_search_with_score("I am very happy", k=50)
docs_and_scores = sorted(docs_and_scores, key=lambda x: x[1], reverse=True)

print("🔍 Similarity Search Results for 'happy'")
print("=" * 50)

for i, (doc, score) in enumerate(docs_and_scores, 1):
    print(f"\n📍 Result #{i}")
    print(f"   Similarity Score: {score:.4f}")
    print(f"   {'─' * 40}")
    
    # Parse the content to extract structured information
    lines = doc.page_content.strip().split('\n')
    for line in lines:
        if ':' in line:
            key, value = line.split(':', 1)
            key = key.strip()
            value = value.strip()
            
            # Add emoji indicators for different fields
            if key == "Traditional":
                print(f"   🈶 {key}: {value}")
            elif key == "Simplified":
                print(f"   🈹 {key}: {value}")
            elif key == "Jyutping":
                print(f"   🔊 {key}: {value}")
            elif key == "Taishanese (Penyim)":
                print(f"   🎭 {key}: {value}")
            elif key == "Definition":
                print(f"   📚 {key}: {value}")
            else:
                print(f"   ➡️  {key}: {value}")
    
    if i < len(docs_and_scores):
        print(f"   {'─' * 40}")

print(f"\n✅ Found {len(docs_and_scores)} results")

🔍 Similarity Search Results for 'happy'

📍 Result #1
   Similarity Score: 1.6068
   ────────────────────────────────────────
   🈶 Traditional: [None]
   🈹 Simplified: ['笑嘻嘻']
   🔊 Jyutping: ['xël-hī-hī']
   🎭 Taishanese (Penyim): ['xiàoxīxī']
   📚 Definition: grinning; smiling broadly.⁵
   ────────────────────────────────────────

📍 Result #2
   Similarity Score: 1.6068
   ────────────────────────────────────────
   🈶 Traditional: [None]
   🈹 Simplified: ['佞幸']
   🔊 Jyutping: ['nèin-hàng']
   🎭 Taishanese (Penyim): ['nìngxìng']
   📚 Definition: flattering courtiers; favorite of king.¹¹
   ────────────────────────────────────────

📍 Result #3
   Similarity Score: 1.6056
   ────────────────────────────────────────
   🈶 Traditional: ['賠笑']
   🈹 Simplified: ['赔笑']
   🔊 Jyutping: ['põi-xël']
   🎭 Taishanese (Penyim): ['péixiào']
   📚 Definition: smile placatingly or apologetically; smile a fawning smile.⁶
   ────────────────────────────────────────

📍 Result #4
   Similarity Score: 1.6035
 