In [5]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm
import ast
import re

import faiss
from uuid import uuid4

from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document

# ======= config path =======
BASE_DIR = '/project/lt200304-dipmt/paweekorn'
MODEL_PATH = f"{BASE_DIR}/models/retriever/bge-m3"
RESULT_DIR = f"{BASE_DIR}/vector/th2en/{os.path.basename(MODEL_PATH)}"

## Overview

In [2]:
def clean_parenthesis(text):
    text = text.replace('[', '(')
    text = text.replace(']', ')')
    return text

def clean_thai_spacing(text):
    filtered = re.findall(r'[^\u0E00-\u0E7F]', text)
    if all([x.isspace() for x in filtered]):
        text = re.sub(r'\s', '', text)
    return text

In [3]:
test_df = pd.read_csv(f"{BASE_DIR}/data/DS01/test_v1.csv")

unique_df = pd.read_csv(f"{BASE_DIR}/data/unique_no_test.csv")
unique_df['ENG'] = unique_df['ENG'].apply(clean_parenthesis)
unique_df['THA'] = unique_df['THA'].apply(clean_parenthesis)
unique_df['NAME'] = unique_df['NAME'].apply(lambda x: ast.literal_eval(x)[0])

unique_df = unique_df.drop_duplicates('ENG')
unique_df = unique_df[ unique_df['ENG'].apply(lambda x: x not in test_df['ENG'].tolist()) ]

print(unique_df.shape)
unique_df.head()

(191751, 3)


Unnamed: 0,ENG,THA,NAME
0,"(Animal) skin, pelt","(สัตว์) หนัง, ขนสัตว์",18
1,(IaaS) infrastructure a a service,ให้บริการโครงสร้างพื้นฐานด้านไอที (ไอเอเอเอส),42
2,(abrasive preparation) soap,(สารที่เตรียมขึ้นใช้ขัด) สบู่,3
3,all good of textile,สินค้าทั้งหมดที่ทำจากสิ่งทอ,24
4,(audio-video) disc,(เสียง-วีดีโอ) แผ่นดิสก์,9


In [4]:
documents = []
for _, row in tqdm(unique_df.iterrows()):
    content = row['THA']
    meta = {'english': row['ENG'], 'wipo': row['NAME']}
    documents.append(Document(page_content=content, metadata=meta))

uuids = [str(uuid4()) for _ in range(len(documents))]
documents[0]

191751it [00:06, 27550.35it/s]


Document(metadata={'english': '(Animal) skin, pelt', 'wipo': '18'}, page_content='(สัตว์) หนัง, ขนสัตว์')

## vectorstore

In [6]:
embedding_model = HuggingFaceEmbeddings(model_name=MODEL_PATH)
d_model = len(embedding_model.embed_query('Hello World'))
d_model

1024

In [7]:
res = faiss.StandardGpuResources()

index_flat = faiss.IndexFlatL2(d_model)
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)

# make it into a gpu index
vector_store = FAISS(
    embedding_function=embedding_model,
    index=gpu_index_flat,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
doc_ids = vector_store.add_documents(documents=documents, ids=uuids)
print(doc_ids[:5])

['59726bf8-615b-4a08-9847-c68deed02e82', '152e6030-7fc9-414d-9d52-3e033f97c4cb', '7b4f7330-7f45-4669-bcb1-cb9fb2202417', 'e9dd7966-c0e7-470a-9739-bcddae35caf0', '797d706e-f05d-42df-902d-45a69e30bb5d']


In [8]:
# Move the index to CPU before saving
os.makedirs(f"{RESULT_DIR}", exist_ok=True)
cpu_index = faiss.index_gpu_to_cpu(vector_store.index)
faiss.write_index(cpu_index, f"{RESULT_DIR}/index.faiss")

# Re-create the FAISS vector store from the saved index, docstore, and index_to_docstore_id
vector_store_loaded = FAISS(
    embedding_function=embedding_model,
    index=cpu_index,
    docstore=vector_store.docstore,
    index_to_docstore_id=vector_store.index_to_docstore_id,
)

# Now you can save the other components using save_local
vector_store_loaded.save_local(RESULT_DIR)

## Demo

In [10]:
def get_relevant_docs(query, k=3):
    docs = vector_store.similarity_search(query, k=k)

    relevant = ""
    for i, doc in enumerate(docs[1:]):
        relevant += f'''THAI: {doc.page_content}
ENG: {doc.metadata['english']}
\n'''

    return relevant

sample = "ปลั๊ก"
print(f"Source: {sample}\n")

print("## Retrieved References")
print(get_relevant_docs(sample))

Source: ปลั๊ก

## Retrieved References
THAI: ปลั๊ก
ENG: plug

THAI: ปลั๊กไฟ
ENG: electrical plug


