In [1]:
# ! pip install -qq datasets==2.16.1

## Import libraries and load dataset

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModel, AutoTokenizer
import torch

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cpu')

#### Load dataset

In [3]:
# Name of the dataset to load; the notebook output shows it has
# 'train' and 'validation' splits.
DATASET_NAME = "squad"
raw_datasets = load_dataset(path=DATASET_NAME)

In [4]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [5]:
raw_datasets['train'][2]

{'id': '5733be284776f41900661180',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'The Basilica of the Sacred heart at Notre Dame is beside to which structure?',
 'answers': {'text': ['the Main Building'], 'answer_start': [279]}}

### Preprocessing

In [6]:
# Tokenisation window settings and the checkpoint used for embeddings.
MAX_LENGTH = 384
STRIDE = 128
MODEL_NAME = "distilbert-base-uncased"

# Tokenizer and encoder come from the same checkpoint so vocabularies match.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load the encoder, then move its weights to the selected device.
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)



In [7]:
def _has_answer(example):
    """Return True when the example carries at least one answer text."""
    return len(example['answers']['text']) > 0


# Keep only the examples that have at least one answer.
ds_filter = raw_datasets.filter(_has_answer)

In [8]:
def cls_pooling(model_output):
    """Return the hidden state at sequence position 0 for every batch item.

    Args:
        model_output: Object exposing a ``last_hidden_state`` attribute that
            supports ``[:, 0]`` indexing.

    Returns:
        ``last_hidden_state`` sliced at index 0 along its second axis.
    """
    hidden_states = model_output.last_hidden_state
    first_token = hidden_states[:, 0]
    return first_token

def get_embeddings(question):
    """Embed ``question`` with the module-level ``model`` via CLS pooling.

    The text is tokenised with the module-level ``tokenizer`` (padding and
    truncation enabled, PyTorch tensors returned), moved to ``device``, run
    through ``model`` and reduced with ``cls_pooling``.

    Args:
        question: Text to embed. The notebook passes a single string;
            other input forms depend on what ``tokenizer`` accepts.

    Returns:
        The tensor returned by ``cls_pooling``. For a single question the
        notebook output shows shape ``(1, 768)``. The tensor carries no
        autograd history because the forward pass runs under
        ``torch.no_grad()``.
    """
    encoded_input = tokenizer(
        question,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

    # Inference only: without no_grad every call would build and retain an
    # autograd graph, wasting memory when embedding many records in a loop.
    with torch.no_grad():
        model_output = model(**encoded_input)

    return cls_pooling(model_output)

In [9]:
# Only 'id', 'context' and 'question' are needed for retrieval, so the
# remaining columns are dropped from both splits.
columns_to_remove = ['title', 'answers']

# Work on the first 100 rows of each split to keep the demo fast.
subset_rows = range(100)
train = raw_datasets['train'].select(subset_rows).remove_columns(columns_to_remove)
val = raw_datasets['validation'].select(subset_rows).remove_columns(columns_to_remove)
train

Dataset({
    features: ['id', 'context', 'question'],
    num_rows: 100
})

In [10]:
# Sanity check: show the shape of the embedding for one question.
get_embeddings(train[0]['question']).shape

torch.Size([1, 768])

In [11]:
import chromadb
from tqdm import tqdm
import json
def add_texts_to_collection(collection: chromadb.Collection, datasets, batch_size=40000):
    """Embed each record's question and store the records in a ChromaDB collection.

    For every record the question is embedded with ``get_embeddings``. The
    record id, the embedding vector and a JSON document holding the question
    and its context are buffered, then written to ``collection`` with
    ``collection.add`` once ``batch_size`` records are buffered. Whatever is
    left is written after the loop.

    Args:
        collection: Target collection; only its ``add`` method is used.
        datasets: Sized iterable of mappings with ``'id'``, ``'question'`` and
            ``'context'`` keys.
        batch_size: Number of buffered records that triggers a write.

    Returns:
        None. A progress bar, per-record failures and a final count of the
        records actually written are printed.

    Raises:
        Any exception raised by ``collection.add``. Storage failures are not
        tied to a single record, so they are no longer swallowed and reported
        against one record id.
    """
    ids_path = []
    embeddings = []
    answers = []
    added = 0

    for text in tqdm(datasets, total=len(datasets)):
        # Build everything for this record before touching the buffers, so a
        # failure part-way through cannot leave the three lists misaligned.
        try:
            embedding = get_embeddings(text['question']).tolist()[0]
            document = json.dumps(
                {'question': text['question'], 'context': text['context']}
            )
        except Exception as e:
            # Best effort per record: report it and move on to the next one.
            print(f"Error processing in corpus {text['id']}: {e}")
            continue

        ids_path.append(text['id'])
        embeddings.append(embedding)
        answers.append(document)

        # Write the batch once enough records have been buffered.
        if len(ids_path) >= batch_size:
            added += _flush_batch(collection, ids_path, embeddings, answers)
            # Start fresh buffers rather than mutating the lists just handed over.
            ids_path, embeddings, answers = [], [], []

    # Write the final, possibly partial, batch.
    added += _flush_batch(collection, ids_path, embeddings, answers)

    # Report what was written, which may be fewer than len(datasets).
    print(f"Finished adding {added} texts to the collection.")


def _flush_batch(collection, ids, embeddings, documents):
    """Write one buffered batch to ``collection`` and return how many records it held."""
    if ids:
        collection.add(ids=ids, embeddings=embeddings, documents=documents)
    return len(ids)

In [12]:
import os
import chromadb

# Location of the persistent ChromaDB store, relative to the working directory.
db_path = r"../database/database_textqa"

# Report whether the store is already on disk. This check is informational
# only; the client below is constructed in both cases.
if os.path.exists(db_path):
    print(f"Database already exists at {db_path}. Skipping creation.")
else:
    print("Database does not exist. Creating new database...")

# Open the client on the given path.
client = chromadb.PersistentClient(path=db_path)

# Fetch the collection, creating it with cosine distance if it is missing.
collection_textqa = client.get_or_create_collection(
    name='textqa_collection',
    metadata={"hnsw:space": "cosine"}
)

# Embed and insert only when the collection holds fewer records than `train`.
# Previously every run re-embedded and re-added all rows, even right after
# reporting that the database already existed.
existing_count = collection_textqa.count()
if existing_count < len(train):
    add_texts_to_collection(collection_textqa, train)
else:
    print(f"Collection already holds {existing_count} texts. Skipping ingestion.")

Database does not exist. Creating new database...


100%|██████████| 100/100 [00:03<00:00, 27.30it/s]


Finished adding 100 texts to the collection.


In [13]:
def search(question, collection, n_results):
    """Query ``collection`` for the records nearest to ``question``.

    Args:
        question: Text to embed with ``get_embeddings``.
        collection: Object exposing a ``query`` method taking
            ``query_embeddings`` and ``n_results``.
        n_results: Number of matches to request.

    Returns:
        Whatever ``collection.query`` returns.
    """
    # The embedding comes back as a batch; take its first row as a plain list.
    question_vector = get_embeddings(question).tolist()[0]
    query_vectors = [question_vector]
    return collection.query(query_embeddings=query_vectors, n_results=n_results)

# Try the retriever on an example question and show the five nearest records.
result = search(
    question='When did Beyonce start becoming popular ?',
    collection=collection_textqa,
    n_results=5,
)
result

{'ids': [['5733b2fe4776f41900661090',
   '5733bf84d058e614000b61be',
   '5733b5344776f419006610df',
   '5733b5344776f419006610e0',
   '57338653d058e614000b5c81']],
 'distances': [[0.06992483139038086,
   0.07138818502426147,
   0.07344293594360352,
   0.07380998134613037,
   0.07437008619308472]],
 'metadatas': [[None, None, None, None, None]],
 'embeddings': None,
 'documents': [['{"question": "When did study of a germ-free-life begin at Notre Dame?", "context": "The Lobund Institute grew out of pioneering research in germ-free-life which began in 1928. This area of research originated in a question posed by Pasteur as to whether animal life was possible without bacteria. Though others had taken up this idea, their research was short lived and inconclusive. Lobund was the first research organization to answer definitively, that such life is possible and that it can be prolonged through generations. But the objective was not merely to answer Pasteur\'s question but also to produce the 