## (二) Use the Chroma database to implement a simple QA system

In [1]:
# Uncomment the line below to install or upgrade the packages this notebook uses.
# %pip install -Uq chromadb numpy datasets

In [2]:
# Load the training split of the SciQ dataset from HuggingFace and keep
# only the examples whose "support" field is non-empty.
from datasets import load_dataset

dataset = load_dataset("sciq", split="train").filter(
    lambda example: example["support"] != ""
)

# Report how many questions remain after filtering
print("Number of questions with support: ", len(dataset))

Number of questions with support:  10481


In [3]:
# Import Chroma and instantiate a client.
# The default Chroma client is ephemeral: nothing is saved to disk, so the
# stored data does not outlive the client.
import chromadb
client = chromadb.Client()

In [4]:
# Get the Chroma collection that stores the supporting evidence, creating it
# on first use. get_or_create_collection (rather than create_collection) keeps
# this cell re-runnable: create_collection fails when the name already exists.
# No embedding function is specified, so the default one is used.
collection = client.get_or_create_collection("sciq_supports")

In [5]:
# Embed and store the first NUM_SUPPORTS supports for this demo
NUM_SUPPORTS = 100
supports = dataset["support"][:NUM_SUPPORTS]

# Build ids and metadatas from the slice actually taken, so the three lists
# always have the same length even if fewer than NUM_SUPPORTS rows exist.
collection.add(
    ids=[str(i) for i in range(len(supports))],  # IDs are just strings
    documents=supports,
    metadatas=[{"type": "support"} for _ in supports],
)

In [6]:
# Read the user's query, e.g. "What is the least dangerous radioactive decay?"
# Surrounding whitespace is stripped so a blank-only entry counts as empty.
query = input("Question: ").strip()

# Keep prompting until a non-empty question is entered; previously the warning
# was printed but execution continued with an empty query.
while not query:
    print("Please enter a question. Ctrl+C to Quit.\n")
    query = input("Question: ").strip()

# Signal that the question is being processed
print("\nThinking...\n")

# Separator printed ahead of the responses
print("-----")

Question: What is the least dangerous radioactive decay?

Thinking...

-----


In [7]:
# 從資料集中查詢相應的文件
results = collection.query(
query_texts=[query], n_results=1, include=["documents", "metadatas"])

for i, R in enumerate(results['documents'][0], 1):
    print("Response", i, ":")
    print(R)
    print("-----")

Response 1 :
All radioactive decay is dangerous to living things, but alpha decay is the least dangerous.
-----
