In [1]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment;
# OPENAI_API_KEY is read from os.environ further down.
load_dotenv()

# `__file__` is not defined inside a Jupyter kernel, so referencing it
# unguarded raises NameError on a fresh "Restart & Run All". Fall back to the
# kernel's working directory (usually the notebook's own folder) in that case.
# When this code is executed as a plain script the behaviour is unchanged.
try:
    current_file_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_file_path = os.getcwd()

# The vector store is kept two directory levels above this file/notebook.
persist_directory = os.path.join(
    current_file_path, '..', '..', 'vectorstores', 'how_close_is_chatgpt_to_human_experts')

# Embedded Chroma client backed by DuckDB + Parquet files on disk.
client = chromadb.Client(Settings(
    chroma_db_impl="duckdb+parquet",
    # Optional, defaults to .chromadb/ in the current directory
    persist_directory=persist_directory
))

# Embeddings are computed by OpenAI; a missing OPENAI_API_KEY raises KeyError
# here rather than failing later at the first embedding call.
embedding_function = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-ada-002", api_key=os.environ["OPENAI_API_KEY"])

# Reuse the collection if it already exists in the persisted store,
# otherwise create it.
collection = client.get_or_create_collection(
    name="how_close_is_chatgpt_to_human_experts", embedding_function=embedding_function)


Using embedded DuckDB with persistence: data will be stored in: vectorstores/how_close_is_chatgpt_to_human_experts


In [2]:
from langchain.document_loaders import PyPDFLoader

# Location of the paper to index, relative to the working directory.
pdf_path = "./How_Close_is_ChatGPT_to_Human_Experts.pdf"

# Read the PDF and split it into document chunks.
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()

# Sanity check: show the first chunk that was produced.
first_page = pages[0]
print(first_page)

page_content='How Close is ChatGPT to Human Experts?\nComparison Corpus, Evaluation, and Detection\nBiyang Guo1y\x03, Xin Zhang2\x03, Ziyuan Wang1\x03, Minqi Jiang1\x03, Jinran Nie3\x03\nYuxuan Ding4, Jianwei Yue5, Yupeng Wu6\n1AI Lab, School of Information Management and Engineering\nShanghai University of Finance and Economics\n2Institute of Computing and Intelligence, Harbin Institute of Technology (Shenzhen)\n3School of Information Science, Beijing Language and Culture University\n4School of Electronic Engineering, Xidian University\n5School of Computing, Queen’s University,6Wind Information Co., Ltd\nAbstract\nThe introduction of ChatGPT2has garnered widespread attention in both academic\nand industrial communities. ChatGPT is able to respond effectively to a wide range\nof human questions, providing ﬂuent and comprehensive answers that signiﬁcantly\nsurpass previous public chatbots in terms of security and usefulness. On one hand,\npeople are curious about how ChatGPT is able to 

In [3]:
# Text of every chunk, paired with a stable 1-based positional string ID.
documents = [page.page_content for page in pages]
ids = [str(i) for i in range(1, len(pages) + 1)]

# The collection is persisted on disk, so on a re-run these IDs may already be
# stored. Look up which ones exist and add only the missing ones: this keeps
# the cell idempotent and avoids paying to re-embed unchanged chunks.
already_stored = set(collection.get(ids=ids)["ids"]) if ids else set()
pending = [
    (doc_id, text)
    for doc_id, text in zip(ids, documents)
    if doc_id not in already_stored
]

# Skip the call entirely when there is nothing new (also covers an empty PDF).
if pending:
    collection.add(
        documents=[text for _, text in pending],
        ids=[doc_id for doc_id, _ in pending]
    )

In [4]:
# Ask the collection for the chunks closest to a single natural-language query.
query_texts = ["What is this paper about in general?"]
n_results = 3

results = collection.query(query_texts=query_texts, n_results=n_results)

print(results)


{'ids': [['8', '22', '3']], 'embeddings': None, 'documents': [['Overall, these summarised features indicate that ChatGPT has improved notably in question-\nanswering tasks for a wide range of domains. Compared with humans, we can imagine ChatGPT\nas a conservative team of experts. As a "team", it may lack individuality but can have a more\ncomprehensive and neutral view towards questions.\n6', 'for low-resource text classiﬁcation. arXiv preprint arXiv:2209.01560 , 2022.\n[17] Songqiao Han, Xiyang Hu, Hailiang Huang, Minqi Jiang, and Yue Zhao. Adbench: Anomaly\ndetection benchmark. Advances in Neural Information Processing Systems (NeurIPS) , 2022.\n17', 'web-crawled text, books, and codes, making it able to respond to all kinds of questions. Therefore,\nwe are curious how will a human (especially an expert) and ChatGPT respond to the same question\nrespectively. Inspired by [ 1], we also want to evaluate whether ChatGPT can keep honest (not\nfabricate information or mislead the user), 

In [5]:
# Ask the client to persist its state; presumably this writes the collection to
# `persist_directory` (confirm against the installed chromadb version).
# Kept as the last expression so the call's return value is shown as the cell output.
client.persist()

True