In [1]:
import glob
import os
import shutil
import sys

import openai
import tiktoken
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Load environment variables from the .env file located by find_dotenv().
load_dotenv(find_dotenv())

# Configure the OpenAI client with the key from the environment.
# The attribute must be set on the `openai` module; assigning to the builtin
# `open` raises AttributeError and leaves the client unconfigured.
# os.environ[...] raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

In [2]:
from langchain.document_loaders import PyPDFLoader

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [4]:
# Show the paths under data/ that match the *.pdf pattern.
glob.glob("data/*.pdf")

['data/state_of_the_union.pdf',
 'data/silverman-openai-complaint.pdf',
 'data/The_Effect_of_Student_Teacher_Ratio_on_Truancy.pdf',
 'data/Question_Generation.pdf',
 'data/fec_2016_EDA.v2.pdf',
 'data/exploring-ggplot.pdf']

In [5]:
# Build one PyPDFLoader per PDF under data/.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so
# the paths are sorted to keep document order reproducible across runs and
# machines.
loaders = [PyPDFLoader(pdf) for pdf in sorted(glob.glob("data/*.pdf"))]

In [6]:
# Display the list of loader objects (one per matched PDF path).
loaders

[<langchain.document_loaders.pdf.PyPDFLoader at 0x7f048e805c00>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f048e805f30>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f045df2ccd0>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f045df2cfa0>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f045df2cee0>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f045df2ce20>]

In [7]:
# Run every loader and flatten the loaded documents into a single list,
# preserving loader order.
docs = [
    document
    for pdf_loader in loaders
    for document in pdf_loader.load()
]

In [8]:
# Total number of items collected from all loaders.
len(docs)

88

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter settings: chunk_size=1500 with chunk_overlap=150 between chunks.
splitter_settings = {"chunk_size": 1500, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [10]:
# Split the loaded documents into chunks using the splitter configured above.
splits = text_splitter.split_documents(docs)

In [11]:
# Number of chunks produced by the splitter.
len(splits)

187

In [12]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding object passed to the vector store below; constructed with defaults.
# NOTE(review): presumably picks up the OpenAI API key from the environment —
# confirm against the langchain version in use.
embedding = OpenAIEmbeddings()

In [13]:
from langchain.vectorstores import Chroma

In [14]:
# Directory (relative to the notebook's working directory) handed to Chroma
# as its persist_directory.
persist_directory = 'chroma/'

In [15]:
# Remove old database files, if any. Uses the same persist_directory that is
# passed to Chroma (instead of a second hardcoded path) and pure Python so the
# cell also works where `rm` is unavailable. ignore_errors=True mirrors
# `rm -f`: a missing directory is not an error.
shutil.rmtree(persist_directory, ignore_errors=True)

In [16]:
# Build the Chroma vector store from the chunks, embedding them with
# `embedding` and using `persist_directory` as the on-disk location.
# NOTE(review): this presumably calls the OpenAI embeddings API for every
# chunk, so re-running the cell incurs network calls and cost — confirm.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [17]:
# Sanity check: number of entries in the store; expected to match len(splits).
# NOTE(review): `_collection` is an underscore-prefixed (private by convention)
# attribute and may change between library versions.
print(vectordb._collection.count())

187


In [18]:
# Query text used for the similarity search below.
question = "Who is Sarah Silverman?"

In [19]:
# Retrieve the 10 stored chunks most similar to the question.
# NOTE(review): this rebinds `docs`, which earlier held the documents loaded
# from the PDFs. After this cell, re-running the splitting cell would split
# the search results instead — consider a distinct name for the results.
docs = vectordb.similarity_search(question,k=10)

In [20]:
# Number of search results returned (k=10 was requested).
len(docs)

10

In [21]:
# Inspect the raw text of the third search result.
docs[2].page_content

'1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28 \n 1  \nCOMPLAINT \n Plaintiffs Sarah Silverman, Christopher Golden, and Richard Kadrey (“Plainti ffs”), on behalf of \nthemselves and all others similarly situated, bring this Class Action Complaint (the “Complaint”) against Defendants OpenAI, Inc., OpenAI, L.P., Op enAI OpCo, L.L.C., Open AI GP, L.L.C., OpenAI \nStartup Fund I, L.P., OpenAI Startup Fund GP I,  L.L.C. and OpenAI Startup Fund Management, LLC \nfor direct copyright infringement, vicarious copyright in fringement, violations of section 1202(b) of the \nDigital Millennium Copyright Act, unjust enrichment, violations of the California and common law unfair competition laws, and negligence. Plainti ffs seek injunctive relief an to recover damages as a \nresult and consequence of Defendants’ unlawful conduct. \nI. OVERVIEW \n1. ChatGPT is a software product created, maintained, and sold by OpenAI.  \n2. ChatGPT is powered b

In [22]:
# Persist the vector store.
# NOTE(review): presumably flushes the collection to persist_directory —
# confirm against the Chroma/langchain version in use.
vectordb.persist()

In [23]:
# Print the metadata dict of each search result, one per line, in result order.
for result_metadata in (hit.metadata for hit in docs):
    print(result_metadata)

{'page': 2, 'source': 'data/silverman-openai-complaint.pdf'}
{'page': 0, 'source': 'data/silverman-openai-complaint.pdf'}
{'page': 1, 'source': 'data/silverman-openai-complaint.pdf'}
{'page': 16, 'source': 'data/state_of_the_union.pdf'}
{'page': 0, 'source': 'data/state_of_the_union.pdf'}
{'page': 13, 'source': 'data/state_of_the_union.pdf'}
{'page': 15, 'source': 'data/state_of_the_union.pdf'}
{'page': 12, 'source': 'data/state_of_the_union.pdf'}
{'page': 14, 'source': 'data/state_of_the_union.pdf'}
{'page': 9, 'source': 'data/state_of_the_union.pdf'}
