In [12]:
import tensorflow as tf

In [13]:
# Sanity check: show the devices TensorFlow reports for this kernel.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [14]:
import os
import openai
import sys
import glob
import tiktoken
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load its variables; the return
# value is bound to `_` so the cell does not echo it.
_ = load_dotenv(find_dotenv())

# Set the key on the `openai` module. It must not be assigned to the builtin
# `open`, which does not accept new attributes and would raise AttributeError.
# os.environ[...] raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

In [15]:
from langchain.document_loaders import PyPDFLoader

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [17]:
# Preview which files match the data/*.pdf pattern before loading them.
glob.glob("data/*.pdf")

['data/state_of_the_union.pdf',
 'data/silverman-openai-complaint.pdf',
 'data/The_Effect_of_Student_Teacher_Ratio_on_Truancy.pdf',
 'data/Question_Generation.pdf',
 'data/fec_2016_EDA.v2.pdf',
 'data/exploring-ggplot.pdf']

In [18]:
# One PyPDFLoader per PDF under data/. glob.glob() returns matches in
# arbitrary (filesystem-dependent) order, so the paths are sorted to keep the
# load order reproducible across runs and machines.
loaders = [PyPDFLoader(pdf) for pdf in sorted(glob.glob("data/*.pdf"))]

In [19]:
# Display the loader objects (one per matched PDF path).
loaders

[<langchain.document_loaders.pdf.PyPDFLoader at 0x7f8a1ff5fb80>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f8bc3d2fee0>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f8a1fd9ab90>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f8a1fd9ae60>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f8a1fd9ada0>,
 <langchain.document_loaders.pdf.PyPDFLoader at 0x7f8a1fd9ace0>]

In [20]:
# Run every loader and flatten the per-loader results into a single list,
# preserving loader order and the order of items within each loader.
docs = [item for pdf_loader in loaders for item in pdf_loader.load()]

In [21]:
# Total number of items collected from all loaders.
len(docs)

88

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter configuration: chunk_size=1500 with chunk_overlap=150.
# NOTE(review): units are presumably characters (default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [23]:
# Split the loaded documents into chunks using the splitter configured above.
splits = text_splitter.split_documents(docs)

In [24]:
# Number of chunks produced by the splitter.
len(splits)

187

In [25]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding object built with default arguments; it is handed to Chroma below.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
embedding = OpenAIEmbeddings()

In [26]:
from langchain.vectorstores import Chroma

In [27]:
# Directory passed to Chroma.from_documents as persist_directory.
persist_directory = 'chroma/'

In [28]:
# Remove old database files, if any, so the vector store is rebuilt from
# scratch. This uses persist_directory rather than a hard-coded shell path so
# the two cannot drift apart, and it works on platforms without `rm`.
# ignore_errors=True makes a missing directory a no-op, like `rm -rf`.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)

In [29]:
# Build the Chroma vector store from the chunks, embedding them with
# `embedding` and pointing the store at `persist_directory`.
vectordb = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=embedding,
    documents=splits,
)

In [30]:
# Print how many entries the store holds, to compare against len(splits).
# NOTE(review): `_collection` is underscore-prefixed, so presumably not public
# API — it may change between library versions.
print(vectordb._collection.count())

187


In [31]:
# Query text used for the similarity search below.
question = "Who is Sarah Silverman?"

In [32]:
# Retrieve the 10 stored chunks returned by similarity_search for `question`.
# NOTE(review): this rebinds `docs`, which earlier held the loaded PDF items
# that were passed to text_splitter.split_documents. Re-running the splitting
# cell after this one would split the search hits instead of the source
# documents — consider a distinct name for the search results.
docs = vectordb.similarity_search(question,k=10)

In [33]:
# Number of hits returned by the search (k=10 was requested).
len(docs)

10

In [34]:
# Inspect the raw text of the third hit (index 2).
docs[2].page_content

'1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28 \n 1  \nCOMPLAINT \n Plaintiffs Sarah Silverman, Christopher Golden, and Richard Kadrey (“Plainti ffs”), on behalf of \nthemselves and all others similarly situated, bring this Class Action Complaint (the “Complaint”) against Defendants OpenAI, Inc., OpenAI, L.P., Op enAI OpCo, L.L.C., Open AI GP, L.L.C., OpenAI \nStartup Fund I, L.P., OpenAI Startup Fund GP I,  L.L.C. and OpenAI Startup Fund Management, LLC \nfor direct copyright infringement, vicarious copyright in fringement, violations of section 1202(b) of the \nDigital Millennium Copyright Act, unjust enrichment, violations of the California and common law unfair competition laws, and negligence. Plainti ffs seek injunctive relief an to recover damages as a \nresult and consequence of Defendants’ unlawful conduct. \nI. OVERVIEW \n1. ChatGPT is a software product created, maintained, and sold by OpenAI.  \n2. ChatGPT is powered b

In [35]:
# Ask the vector store to persist itself.
# NOTE(review): presumably writes to persist_directory — confirm against the
# installed langchain/Chroma version.
vectordb.persist()

In [36]:
# Print each hit's metadata, one per line, to see what each hit's metadata
# records about its origin.
for doc in docs:
    print(doc.metadata)

{'page': 2, 'source': 'data/silverman-openai-complaint.pdf'}
{'page': 0, 'source': 'data/silverman-openai-complaint.pdf'}
{'page': 1, 'source': 'data/silverman-openai-complaint.pdf'}
{'page': 16, 'source': 'data/state_of_the_union.pdf'}
{'page': 0, 'source': 'data/state_of_the_union.pdf'}
{'page': 13, 'source': 'data/state_of_the_union.pdf'}
{'page': 15, 'source': 'data/state_of_the_union.pdf'}
{'page': 12, 'source': 'data/state_of_the_union.pdf'}
{'page': 14, 'source': 'data/state_of_the_union.pdf'}
{'page': 9, 'source': 'data/state_of_the_union.pdf'}
