In [9]:
import os
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader, UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.document_loaders import DirectoryLoader
#importing from keys file
import keys

# in the above code, we integrated the API key as an os environment variable. this method of defining the API key for the program is optional
# in the above libraries, Chroma, embeddings, text splitter are used. 
# embeddings are the vectors constructed from semantings in the documents that are used in order to query a result based on a similarity search. when we want to query information
# for document loaders, I am using Py PDF loader.
# chroma is a vector databse service. the vector databese in use is a chroma db one. 
# for the cloud based solution, we can either move the chroma db to a cloud platform, or use another service like oinecone or weviate. 
# chroma is highly scalable and open source and since we are using it it is perefered to keep its use. 

# this file will be used to add data to the vector databse. this includes CSVs. CSVs need to be convereted to PDFs before going into a database. 
# the CSV file notebook consists of the same code as this and can be ignored. I am keeping it around in case we want to load CSV type files into a different database directory.

OPENAI_API_KEY = keys.OPENAI_API_KEY
PINECONE_API_KEY = keys.PINECONE_API_KEY
PINECONE_API_ENV = keys.PINECONE_API_ENV


In [4]:
# we will load the pdf file of choice into the loader vaiable
# we will load the data into a data variable

#loader = PyPDFLoader('CrowdStrike2023GlobalThreatReport.pdf')
loader = PyPDFLoader('.pdf')
data = loader.load()

ValueError: File path .pdf is not a valid file or url

In [156]:
# getting a view of how long our data is

print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[0].page_content)} characters in your document')

You have 307 document(s) in your data
There are 181 characters in your document


In [157]:
# there is a limit to how much data we can store at once
# the code below splits it into sizeable chuncks that can be loaded into the database one at a time

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
print(f'now you have {len(texts)} documents')

now you have 630 documents


In [158]:
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

In [159]:
# choosing the database folder to store our vector data
persist_directory = 'db2'

In [11]:
# creating an embeddings variable to make the vectors out of the text
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [161]:
# we are creating a docsearch variable that will hold our data. it is also worth noting that in this process the vectors go into the databse we chose
# docsearch.persist will persist the data in the databse. 

docsearch = Chroma.from_texts([t.page_content for t in texts], embeddings, persist_directory=persist_directory)
docsearch.persist()
