In [None]:
from dotenv import load_dotenv
# Load environment variables (such as the OPENAI_API_KEY read in the next cell)
# from a local .env file, if one is present.
load_dotenv()

In [11]:
import os

# OpenAI API key taken from the environment; None when the variable is unset.
KEY = os.environ.get("OPENAI_API_KEY")

In [14]:
import shutil

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.llms import OpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

In [23]:
# Loader for the files in new_articles/ that match *.txt; each file is read
# with TextLoader using UTF-8 decoding.
loader = DirectoryLoader(
    "new_articles",
    glob="*.txt",
    loader_cls=TextLoader,
    loader_kwargs=dict(encoding="utf-8"),
)

In [24]:
# Read all matched files; the result is the collection of loaded documents
# that the splitter consumes below.
document=loader.load()

In [None]:
# Preview only the first couple of loaded documents; echoing the whole list
# would dump the full text of every article into the notebook output.
document[:2]

In [26]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks (chunk_size=1000) that overlap their
# neighbours (chunk_overlap=200), so context is not lost at chunk boundaries.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
text = text_splitter.split_documents(document)

In [None]:
# Inspect the raw text of the first chunk.
text[0].page_content

In [None]:
# Inspect the second chunk; with chunk_overlap set, its start may repeat the
# tail of the first chunk.
text[1].page_content

In [None]:
# Total number of chunks produced by the splitter.
len(text)

# Creating DB

In [31]:
from langchain import embeddings

In [32]:
# Directory where the Chroma collection is written and later reopened from.
persist_directory = "db"

In [None]:
# Embedding model shared by the index build and the reopened store below.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
embedding = OpenAIEmbeddings()

In [None]:
# Embed every chunk and build a Chroma collection stored under persist_directory.
vectordb = Chroma.from_documents(
    documents=text,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Persist the DB to disk.
# NOTE(review): newer Chroma releases reportedly persist automatically, which
# would make this call redundant — confirm against the installed version.
vectordb.persist()

In [None]:
# Drop the in-memory handle so the next cell demonstrates a reload from disk.
vectordb = None

In [None]:
# Reopen the collection saved under persist_directory and use it as normal.
# The same embedding object is supplied as the store's embedding function.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Sanity check: display the reopened vector store object.
vectordb

### Make a retriever

In [None]:
# Wrap the vector store in a retriever using its default search settings.
retriever = vectordb.as_retriever()

In [None]:
# Fetch the chunks the retriever considers relevant to the question.
# `invoke` is the current retriever entry point; `get_relevant_documents`
# is deprecated in LangChain.
docs = retriever.invoke("How much money did Microsoft raise?")

In [None]:
# Show the text of the first returned document.
print(docs[0].page_content)

In [None]:
# Number of documents returned with the default retriever settings.
len(docs)

In [None]:
# Rebuild the retriever with k=2 in its search settings (presumably limiting
# each query to two results — checked via len(docs2) below). This rebinds
# `retriever`, so later cells use this configuration.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [None]:
# Confirm the search settings stored on the retriever.
retriever.search_kwargs

In [None]:
# Same question against the k=2 retriever. `invoke` is the current retriever
# entry point; `get_relevant_documents` is deprecated in LangChain.
docs2 = retriever.invoke("How much money did Microsoft raise?")

In [None]:
# Number of documents returned by the k=2 retriever.
len(docs2)

In [None]:
# Inspect the returned documents.
docs2

### Make a Chain

In [None]:
from langchain.chains import RetrievalQA

In [None]:
# OpenAI LLM client created with its default settings.
llm=OpenAI()

In [None]:
# Display the LLM object to review its configuration.
llm

In [None]:
# Create the chain that answers questions over the retriever.
# chain_type="stuff" puts the retrieved chunks into a single prompt, and
# return_source_documents=True keeps those chunks in the response so they can
# be cited. Reuses the `llm` created above instead of constructing a second,
# identical OpenAI client.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
## Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each supporting document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer text) and a
            'source_documents' entry (an iterable of documents, each exposing
            a ``metadata`` mapping).

    Returns:
        None. Output is written to stdout.

    Documents whose metadata has no 'source' entry are listed as
    '<unknown source>' instead of raising ``KeyError``.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        # .get keeps one document with incomplete metadata from aborting the listing.
        print(source.metadata.get('source', '<unknown source>'))

In [None]:
# Full example: the question to run through the QA chain.
query = "How much money did Microsoft raise?"

In [None]:
# Run the chain on the question. `invoke` replaces calling the chain object
# directly, which LangChain has deprecated; RetrievalQA takes the question
# under the "query" key, and the response still carries 'result' and
# 'source_documents'.
llm_response = qa_chain.invoke({"query": query})

In [None]:
# Inspect the raw response returned by the chain.
llm_response

In [None]:
# Print the answer followed by the source of each supporting document.
process_llm_response(llm_response)

### Delete the DB

In [None]:
# Archive the persisted database to db.zip before it is deleted. Uses the
# standard library rather than the `zip` shell command, so the cell also runs
# where `zip` is not installed, and follows `persist_directory` instead of a
# hard-coded path. An existing db.zip is overwritten.
shutil.make_archive("db", "zip", root_dir=".", base_dir=persist_directory)

In [None]:
# To clean up, delete the collection from the vector store, then call
# persist() (presumably to flush the deletion to disk — confirm against the
# installed Chroma version).
vectordb.delete_collection()
vectordb.persist()

In [None]:
# Delete the persisted database directory. Pure Python so the cell is not tied
# to a POSIX shell, and driven by `persist_directory` rather than a hard-coded
# path. ignore_errors=True mirrors `rm -rf`: a missing directory is not an error.
shutil.rmtree(persist_directory, ignore_errors=True)