In [None]:
# Show the metadata of the chromadb package installed in the kernel's environment.
%pip show chromadb

In [None]:
# One-time setup (disabled): uncomment to download the new_articles.zip archive.
# !wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip

In [None]:
# One-time setup (disabled): uncomment to extract the archive into ./new_articles,
# the directory the DirectoryLoader cell reads from.
# !unzip -q new_articles.zip -d new_articles

In [None]:
import os

# Read the key from the environment so it never has to be hardcoded in the notebook.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
  raise ValueError("Please set the OPENAI_API_KEY environment variable.")
else:
  print("OpenAI API key is set.")
  # Never echo the full secret: cell output is saved with the notebook and is
  # shared or committed along with it. Show only the first and last 4
  # characters, and only when the key is long enough that this still hides
  # most of it; otherwise show a fixed placeholder.
  if len(openai_api_key) > 16:
    print(f"{openai_api_key[:4]}...{openai_api_key[-4:]}")
  else:
    print("****")

OpenAI API key is set.
sk-proj-****REDACTED****


In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

In [None]:
# Load every file under ./new_articles that matches the "**/*.txt" glob,
# reading each one with TextLoader.
loader = DirectoryLoader("./new_articles", glob="**/*.txt", loader_cls=TextLoader)
documents = loader.load()

# Display the loaded documents as the cell's output.
documents

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200).
# `text` holds the resulting list of chunk documents, not a plain string.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
text = text_splitter.split_documents(documents)

In [None]:
# Display the chunks produced by the splitter as the cell's output.
text

In [None]:
# Report how many chunks the splitter produced, then preview the first one.
chunk_count = len(text)
print(f"Number of documents: {chunk_count}")

# Only the first 100 characters of the first chunk are shown.
first_chunk_preview = text[0].page_content[:100]
print(f"First document: {first_chunk_preview}...")

## Creating DB

In [None]:

# NOTE(review): `embeddings` is not referenced by any cell visible in this notebook.
from langchain import embeddings

# Location handed to Chroma as its persist directory (relative path 'db').
persist_directory = 'db'

# Embedding function built with default arguments.
embedding = OpenAIEmbeddings()

# Build the Chroma store from the split chunks in `text`.
vectordb = Chroma.from_documents(
    documents=text,
    embedding=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Persist the db to disk.
vectordb.persist()
# Drop the in-memory handle; `vectordb` is rebound when the store is reloaded
# from `persist_directory`.
vectordb = None

In [None]:
# Reopen the database persisted under `persist_directory`, using the same
# embedding function as when it was built; from here on it is used as normal.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

In [None]:
# Retriever over the reloaded store, with default search settings.
retriever = vectordb.as_retriever()

# Run one sample question through it.
sample_question = "How much money did Microsoft raise?"
docs = retriever.get_relevant_documents(sample_question)

# Show the text of the first returned document, then how many were returned.
first_hit = docs[0]
print(first_hit.page_content)
print(len(docs))

In [None]:
# Display the documents returned for the sample question as the cell's output.
docs

In [None]:
# Rebuild the retriever with search_kwargs={"k": 2}; this is the retriever
# the QA chain is built on.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

# Echo the retriever's search type and search kwargs, one per line.
print(retriever.search_type, retriever.search_kwargs, sep="\n")

## Make a chain

In [None]:
# Alternative import from the langchain_openai package, left disabled; the
# OpenAI class used below comes from the earlier `langchain.llms` import.
# from langchain_openai import OpenAI

# Build the LLM client, passing the key read from OPENAI_API_KEY explicitly.
llm = OpenAI(openai_api_key=openai_api_key)
# Show the client's printed representation.
print(llm)

In [None]:
from langchain.chains import RetrievalQA

In [None]:
# Create the chain to answer questions.
# Reuse the `llm` configured with the explicit API key instead of constructing
# a second, separately configured OpenAI() instance that left `llm` unused.
# return_source_documents=True makes the chain's response include the
# "source_documents" entry that process_llm_response reads.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
## Cite sources
def process_llm_response(llm_response):
  """Print the chain's answer followed by the source of each supporting document.

  Args:
    llm_response: Mapping with a 'result' entry (the answer) and a
      'source_documents' entry, an iterable of objects whose ``metadata``
      mapping has a 'source' key.

  Returns:
    None. Everything is written to standard output.
  """
  print(llm_response['result'])
  print('\n\nSources:')
  supporting_docs = llm_response["source_documents"]
  for doc in supporting_docs:
    origin = doc.metadata['source']
    print(origin)

In [None]:
# Full example: run one question through the QA chain, then print the answer
# and its sources.
query = "How much money did Microsoft raise?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

In [None]:
# Second example: the same steps with a different question.
query = "What is machine learning?"
llm_response = qa_chain(query)
process_llm_response(llm_response)