In [None]:
# Print a greeting banner for the session.
print("Welcome to Chroma DB session")

## Install the required libraries and download some documents for practice

In [None]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the running kernel rather than whatever `pip` is first on PATH.
%pip install -q chromadb openai langchain tiktoken

In [None]:
# Show the chromadb version installed in the kernel's own environment
# (%pip queries the same interpreter the notebook is running on).
%pip show chromadb

In [None]:
# Download the sample articles archive. -nc (no-clobber) skips the download when
# the file is already present, so re-running the cell does not create
# new_articles.zip.1, new_articles.zip.2, ...
!wget -q -nc https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip

In [None]:
# Extract the archive into ./new_articles. -o overwrites existing files without
# asking, so a re-run does not stall on unzip's interactive "replace?" prompt.
!unzip -q -o new_articles.zip -d new_articles

## Setting up Environment

In [None]:
import os
from getpass import getpass

# Never hard-code the key in the notebook: it would be saved and shared with the
# file, and assigning unconditionally would clobber a valid key that is already
# exported in the environment. Prompt (without echo) only when none is set.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass("Enter your OpenAI API key: ")

## Import some libraries

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

## Load data

In [None]:
# Load every .txt file from the directory the archive was extracted into
# (`unzip ... -d new_articles`); the previous absolute path
# "/chroma_content/news_articles/" is never created by this notebook.
loader = DirectoryLoader("./new_articles/", glob="./*.txt", loader_cls=TextLoader)

In [None]:
# Execute the loader and keep the loaded documents for the splitting step.
document = loader.load()

In [None]:
# Display everything that was loaded (the output can be long).
document

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Configure the splitter (chunk_size=1000, chunk_overlap=200), then cut the
# loaded documents into chunks that will later be embedded.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
text = text_splitter.split_documents(document)

In [None]:
# Display all chunks produced by the splitter.
text

In [None]:
# Number of chunks produced by the splitter.
len(text)

In [None]:
# Inspect the second chunk (index 1).
text[1]

In [None]:
# Inspect the third chunk (index 2).
text[2]

## Creating DB

In [None]:
from langchain import embeddings
# Embedding function handed to Chroma.
embedding = OpenAIEmbeddings()

# Directory handed to Chroma as its persist_directory.
persist_directory = 'db'

# Build the vector store from the split chunks.
vectordb = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=embedding,
    documents=text,
)

In [None]:
# Persist the DB to disk.
vectordb.persist()
# Drop the in-memory reference so the next cell has to rebuild `vectordb`
# from the persisted directory.
vectordb = None

In [None]:
# Reopen the database that was persisted to disk; from here on it is used as normal.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

## Make a retriever

In [None]:
# Wrap the vector store in a retriever using its default settings (no arguments passed).
retriever = vectordb.as_retriever()

In [None]:
# Ask the retriever for the documents relevant to a sample question.
docs = retriever.get_relevant_documents("How  big investment microsoft raise?")

In [None]:
# Number of documents the retriever returned.
len(docs)

In [None]:
# Display the retrieved documents.
docs

In [None]:
# Rebuild the retriever, passing k=3 through search_kwargs (presumably the number
# of documents returned per query — confirm against the LangChain docs).
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

In [None]:
# Inspect which search type the retriever is configured with.
retriever.search_type

In [None]:
# Inspect the retriever's search kwargs.
retriever.search_kwargs

## Make a chain

In [None]:
from langchain.chains import RetrievalQA

In [None]:
# Instantiate the OpenAI LLM wrapper with its default arguments.
llm=OpenAI()

In [None]:
# Display the LLM object to inspect its configuration.
llm

In [None]:
# Create the chain to answer questions.
# Reuse the `llm` instance built in the earlier cell instead of constructing a
# second OpenAI() client, so the model that was inspected is the one the chain uses.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    # process_llm_response reads the "source_documents" entry of the response.
    return_source_documents=True,
)

In [None]:
## Proper document retrieval function
def process_llm_response(llm_response: dict) -> None:
    """Print the answer of a QA-chain response followed by its sources.

    Args:
        llm_response: Mapping returned by the QA chain. ``llm_response['result']``
            is printed as the answer. Each entry of
            ``llm_response['source_documents']`` (when present) must expose a
            ``metadata`` mapping; its ``'source'`` value is printed on its own line.

    Raises:
        KeyError: If ``llm_response`` has no ``'result'`` entry.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    # Tolerate a response without source documents (e.g. a chain that was not
    # built with return_source_documents=True) instead of raising KeyError.
    for source in llm_response.get("source_documents") or []:
        # Not every document carries a 'source' entry; print a placeholder
        # rather than aborting halfway through the list.
        print(source.metadata.get('source', '<unknown source>'))

In [None]:
# Full example: ask a question, run it through the QA chain, then print the
# answer together with the source of each supporting document.
query = "money raised by microsoft?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

## Deleting the DB


In [None]:
# Archive the persisted db directory (recursively) into db.zip before it is removed below.
!zip -r db.zip ./db

In [None]:
# To clean up, delete the collection from the vector store...
vectordb.delete_collection()
# ...then call persist() again — presumably so the deletion is flushed to the
# persist directory (TODO confirm against the Chroma docs).
vectordb.persist()

# Delete the db directory from disk (the db.zip archive is not touched by this command).
!rm -rf db/