<a href="https://colab.research.google.com/github/RajivDalal/GenAI-Practice/blob/main/ChromaDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
!pip -q install chromadb google-genai langchain_community langchain-chroma langchain_google_genai tiktoken

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m61.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m

In [2]:
!wget -q https://www.dropbox.com/s/vs6ocyvpzzncvwh/new_articles.zip

In [3]:
!unzip -q new_articles.zip -d new_articles

# Import Langchain Libraries

In [4]:
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader

# Load Data

In [5]:
loader = DirectoryLoader("/content/new_articles/", glob = "./*.txt", loader_cls= TextLoader)

In [6]:
doc = loader.load()

# Split Data into chunks

In [7]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
text = text_splitter.split_documents(doc)

# Creating DB

In [8]:
from google.colab import userdata
GEMINI_API = userdata.get('GEMINI_API')

In [9]:
persist_directory = "db"


embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=GEMINI_API)

vectordb = Chroma.from_documents(documents=text, embedding=embeddings, persist_directory=persist_directory)

# Data Retrieval

In [10]:
retriever = vectordb.as_retriever()

In [11]:
docs = retriever.get_relevant_documents("How much money did Microsoft raise?")

  docs = retriever.get_relevant_documents("How much money did Microsoft raise?")


In [12]:
docs[2]

Document(id='27bf5e7d-a0bb-4354-85f1-19ddc7f6ffaa', metadata={'source': '/content/new_articles/05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt'}, page_content='March 1, 2023\n\nOpenAI makes another move toward monetization by launching a paid API for ChatGPT. Instacart, Snap (Snapchat’s parent company) and Quizlet are among its initial customers.\n\nFebruary 7, 2023\n\nAt a press event in Redmond, Washington, Microsoft announced its long-rumored integration of OpenAI’s GPT-4 model into Bing, providing a ChatGPT-like experience within the search engine. The announcement spurred a 10x increase in new downloads for Bing globally, indicating a sizable consumer demand for new AI experiences.\n\nOther companies beyond Microsoft joined in on the AI craze by implementing ChatGPT, including OkCupid, Kaito, Snapchat and Discord — putting the pressure on Big Tech’s AI initiatives, like Google.\n\nFebruary 1, 2023\n\nAfter ChatGPT took the internet by storm, OpenAI launc

In [13]:
len(docs)

4

In [14]:
# How many results
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

In [15]:
retriever.search_type

'similarity'

# Creating a chain

In [17]:
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAI

In [23]:
## Helper Function to Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [25]:
llm = GoogleGenerativeAI(model="gemini-pro", google_api_key=GEMINI_API)

In [26]:
qa_chain = RetrievalQA.from_chain_type(llm=llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [30]:
query = "What products did OpenAI launch?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

ChatGPT, ChatGPT Plus, and Plugins for ChatGPT


Sources:
/content/new_articles/05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt
/content/new_articles/05-03-chatgpt-everything-you-need-to-know-about-the-ai-powered-chatbot.txt
