<a href="https://colab.research.google.com/github/VaishnaviOnPC/LangChain_RAG_with_Gemini/blob/main/RAG_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -Uq langchain_community langchain_core langchain-google-genai langchainhub chromadb langchain

In [None]:
import os
from getpass import getpass

# Tracing configuration, exported as environment variables.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# Never paste a real key into the notebook. Reuse a key that is already
# exported in the environment; otherwise (unset, empty, or still the
# 'YOUR_API_KEY' placeholder) prompt for it without echoing the input.
if os.environ.get('LANGCHAIN_API_KEY', '') in ('', 'YOUR_API_KEY'):
    os.environ['LANGCHAIN_API_KEY'] = getpass('LangSmith API key: ')

In [None]:
from getpass import getpass

# Never paste a real key into the notebook. Reuse a key that is already
# exported in the environment; otherwise (unset, empty, or still the
# 'YOUR_API_KEY' placeholder) prompt for it without echoing the input.
if os.environ.get('GEMINI_API_KEY', '') in ('', 'YOUR_API_KEY'):
    os.environ['GEMINI_API_KEY'] = getpass('Gemini API key: ')

In [None]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.documents import Document
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI



In [None]:
# --- Load: fetch the article and keep only the elements whose CSS class is
# one of post-content / post-title / post-header.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2024-07-07-hallucination/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()
# Uncomment to inspect the loaded documents:
# for i, doc in enumerate(docs):
#     print(f"Document {i+1}:\n{doc.page_content}\n{'='*80}")

# --- Split: chunks of up to 1000 characters with a 200-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
print(f"Number of splits: {len(splits)}")
# Uncomment to inspect the individual chunks:
# for i, doc in enumerate(splits):
#     print(f"Chunk {i+1}:\n{doc.page_content}\n{'='*80}")

# --- Embed + index.
embedding_model = GoogleGenerativeAIEmbeddings(google_api_key=os.environ.get("GEMINI_API_KEY"), model="models/embedding-001")

# Give every chunk a stable, position-based id. Without explicit ids each run
# of this cell gets fresh random ids, so re-running it in the same kernel
# appends a second copy of every chunk and the retriever returns duplicates.
# NOTE(review): relies on Chroma overwriting entries whose id already exists;
# chunks left over from an earlier run with MORE splits are not removed.
split_ids = [f"chunk-{i}" for i in range(len(splits))]
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model, ids=split_ids)

retriever = vectorstore.as_retriever()

# --- Prompt, model, and output parser used by the RAG chain.
prompt = hub.pull("rlm/rag-prompt")

llm = ChatGoogleGenerativeAI(google_api_key=os.environ.get("GEMINI_API_KEY"), model="gemini-2.0-flash", temperature=0)

output_parser = StrOutputParser()

def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string; empty when ``docs`` yields nothing.
    """
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)

# Compose the pipeline: the retriever output is formatted into `context`, the
# incoming question is passed through unchanged as `question`, and the pair is
# piped through the prompt, the LLM, and finally the string output parser.
rag_chain = (
    dict(context=retriever | format_docs, question=RunnablePassthrough())
    | prompt
    | llm
    | output_parser
)

# Run a single query through the chain and show the answer.
result = rag_chain.invoke("What is Fine-tuning?")
print(f"Response: {result}")

Number of splits: 62
Response: Fine-tuning is a technique used to improve the capabilities of a pre-trained language model, such as instruction following. It involves training the model on a specific dataset to enhance its performance on a particular task. This process can also introduce new knowledge, though it may increase the model's tendency to hallucinate.


In [None]:
# Toy question / document pair reused by the token-estimation and
# embedding-similarity cells below.
question, document = (
    "what are my favorite places on Earth?",
    "Anywhere with mountains and greenery is one of my favorite places on Earth.",
)

In [None]:
import re

def estimate_gemini_tokens(text: str, fudge_factor: float = 0.3, chars_per_token: float = 4) -> int:
    """
    Roughly estimate how many tokens a text would occupy for Gemini models.

    This is a heuristic, not a tokenizer: it counts words, inflates that count
    by ``fudge_factor`` to allow for sub-word pieces, and then adds one token
    per ``chars_per_token`` characters that are not part of any word (spaces,
    newlines, punctuation).

    Known limitation: words are found with the regex ``\\b\\w+\\b``, so text
    written without separators between words (for example CJK) matches as one
    long word and is likely under-counted.

    Args:
        text: The text string to estimate tokens for.
        fudge_factor: Fractional inflation applied to the word count
            (0.3 means 30% more tokens than words). Must be >= 0.
        chars_per_token: How many non-word characters are assumed to make up
            one token. Must be > 0.

    Returns:
        An integer representing the estimated number of tokens; 0 for empty
        (or otherwise falsy) text.

    Raises:
        ValueError: If ``fudge_factor`` is negative or ``chars_per_token`` is
            not positive.
    """
    if fudge_factor < 0:
        raise ValueError("fudge_factor must be non-negative")
    if chars_per_token <= 0:
        raise ValueError("chars_per_token must be positive")

    if not text:
        return 0

    # 1. Word count: runs of word characters delimited by word boundaries.
    words = re.findall(r'\b\w+\b', text)
    word_count = len(words)

    # 2. Inflate for sub-word tokens; truncation (not rounding) is intentional
    #    so results match the original fixed-constant implementation.
    estimated_tokens = int(word_count * (1 + fudge_factor))

    # 3. Everything that is not inside a matched word (whitespace, newlines,
    #    punctuation) contributes one token per `chars_per_token` characters.
    extra_chars = len(text) - sum(len(word) for word in words)
    estimated_tokens += int(extra_chars / chars_per_token)

    return estimated_tokens

def test_estimation(text: str) -> None:
    """
    Print the given text followed by its estimated token count.
    """
    token_estimate = estimate_gemini_tokens(text)
    report_lines = (
        f"Text: \"{text}\"",
        f"Estimated tokens: {token_estimate}",
    )
    for line in report_lines:
        print(line)

def main():
    """
    Print token estimates for the notebook's sample question and document.
    """
    for sample in (question, document):
        test_estimation(sample)

    # Further sample inputs, disabled by default; uncomment to try them:
    # test_estimation("This is a simple sentence.")
    # test_estimation("Unbelievably complex, and very long-winded, isn't it?")
    # test_estimation("你好世界")  # Test with non-ASCII characters
    # test_estimation("1234567890") # Test with numbers
    # test_estimation("   Extra   spaces   here   ") # Test with extra spaces
    # test_estimation("Line 1\nLine 2\nLine 3") # Test with newlines
    # test_estimation("") # Test with empty string
    # test_estimation("a b c d e f g h i j k l m n o p q r s t u v w x y z") # Test with many single letters

if __name__ == "__main__":
    main()


Text: "what are my favorite places on Earth?"
Estimated tokens: 10
Text: "Anywhere with mountains and greenery is one of my favorite places on Earth."
Estimated tokens: 19


In [None]:
# Embed the sample question and document with the same embedding model and
# show the length of each resulting vector.
# NOTE(review): the document is embedded with embed_query, exactly like the
# question; confirm whether embed_documents would be more appropriate here.
embedding = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=os.environ.get("GEMINI_API_KEY"),
)
query_result, document_result = [
    embedding.embed_query(sample) for sample in (question, document)
]
len(query_result), len(document_result)

(768, 768)

In [None]:
import numpy as np

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First vector (any sequence or array of numbers).
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. If either vector has zero
        norm the similarity is undefined; 0.0 is returned instead of dividing
        by zero (which would yield nan and a RuntimeWarning).

    Raises:
        ValueError: Raised by ``np.dot`` when the vector lengths do not match.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # At least one zero vector: no direction to compare.
        return 0.0
    return dot_product / norm_product

# Compare the question embedding with the document embedding.
similarity = cosine_similarity(vec1=query_result, vec2=document_result)
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.7928199970798053
