In [None]:
!pip install pypdf2 langchain_experimental faissdb langchain_google_genai langchain_google_vertexai



Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Collecting faissdb
  Downloading faissdb-1.0.0.1-py3-none-any.whl.metadata (1.2 kB)
Collecting langchain_google_genai
  Downloading langchain_google_genai-2.0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting langchain_google_vertexai
  Downloading langchain_google_vertexai-2.0.9-py3-none-any.whl.metadata (3.8 kB)
Collecting langchain-community<0.4.0,>=0.3.0 (from langchain_experimental)
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<0.4.0,>=0.3.28 (from langchain_experimental)
  Downloading langchain_core-0.3.28-py3-none-any.whl.metadata (6.3 kB)
Collecting gunicorn (from faissdb)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting uvicorn (from faissdb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Col

In [None]:
from PyPDF2 import PdfReader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_vertexai import VertexAIEmbeddings
import google.generativeai as genai
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Steps to Implement the RAG
- 1.Define Data Source
- 2.Extract the text content
- 3.Preprocess Data(optional)
- 4.Create Chunks Using Suitable Chunking methods
- 5.Create Embeddings
- 6.Store that embedding in Vector database with metadata and ID
- 7.Initialize LLM and Retrieval Model
- 8.Generate Response Using Retrieved Documents

## 1.Define Data Source


In [None]:
# Path of the PDF that serves as the knowledge source for the rest of the notebook.
# NOTE(review): absolute path — presumably a Colab upload location; confirm before running elsewhere.
Data_source = "/content/banking2.pdf"

## 2.Extract the text content

In [None]:
pdf_file_path = Data_source
text_file_path = "output.txt"

# Read the PDF file
reader = PdfReader(pdf_file_path)

# Extract text from each page.
# Pages are joined with a newline so the last word of one page is not fused
# with the first word of the next page (which would corrupt every later
# chunking step). A page that yields no text contributes an empty string.
page_texts = [(page.extract_text() or "") for page in reader.pages]
all_text = "\n".join(page_texts)

# Save the extracted text to a text file
with open(text_file_path, "w", encoding="utf-8") as text_file:
    text_file.write(all_text)

print(f"Text extracted and saved to {text_file_path}")



Text extracted and saved to output.txt


In [None]:
# Re-load the extracted text from the file written by the extraction step.
# The path is relative, matching where that step wrote "output.txt", so this
# cell no longer depends on the working directory being /content.
text_file_path = "output.txt"
with open(text_file_path, "r", encoding="utf-8") as file:
    extracted_text = file.read()

## 3.Preprocess Data

In [None]:
!pip install nltk

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data files
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
# Download the 'punkt_tab' data package
nltk.download('punkt_tab') # This line is added to download the required package

# File paths
# Relative input path: the extraction step writes "output.txt" into the
# working directory, so the same relative name is used to read it back.
text_file_path = "output.txt"
preprocessed_file_path = "preprocessed_text.txt"

# Load extracted text
with open(text_file_path, "r", encoding="utf-8") as file:
    raw_text = file.read()

# 1. Collapse every run of whitespace (spaces, tabs, newlines) into one space
normalized_text = re.sub(r'\s+', ' ', raw_text)

# 2. Sentence tokenization
# This must run while punctuation is still present: once the special
# characters are stripped there are no sentence boundaries left to detect.
# Each sentence is then cleaned exactly like the full text below.
sentences = [
    re.sub(r'[^a-zA-Z0-9\s]', '', sentence).lower()
    for sentence in sent_tokenize(normalized_text)
]

# 3. Remove special characters and convert to lowercase
cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', normalized_text)
cleaned_text = cleaned_text.lower()

# 4. Word tokenization
words = word_tokenize(cleaned_text)

# 5. Remove stopwords
stop_words = set(stopwords.words("english"))
filtered_words = [word for word in words if word not in stop_words]

# 6. Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

# Combine back into a string
preprocessed_text = " ".join(lemmatized_words)

# Save preprocessed text to a file
with open(preprocessed_file_path, "w", encoding="utf-8") as file:
    file.write(preprocessed_text)

print(f"Preprocessed text saved to {preprocessed_file_path}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Preprocessed text saved to preprocessed_text.txt


# 4.Create Chunks Using Suitable Chunking methods

## what is Chunking ?
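Chunking is a technique used to break down large pieces of information into smaller, more manageable units or “chunks.” This method leverages the brain’s natural ability to remember grouped information more easily than individual pieces. In a RAG pipeline, chunking means splitting the source document into pieces small enough to be embedded, stored, and retrieved individually.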

## 4.1 Fixed-Size Chunking

Fixed-size chunking splits documents into chunks of a predefined size, typically by word count, token count, or character count.

#### When to Use:
When you need a simple, straightforward approach and the document structure isn’t critical. It works well when processing smaller, less complex documents.

#### Advantages:

- Easy to implement.
- Consistent chunk sizes.
- Fast to compute.

#### Disadvantages:

- May break sentences or paragraphs, losing context.
- Not ideal for documents where maintaining meaning is important.






In [None]:
def fixed_size_chunk(text, max_words=100):
    """Split ``text`` into chunks of at most ``max_words`` words.

    Words are whatever ``str.split()`` yields (whitespace-separated tokens);
    each chunk is its words re-joined with single spaces. The final chunk
    may be shorter than ``max_words``.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), max_words):
        window = tokens[start:start + max_words]
        pieces.append(' '.join(window))
    return pieces

# Applying Fixed-Size Chunking
# Uses the function's default of 100 words per chunk, then prints every chunk
# followed by a '---' separator.
fixed_chunks = fixed_size_chunk(extracted_text)
for chunk in fixed_chunks:
    print(chunk, '\n---\n')

Banking BasicsTable of contents Introduction 4 What is a bank? 6 How do people start banks? 7 How did banking begin? 8 Why are there so many different types of banks? 11 How do I choose a bank? 13 What types of accounts do banks offer? 14 Is it difficult to open a bank account? 16 What happens to money after you deposit it? 18 What happens when you apply for a loan? 20 What are checks, and how do they work? 23 What is electronic banking? 25 Credit cards, debit cards, stored valued cards: What’s the difference? 27 Do 
---

banks keep large amounts of gold and silver in their vaults? 30 Why do banks fail? 31 Do you lose money if your bank fails? 34 Do you lose money if your bank is robbed? 35 How does the Federal Reserve fit into the U.S. banking system? 36 Resources for Everyone 40 Introduction Some young savers stash their cash in shoe boxes or jelly jars. Others use “piggy banks,” which today look more like spaceships or cartoon characters. In any case, the same problem arises. Sooner

## 4.2 Sentence-Based Chunking
This method chunks text based on natural sentence boundaries. Each chunk contains a set number of sentences, preserving semantic units.

#### When to Use:
Maintaining coherent ideas is crucial, and splitting mid-sentence would result in losing meaning.

#### Advantages:

- Preserves sentence-level meaning.
- Better context preservation.

#### Disadvantages:

- Uneven chunk sizes, as sentences vary in length.
- May exceed token limits in models when sentences are too long.

In [None]:
import spacy

# spaCy English pipeline; its sentence segmentation supplies the boundaries.
nlp = spacy.load("en_core_web_sm")

def sentence_chunk(text, sentences_per_chunk=1):
    """Chunk ``text`` along the sentence boundaries detected by spaCy.

    Args:
        text: The text to split.
        sentences_per_chunk: How many consecutive sentences go into each
            chunk. The default of 1 returns one chunk per sentence, exactly
            as before; larger values join that many sentences with a space
            (the last chunk may hold fewer).

    Returns:
        A list of chunk strings.

    Raises:
        ValueError: If ``sentences_per_chunk`` is smaller than 1.
    """
    if sentences_per_chunk < 1:
        raise ValueError("sentences_per_chunk must be at least 1")

    sentences = [sent.text for sent in nlp(text).sents]
    if sentences_per_chunk == 1:
        # Fast path: keeps the original output byte-for-byte.
        return sentences

    return [
        " ".join(sentences[start:start + sentences_per_chunk])
        for start in range(0, len(sentences), sentences_per_chunk)
    ]

# Applying Sentence-Based Chunking
# One chunk per detected sentence; every chunk is printed followed by a
# '---' separator.
sentence_chunks = sentence_chunk(extracted_text)
for chunk in sentence_chunks:
    print(chunk, '\n---\n')


Banking BasicsTable of contents  
  Introduction          4
  What is a bank?          
---

6
  How do people start banks?         
---

7
  How did banking begin?           
---

8
  Why are there so many different types of banks?       
---

11
  How do I choose a bank?         
---

13
  What types of accounts do banks offer?       
---

14 
  Is it difficult to open a bank account?       
---

16 
  What happens to money after you deposit it?       
---

18
  What happens when you apply for a loan?       
---

20
  What are checks, and how do they work?       
---

23
   
---

What is electronic banking?         
---

25
  Credit cards, debit cards, stored valued cards: What’s the difference?     
---

27
  Do banks keep large amounts of gold and silver in their vaults?     
---

30
  Why do banks fail?          
---

31
  Do you lose money if your bank fails?        
---

34
  Do you lose money if your bank is robbed?       
---

35
  How does the Federal Reserve fit into the U.S

## 4.3 Paragraph-Based Chunking
This strategy splits text based on paragraph boundaries, treating each paragraph as a chunk.

#### When to Use:
Best for structured documents like reports or essays where each paragraph contains a complete idea or argument.

#### Advantages:

- Natural document segmentation.
- Preserves larger context within a paragraph.

#### Disadvantages:

- Paragraph lengths vary, leading to uneven chunk sizes.
- Long paragraphs may still exceed token limits.

In [None]:
def paragraph_chunk(text):
    """Split ``text`` into paragraphs, one chunk per paragraph.

    A paragraph boundary is exactly two consecutive newline characters;
    the pieces are returned unmodified (no stripping, no filtering).
    """
    boundary = '\n\n'
    return text.split(boundary)

# Applying Paragraph-Based Chunking
# Splits on blank lines and prints every resulting chunk followed by a
# '---' separator.
paragraph_chunks = paragraph_chunk(extracted_text)
for chunk in paragraph_chunks:
    print(chunk, '\n---\n')


Banking BasicsTable of contents  
  Introduction          4
  What is a bank?         6
  How do people start banks?        7
  How did banking begin?          8
  Why are there so many different types of banks?      11
  How do I choose a bank?        13
  What types of accounts do banks offer?      14 
  Is it difficult to open a bank account?      16 
  What happens to money after you deposit it?      18
  What happens when you apply for a loan?      20
  What are checks, and how do they work?      23
  What is electronic banking?        25
  Credit cards, debit cards, stored valued cards: What’s the difference?    27
  Do banks keep large amounts of gold and silver in their vaults?    30
  Why do banks fail?         31
  Do you lose money if your bank fails?       34
  Do you lose money if your bank is robbed?      35
  How does the Federal Reserve fit into the U.S. banking system?    36
  Resources for Everyone        40 Introduction
Some young savers stash their cash in shoe boxe

## 4.4 Semantic-Based Chunking
Semantic chunking is a technique in Natural Language Processing (NLP) that involves dividing a text into smaller, meaningful segments based on their semantic content, or meaning. Unlike traditional chunking methods that rely on fixed sizes or structural elements like sentences or paragraphs, semantic chunking focuses on identifying natural breakpoints in the text where the topic or context shifts significantly.

#### When to Use:
When preserving the highest level of context is critical, such as in complex, technical documents.

#### Advantages:

- Contextually meaningful chunks.
- Captures semantic relationships between sentences.

#### Disadvantages:

- Requires advanced NLP models, which are computationally expensive.
- More complex to implement.

In [None]:
from langchain.schema import Document

# # Initialize the RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=1024,
#     chunk_overlap=200,
#     length_function=len,
#     is_separator_regex=False
# )

# # Split documents using RecursiveCharacterTextSplitter
# documents = text_splitter.split_documents([Document(page_content=extracted_text)])

import os
from getpass import getpass

# Obtain the Google API key without storing it in the notebook.
# It is taken from the GOOGLE_API_KEY environment variable when set, otherwise
# the user is prompted (input is not echoed). A key written into a cell is
# exposed to everyone who can read the notebook or its history.
# NOTE(review): the key that was previously hard-coded here should be revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Enter your Google API key: ")

# Initialize embeddings
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=GOOGLE_API_KEY
)

# Apply semantic chunking directly to the full extracted text
# (the recursive splitter above is commented out and not used).
semantic_chunker = SemanticChunker(embeddings)

docs = semantic_chunker.create_documents([extracted_text])

# Print the results of semantic chunking
for doc in docs:
    print(doc.page_content)
    print('---')  # Separator

print(len(docs))




Banking BasicsTable of contents  
  Introduction          4
  What is a bank? 6
  How do people start banks? 7
  How did banking begin? 8
  Why are there so many different types of banks? 11
  How do I choose a bank? 13
  What types of accounts do banks offer? 14 
  Is it difficult to open a bank account? 16 
  What happens to money after you deposit it? 18
  What happens when you apply for a loan? 20
  What are checks, and how do they work? 23
  What is electronic banking? 25
  Credit cards, debit cards, stored valued cards: What’s the difference? 27
  Do banks keep large amounts of gold and silver in their vaults? 30
  Why do banks fail? 31
  Do you lose money if your bank fails? 34
  Do you lose money if your bank is robbed? 35
  How does the Federal Reserve fit into the U.S.
---
banking system? 36
  Resources for Everyone        40 Introduction
Some young savers stash their cash in shoe boxes or jelly jars. Others use “piggy banks,” which 
today look more like spaceships or cartoon

In [None]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.2 MB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m0.7/1.2 MB[0m [31m10.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [None]:

import os
import tiktoken



# Tokenizer used for every token count below.
# NOTE(review): this is the encoding tiktoken associates with "gpt-3.5-turbo",
# while the rest of the notebook uses Google models — counts are presumably
# only an approximation for those; confirm this is acceptable.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")  # Or appropriate model

def chunk_with_token_limit(text, max_tokens=1024, percentile=80):
    """Semantically chunk ``text`` and cap every chunk at ``max_tokens`` tokens.

    The text is first split by ``SemanticChunker`` using percentile
    breakpoints. Any resulting chunk whose token count (per the module-level
    ``encoding``) exceeds ``max_tokens`` is split further with a
    ``RecursiveCharacterTextSplitter`` that measures length in tokens.

    Text that contains no ". " separator is treated as a single candidate
    chunk and skips the semantic step, but it is still subject to the token
    cap. Previously such text was returned as-is, however long it was.

    Args:
        text: The text to chunk.
        max_tokens: Upper bound on tokens per returned chunk.
        percentile: ``breakpoint_threshold_amount`` passed to the semantic
            chunker.

    Returns:
        A list of chunk strings.
    """
    def _count_tokens(piece):
        return len(encoding.encode(piece))

    if ". " in text:
        semantic_splitter = SemanticChunker(
            embeddings=embeddings,
            breakpoint_threshold_type="percentile",
            breakpoint_threshold_amount=percentile
        )
        candidates = [
            document.page_content
            for document in semantic_splitter.create_documents([text])
        ]
    else:
        # Nothing to compare semantically: keep the text as one candidate.
        candidates = [text]

    # Built once; only used for candidates that exceed the token budget.
    recursive_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_tokens,
        chunk_overlap=0,
        length_function=_count_tokens
    )

    final_chunks = []
    for candidate in candidates:
        if _count_tokens(candidate) > max_tokens:
            final_chunks.extend(recursive_splitter.split_text(candidate))
        else:
            final_chunks.append(candidate)

    return final_chunks

# Chunk the full extracted text with the default limits (1024 tokens, 80th percentile).
chunks = chunk_with_token_limit(extracted_text)

# Report each chunk with its 1-based position and its token count.
for i, chunk in enumerate(chunks):
    num_tokens = len(encoding.encode(chunk))
    print(f"Chunk {i+1} ({num_tokens} tokens):\n{chunk}\n---")

Chunk 1 (29 tokens):
Banking BasicsTable of contents  
  Introduction          4
  What is a bank? 6
  How do people start banks?
---
Chunk 2 (195 tokens):
7
  How did banking begin? 8
  Why are there so many different types of banks? 11
  How do I choose a bank? 13
  What types of accounts do banks offer? 14 
  Is it difficult to open a bank account? 16 
  What happens to money after you deposit it? 18
  What happens when you apply for a loan? 20
  What are checks, and how do they work? 23
  What is electronic banking? 25
  Credit cards, debit cards, stored valued cards: What’s the difference? 27
  Do banks keep large amounts of gold and silver in their vaults? 30
  Why do banks fail? 31
  Do you lose money if your bank fails? 34
  Do you lose money if your bank is robbed? 35
  How does the Federal Reserve fit into the U.S.
---
Chunk 3 (4 tokens):
banking system?
---
Chunk 4 (44 tokens):
36
  Resources for Everyone        40 Introduction
Some young savers stash their cash in shoe boxe

# Different Break Points In Semantic Chunking

- In semantic chunking, "breakpoints" are the points in a text where the algorithm decides to split the text into separate chunks. These points are determined by analyzing the semantic similarity between different parts of the text, aiming to create chunks that are internally coherent and distinct from each other in meaning











## 1.   Percentile :
The default way to split is based on percentile. In this method, all differences between sentences are calculated, and then any difference greater than the X percentile is split.

## breakpoint_threshold_type="percentile":
This sets the method used to determine where to split the text into chunks. "percentile" means that the chunker will calculate the changes in semantic similarity between consecutive sentences and use a percentile of these changes as a threshold for creating breakpoints (splits).

## breakpoint_threshold_amount=80:
 This parameter works in conjunction with breakpoint_threshold_type="percentile". It specifies the percentile value to use as the threshold. In this case, it's set to 80.

## How it works:
The SemanticChunker embeds each sentence and calculates the cosine similarity between the embeddings of consecutive sentences. Each similarity is turned into a distance, 1 − similarity (1 being the similarity of a sentence to itself), so a larger value means a bigger change in meaning between one sentence and the next. These distances are collected for the entire text.

The breakpoint_threshold_amount=80 means that the chunker will find the 80th percentile of these similarity differences. Any difference in similarity that is greater than this 80th percentile value will be considered a breakpoint, and the text will be split at that point.

## Effect of changing the value:

A higher percentile (e.g., 90) means the threshold is higher. This results in fewer splits and larger chunks because only very large changes in similarity will trigger a split.
A lower percentile (e.g., 50) means the threshold is lower. This results in more splits and smaller chunks because even smaller changes in similarity can trigger a split.

In [None]:
# Percentile breakpoints with a threshold amount of 70.
# This cell rebinds the notebook-level names `text_splitter` and `docs`.
text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=70,
    min_chunk_size=512  # NOTE(review): presumably a minimum chunk length — confirm the unit

)

# Chunk the full extracted text and print every resulting Document.
docs = text_splitter.create_documents([extracted_text])
for doc in docs:
  print(doc)
  print("-----------------------------------------------------------")

# Number of chunks produced with these settings.
print(len(docs))

page_content='Banking BasicsTable of contents  
  Introduction          4
  What is a bank? 6
  How do people start banks? 7
  How did banking begin? 8
  Why are there so many different types of banks? 11
  How do I choose a bank? 13
  What types of accounts do banks offer? 14 
  Is it difficult to open a bank account? 16 
  What happens to money after you deposit it? 18
  What happens when you apply for a loan? 20
  What are checks, and how do they work? 23
  What is electronic banking? 25
  Credit cards, debit cards, stored valued cards: What’s the difference? 27
  Do banks keep large amounts of gold and silver in their vaults? 30
  Why do banks fail? 31
  Do you lose money if your bank fails? 34
  Do you lose money if your bank is robbed? 35
  How does the Federal Reserve fit into the U.S.'
-----------------------------------------------------------
page_content='banking system? 36
  Resources for Everyone        40 Introduction
Some young savers stash their cash in shoe boxes or je




## 2.   Standard Deviation :
In this method, any difference greater than X standard deviations is split.



In [None]:
# Standard-deviation breakpoints.
# This cell rebinds the notebook-level names `text_splitter` and `docs`.
text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="standard_deviation",
    # The amount is left commented out, so no explicit threshold amount is passed here.
    #breakpoint_threshold_amount=50,

)

# Chunk the full extracted text and print every resulting Document.
docs = text_splitter.create_documents([extracted_text])
for doc in docs:
  print(doc)
  print("-----------------------------------------------------------")

# Number of chunks produced with these settings.
print(len(docs))

page_content='Banking BasicsTable of contents  
  Introduction          4
  What is a bank? 6
  How do people start banks? 7
  How did banking begin? 8
  Why are there so many different types of banks? 11
  How do I choose a bank? 13
  What types of accounts do banks offer? 14 
  Is it difficult to open a bank account? 16 
  What happens to money after you deposit it? 18
  What happens when you apply for a loan? 20
  What are checks, and how do they work? 23
  What is electronic banking? 25
  Credit cards, debit cards, stored valued cards: What’s the difference? 27
  Do banks keep large amounts of gold and silver in their vaults? 30
  Why do banks fail? 31
  Do you lose money if your bank fails? 34
  Do you lose money if your bank is robbed? 35
  How does the Federal Reserve fit into the U.S. banking system? 36
  Resources for Everyone        40 Introduction
Some young savers stash their cash in shoe boxes or jelly jars. Others use “piggy banks,” which 
today look more like spaceships 



## 3.   Interquartile :
In this method, the interquartile distance is used to split chunks.


In [None]:
# Interquartile breakpoints.
# This cell rebinds the notebook-level names `text_splitter` and `docs`.
# NOTE(review): for this method the amount is presumably a multiplier on the
# interquartile range rather than a percentile — confirm that 10 is intended.
text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="interquartile",
    breakpoint_threshold_amount=10,
)

# Chunk the full extracted text and print every resulting Document.
docs = text_splitter.create_documents([extracted_text])
for doc in docs:
  print(doc)
  print("-----------------------------------------------------------")

# Number of chunks produced with these settings.
print(len(docs))

page_content='Banking BasicsTable of contents  
  Introduction          4
  What is a bank? 6
  How do people start banks? 7
  How did banking begin? 8
  Why are there so many different types of banks? 11
  How do I choose a bank? 13
  What types of accounts do banks offer? 14 
  Is it difficult to open a bank account? 16 
  What happens to money after you deposit it? 18
  What happens when you apply for a loan? 20
  What are checks, and how do they work? 23
  What is electronic banking? 25
  Credit cards, debit cards, stored valued cards: What’s the difference? 27
  Do banks keep large amounts of gold and silver in their vaults? 30
  Why do banks fail? 31
  Do you lose money if your bank fails? 34
  Do you lose money if your bank is robbed? 35
  How does the Federal Reserve fit into the U.S. banking system? 36
  Resources for Everyone        40 Introduction
Some young savers stash their cash in shoe boxes or jelly jars. Others use “piggy banks,” which 
today look more like spaceships 

## 4.   Gradient :
In this method, the gradient of the distance is used, together with the percentile method, to split chunks. This method is useful when chunks are highly correlated with each other or specific to a domain, e.g. legal or medical. The idea is to apply anomaly detection to the gradient array so that the distribution becomes wider and boundaries are easier to identify in highly semantic data.

In [None]:
# Gradient breakpoints with a threshold amount of 90.
# This cell rebinds the notebook-level names `text_splitter` and `docs`.
text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="gradient",
    breakpoint_threshold_amount=90,
)

# Chunk the full extracted text and print every resulting Document.
docs = text_splitter.create_documents([extracted_text])
for doc in docs:
  print(doc)
  print("-----------------------------------------------------------")

# Number of chunks produced with these settings.
print(len(docs))

page_content='Banking BasicsTable of contents  
  Introduction          4
  What is a bank?'
-----------------------------------------------------------
page_content='6
  How do people start banks? 7
  How did banking begin? 8
  Why are there so many different types of banks? 11
  How do I choose a bank? 13
  What types of accounts do banks offer? 14 
  Is it difficult to open a bank account? 16 
  What happens to money after you deposit it? 18
  What happens when you apply for a loan? 20
  What are checks, and how do they work? 23
  What is electronic banking? 25
  Credit cards, debit cards, stored valued cards: What’s the difference? 27
  Do banks keep large amounts of gold and silver in their vaults? 30
  Why do banks fail? 31
  Do you lose money if your bank fails? 34
  Do you lose money if your bank is robbed?'
-----------------------------------------------------------
page_content='35
  How does the Federal Reserve fit into the U.S.'
---------------------------------------------

In [None]:
# Percentile breakpoints with a threshold amount of 90.
# This is the last visible assignment to `docs` before the ChromaDB section,
# so these are the chunks that get stored in the vector database there.
text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=90,
)

docs = text_splitter.create_documents([extracted_text])  # Extract chunks
print("Generated Chunks:")
for doc in docs:
    print(doc)
    print("-----------------------------------------------------------")

print(f"Total Chunks: {len(docs)}")

Generated Chunks:
page_content='Banking BasicsTable of contents  
  Introduction          4
  What is a bank? 6
  How do people start banks?'
-----------------------------------------------------------
page_content='7
  How did banking begin? 8
  Why are there so many different types of banks? 11
  How do I choose a bank? 13
  What types of accounts do banks offer? 14 
  Is it difficult to open a bank account? 16 
  What happens to money after you deposit it? 18
  What happens when you apply for a loan? 20
  What are checks, and how do they work? 23
  What is electronic banking? 25
  Credit cards, debit cards, stored valued cards: What’s the difference? 27
  Do banks keep large amounts of gold and silver in their vaults? 30
  Why do banks fail? 31
  Do you lose money if your bank fails? 34
  Do you lose money if your bank is robbed? 35
  How does the Federal Reserve fit into the U.S.'
-----------------------------------------------------------
page_content='banking system? 36
  Resourc

# RAG With ChromaDB and Gemini


## Initialize ChromaDB client

In [None]:
# Directory handed to the persistent Chroma client.
DB_PATH = "chroma_rag_data"
# Name of the collection the chunks are stored in.
# NOTE(review): the name says "mental_health" but this notebook indexes a
# banking PDF — confirm the collection is not shared with another project's data.
DB_NAME = "mental_health_chunks"

In [None]:
!pip install chromadb ragas



In [None]:
import chromadb

from chromadb import PersistentClient

# For handling errors in ChromaDB
from chromadb.errors import DuplicateIDError

class _LangChainEmbeddingAdapter(chromadb.EmbeddingFunction):
    """Expose a LangChain embeddings object through Chroma's embedding-function protocol."""

    def __init__(self, lc_embeddings):
        self._lc_embeddings = lc_embeddings

    def __call__(self, input):
        # Chroma hands over a list of document strings and expects one
        # embedding vector per string, in the same order.
        return self._lc_embeddings.embed_documents(list(input))


def create_or_load_chroma_db(documents, path: str, name: str):
    """Store ``documents`` in a persistent Chroma collection and return it.

    The collection ``name`` inside the database at ``path`` is reused when it
    already exists and created otherwise. In both cases it is bound to the
    notebook-level ``embeddings`` model (through an adapter, because
    LangChain embedding objects do not implement Chroma's ``__call__(input)``
    protocol), so storing and querying always use the same embedder.

    Documents are written with ``upsert`` in one batch, using the chunk's
    position as its ID. Re-running the notebook therefore replaces the stored
    chunks instead of silently keeping stale ones under the same IDs.

    NOTE(review): a collection that was previously populated with a different
    embedder must be deleted (or a new ``name`` chosen) before reuse, since
    its stored vectors will not match this embedder.

    Args:
        documents: Iterable of LangChain ``Document`` objects (their
            ``page_content`` is stored) or of arbitrary objects (stored via
            ``str()``).
        path: Directory of the persistent Chroma database.
        name: Collection name.

    Returns:
        The Chroma collection holding the documents.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    embedding_function = _LangChainEmbeddingAdapter(embeddings)

    # Depending on the Chroma version, list_collections() yields either plain
    # names or Collection objects; normalise both to a set of names.
    existing_names = {
        getattr(entry, "name", entry) for entry in chroma_client.list_collections()
    }

    if name in existing_names:
        collection = chroma_client.get_collection(
            name=name, embedding_function=embedding_function
        )
        print(f"Collection '{name}' already exists. Reusing it.")
    else:
        collection = chroma_client.create_collection(
            name=name, embedding_function=embedding_function
        )
        print(f"Created new collection '{name}'.")

    texts = [
        doc.page_content if hasattr(doc, 'page_content') else str(doc)
        for doc in documents
    ]

    # Nothing to write for an empty input; skip the call entirely.
    if texts:
        positions = range(len(texts))
        collection.upsert(
            documents=texts,
            ids=[str(i) for i in positions],
            metadatas=[{"chunk_index": i} for i in positions],
        )

    return collection


# Pass the chunks into ChromaDB
# `docs` is whatever the most recently executed chunking cell produced.
db = create_or_load_chroma_db(docs, DB_PATH, DB_NAME)



Collection 'mental_health_chunks' already exists. Reusing it.




# Query Processing and Response Generation

In [None]:
def get_relevant_passage(query: str, db, n_results: int = 5):
    """Query ``db`` for ``query`` and return the matching passages as a flat list.

    ``db.query`` is called with a single query text; the per-query lists
    found under the ``"documents"`` key of its result are concatenated in order.
    """
    response = db.query(query_texts=[query], n_results=n_results)
    passages = []
    for per_query_hits in response["documents"]:
        for hit in per_query_hits:
            passages.append(hit)
    return passages

def make_rag_prompt(query: str, relevant_passages: List[str]) -> str:
    """Build the prompt sent to the LLM for a banking question.

    The passages are flattened into one line each (single and double quotes
    removed, newlines turned into spaces) and joined with spaces, then placed
    after the assistant instructions together with the user's question.

    Args:
        query: The user's question.
        relevant_passages: Passages retrieved from the vector store.

    Returns:
        The complete prompt, ending in ``"ANSWER:"``.
    """
    escaped_passages = " ".join([
        passage.replace("'", "").replace('"', "").replace("\n", " ")
        for passage in relevant_passages
    ])
    prompt = (
        f"You are a professional, knowledgeable, and friendly banking assistant who is always here to help users with their banking needs. "
        f"You provide accurate and reliable information, guidance, and support related to banking services, products, and transactions. "
        f"You respond in a clear, concise, and approachable manner, ensuring users feel confident and supported in managing their finances. "
        f"You actively listen to the user's questions and concerns, offer practical advice, and provide tailored solutions to meet their needs. "
        f"Maintain a polite, helpful, and customer-centric attitude while addressing inquiries about accounts, loans, transactions, investments, and other banking services. "
        f"Your responses should always be professional, informative, and reassuring, like a trusted advisor. "
        # The trailing ". " keeps this instruction from running straight into
        # the next sentence ("...Banking IndustryHere’s the context...").
        f"Strictly don't answer if the question is not related to the Banking Industry. "
        f"Here’s the context of the conversation:\n\n"
        f"QUESTION: '{query}'\n"
        f"PASSAGE(S): '{escaped_passages}'\n"
        f"ANSWER:"
    )

    return prompt

def generate_answer(prompt: str) -> str:
    """Send ``prompt`` to the "gemini-pro" model and return the text of its reply.

    The SDK is configured with the notebook-level ``GOOGLE_API_KEY`` on
    every call.
    """
    genai.configure(api_key=GOOGLE_API_KEY)  # Ensure this is configured
    response = genai.GenerativeModel("gemini-pro").generate_content(prompt)
    return response.text

# --- STEP 5: Interactive Query Handling ---
def process_query_and_generate_answer():
    """Prompt for a question, retrieve passages from ``db`` and print the answer.

    Prints "No relevant passages found." and stops when retrieval returns
    nothing; otherwise builds the RAG prompt, asks the model and prints its
    reply. Returns ``None`` in both cases.
    """
    user_query = input("Please enter your query: ")
    passages = get_relevant_passage(user_query, db)
    if passages:
        rag_prompt = make_rag_prompt(user_query, passages)
        print("Generated Answer:", generate_answer(rag_prompt))
    else:
        print("No relevant passages found.")



In [None]:
# --- RUN INTERFACE ---
# Blocks on input(): asks for one question and prints one answer.
process_query_and_generate_answer()

Please enter your query: who is our Prime minister
Generated Answer: I apologize, but the provided text does not contain information about the Prime Minister. I can only answer questions related to banking services and products.


In [None]:
!pip install --upgrade ragas # upgrades ragas package to the latest version
!pip install --upgrade datasets

!pip install langchain_google_genai # Install if not already installed

from langchain_google_genai import GoogleGenerativeAI



In [None]:
from typing import List
import ragas
from ragas.metrics import (
    faithfulness,
    context_recall,
    answer_relevancy,
    context_precision,
)
from datasets import Dataset
import pandas as pd
from ragas.llms import LangchainLLMWrapper
from langchain_google_genai import ChatGoogleGenerativeAI
import google.generativeai as genai

def get_relevant_passage(query: str, db, n_results: int = 5):
    """Return the passages ``db`` retrieves for ``query`` as one flat list.

    NOTE(review): this redefines a function of the same name and body that
    appears earlier in the notebook; the later definition wins.
    """
    response = db.query(query_texts=[query], n_results=n_results)
    flat_passages = []
    for hits in response["documents"]:
        flat_passages.extend(hit for hit in hits)
    return flat_passages

def make_rag_prompt(query: str, relevant_passages: List[str]) -> str:
    """Assemble the banking-assistant prompt for ``query`` and its passages.

    Every passage has its single quotes and double quotes removed and its
    newlines replaced by spaces; the cleaned passages are joined with spaces
    and embedded, together with the question, after the fixed instructions.
    """
    cleaned = []
    for passage in relevant_passages:
        for unwanted, substitute in (("'", ""), ('"', ""), ("\n", " ")):
            passage = passage.replace(unwanted, substitute)
        cleaned.append(passage)
    escaped_passages = " ".join(cleaned)

    sections = [
        "You are a professional, knowledgeable, and friendly banking assistant who is always here to help users with their banking needs. ",
        "You provide accurate and reliable information, guidance, and support related to banking services, products, and transactions. ",
        "You respond in a clear, concise, and approachable manner, ensuring users feel confident and supported in managing their finances. ",
        "You actively listen to the user's questions and concerns, offer practical advice, and provide tailored solutions to meet their needs. ",
        "Maintain a polite, helpful, and customer-centric attitude while addressing inquiries about accounts, loans, transactions, investments, and other banking services. ",
        "Your responses should always be professional, informative, and reassuring, like a trusted advisor. ",
        "Strictly don't answer if the question is not related to the Banking Industry. ",
        "Here's the context of the conversation:\n\n",
        f"QUESTION: '{query}'\n",
        f"PASSAGE(S): '{escaped_passages}'\n",
        "ANSWER:",
    ]
    return "".join(sections)

def generate_answer(prompt: str, api_key: str) -> str:
    """Send ``prompt`` to the ``gemini-pro`` model and return the reply text.

    Args:
        prompt: Full prompt string passed to ``generate_content``.
        api_key: Key handed to ``genai.configure`` before the model is built.

    Returns:
        The ``text`` attribute of the response from ``generate_content``.
    """
    # configure() applies to the genai module as a whole, so it runs first.
    genai.configure(api_key=api_key)
    response = genai.GenerativeModel("gemini-pro").generate_content(prompt)
    return response.text

class RAGEvaluator:
    """Score RAG outputs with RAGAS metrics, using Gemini as the judge model.

    The evaluator builds a dataset with the column names the installed RAGAS
    release asks for (``user_input`` / ``retrieved_contexts`` / ``response``,
    plus an optional ``reference``) and runs it through ``ragas.evaluate``.
    Calling ``metric.score`` on a whole dataset with the legacy
    ``question`` / ``contexts`` / ``answer`` columns fails with
    ``KeyError: Column response not in the dataset``.

    Attributes are set in ``__init__``:
        llm: RAGAS wrapper around the Gemini chat model used as judge.
        embeddings: Gemini embeddings model passed to ``ragas.evaluate``
            (``answer_relevancy`` needs an embeddings model).
        metrics: Mapping of metric name to the RAGAS metric object.
    """

    # Metrics that, per the RAGAS metric documentation, need a ground-truth
    # `reference` column and therefore cannot run on question/context/answer alone.
    _REFERENCE_METRICS = ('context_recall', 'context_precision')

    def __init__(self, api_key: str):
        """Create the judge LLM, the embeddings model and the metric table.

        Args:
            api_key: Google API key used for both the chat model and the
                embeddings model.
        """
        # Local import so this class does not depend on which of the
        # notebook's import cells has been executed.
        from langchain_google_genai import GoogleGenerativeAIEmbeddings

        # Initialize the LLM wrapper with proper model configuration
        llm = ChatGoogleGenerativeAI(
            google_api_key=api_key,
            model="gemini-pro",
            temperature=0,
            max_output_tokens=512,
        )
        self.llm = LangchainLLMWrapper(llm)

        # Without an explicit embeddings model RAGAS falls back to its own
        # default provider, which is not configured in this notebook.
        self.embeddings = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001",
            google_api_key=api_key,
        )

        # Initialize metrics
        self.metrics = {
            'faithfulness': faithfulness,
            'context_recall': context_recall,
            'context_precision': context_precision,
            'answer_relevancy': answer_relevancy,
        }
        # Set LLM for each metric. NOTE: these are the module-level metric
        # objects imported from ragas.metrics, so the assignment is shared
        # by every user of those objects in this kernel.
        for metric in self.metrics.values():
            metric.llm = self.llm

    @staticmethod
    def _build_columns(queries, contexts, answers, ground_truths=None):
        """Build the plain ``dict`` of columns for the evaluation dataset.

        Args:
            queries: One question per row.
            contexts: One entry per row; either a single string (wrapped into
                a one-element list) or an iterable of passage strings.
            answers: One generated answer per row.
            ground_truths: Optional reference answer per row.

        Returns:
            Dict with ``user_input``, ``retrieved_contexts`` and ``response``
            keys, plus ``reference`` when ``ground_truths`` is given.

        Raises:
            ValueError: If the inputs do not all have the same length.
        """
        queries, contexts, answers = list(queries), list(contexts), list(answers)
        if not (len(queries) == len(contexts) == len(answers)):
            raise ValueError(
                "queries, contexts and answers must have the same length "
                f"(got {len(queries)}, {len(contexts)}, {len(answers)})"
            )

        columns = {
            'user_input': queries,
            'retrieved_contexts': [
                [ctx] if isinstance(ctx, str) else list(ctx) for ctx in contexts
            ],
            'response': answers,
        }

        if ground_truths is not None:
            ground_truths = list(ground_truths)
            if len(ground_truths) != len(queries):
                raise ValueError(
                    "ground_truths must have the same length as queries "
                    f"(got {len(ground_truths)}, expected {len(queries)})"
                )
            columns['reference'] = ground_truths
        return columns

    def prepare_evaluation_dataset(self, queries, contexts, answers, ground_truths=None):
        """
        Prepare dataset for RAGAS evaluation

        Args:
            queries: One question per row.
            contexts: One context per row (a string, or an iterable of
                passage strings).
            answers: One generated answer per row.
            ground_truths: Optional reference answers; when supplied the
                reference-based metrics can be computed as well.

        Returns:
            A ``datasets.Dataset`` built from the columns described in
            ``_build_columns``.
        """
        return Dataset.from_dict(
            self._build_columns(queries, contexts, answers, ground_truths)
        )

    def evaluate(self, eval_dataset):
        """
        Evaluate RAG system using RAGAS metrics

        Args:
            eval_dataset: Dataset produced by ``prepare_evaluation_dataset``
                (assumed to expose ``column_names``, as ``datasets.Dataset``
                does).

        Returns:
            Dict mapping every name in ``self.metrics`` to a float: the mean
            score over all rows, or ``nan`` for metrics that were skipped
            because the dataset has no ground-truth column.
        """
        import warnings

        columns = set(getattr(eval_dataset, 'column_names', ()))
        has_reference = bool(columns & {'reference', 'ground_truth'})

        runnable = {
            name: metric
            for name, metric in self.metrics.items()
            if has_reference or name not in self._REFERENCE_METRICS
        }
        skipped = [name for name in self.metrics if name not in runnable]
        if skipped:
            warnings.warn(
                "Skipping metrics that need ground-truth answers "
                f"(pass ground_truths to prepare_evaluation_dataset): {skipped}"
            )

        # Every metric is reported; the skipped ones stay NaN.
        results = {name: float('nan') for name in self.metrics}
        if not runnable:
            return results

        outcome = ragas.evaluate(
            eval_dataset,
            metrics=list(runnable.values()),
            llm=self.llm,
            embeddings=self.embeddings,
        )

        # One row per sample and one column per metric; average over rows so
        # each metric yields a single float.
        score_frame = outcome.to_pandas()
        for name, metric in runnable.items():
            results[name] = float(score_frame[metric.name].mean())
        return results

def process_query_and_generate_answer(query: str = None, evaluate: bool = False, api_key: str = None, db = None):
    """Answer ``query`` from retrieved passages and optionally score the result.

    Args:
        query: The user's question; when ``None`` it is read with ``input``.
        evaluate: When true, the answer is also scored with ``RAGEvaluator``.
        api_key: Key passed to ``generate_answer`` and ``RAGEvaluator``.
        db: Passage store handed to ``get_relevant_passage``.

    Returns:
        ``(answer, relevant_passages, evaluation_results)``. The third item is
        ``None`` when ``evaluate`` is false; all three are ``None`` when no
        passages were retrieved.

    Raises:
        ValueError: If ``api_key`` or ``db`` is ``None``.
    """
    # The prompt for a missing query comes before argument validation.
    if query is None:
        query = input("Please enter your query: ")

    if api_key is None:
        raise ValueError("API key is required")
    if db is None:
        raise ValueError("Database connection is required")

    # Retrieval
    passages = get_relevant_passage(query, db)
    if not passages:
        print("No relevant passages found.")
        return None, None, None

    # Generation
    answer = generate_answer(make_rag_prompt(query, passages), api_key)
    print("Generated Answer:", answer)

    if not evaluate:
        return answer, passages, None

    # Evaluation: all passages are merged into a single context string.
    evaluator = RAGEvaluator(api_key)
    merged_context = " ".join(passages)
    eval_dataset = evaluator.prepare_evaluation_dataset(
        queries=[query],
        contexts=[merged_context],
        answers=[answer],
    )
    scores = evaluator.evaluate(eval_dataset)

    print("\nEvaluation Results:")
    for metric_name in scores:
        print(f"{metric_name}: {scores[metric_name]:.3f}")

    return answer, passages, scores

# Example usage with evaluation
if __name__ == "__main__":
    import getpass
    import os

    # Never store credentials in the notebook: take the key from the
    # GOOGLE_API_KEY environment variable, or ask for it without echoing.
    # NOTE: the key that used to be hardcoded here has been exposed and
    # should be revoked and replaced.
    GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass.getpass(
        "Enter your Google API key: "
    )
    query = "What are the different types of bank accounts?"
    answer, contexts, eval_results = process_query_and_generate_answer(
        query,
        evaluate=True,
        api_key=GOOGLE_API_KEY,
        db=db  # Make sure to initialize your database connection
    )

Generated Answer: Certainly! Here are the different types of bank accounts that are commonly offered:

* **Savings accounts:** Designed for keeping your money safe while earning interest. They offer easy access to your funds.

* **Certificates of deposit (CDs):** Similar to savings accounts but with a fixed term and interest rate. You cannot withdraw your funds before the term ends without facing penalties.

* **Individual retirement accounts (IRAs):** Special savings accounts designed for retirement planning, offering tax benefits.

* **Checking accounts:** Allow you to make payments and manage your daily finances conveniently through checks or debit cards.

* **Money market deposit accounts (MMDAs):** Combine features of checking and savings accounts, offering higher interest rates and check-writing privileges while requiring a higher minimum balance.

Different banks may offer additional account types or variations tailored to specific needs. It's important to research and compare o

  score = metric.score(eval_dataset)


KeyError: "Column response not in the dataset. Current columns in the dataset: ['question', 'contexts', 'answer']"