In [3]:
import os
os.environ["GEMINI_API_KEY"]="AIzaSyCxd0Gt8gZxFcot6urLX0KJtKxz3rOuQV8"

In [4]:
from pypdf import PdfReader

def load_pdf(file_path):
    """
    Reads the text content from a PDF file and returns it as a single string.

    Parameters:
    - file_path (str): The file path to the PDF file.

    Returns:
    - str: The concatenated text content of all pages in the PDF.
    """
    # Logic to read pdf
    reader = PdfReader(file_path)

    # Loop over each page and store it in a variable
    text = ""
    for page in reader.pages:
        text += page.extract_text()

    return text

# replace the path with your file path
pdf_text = load_pdf(file_path="./final.pdf")

In [12]:
import re
def split_text(text: str):
    """
    Splits a text string into a list of non-empty substrings based on the specified pattern.
    The "\n \n" pattern will split the document para by para
    Parameters:
    - text (str): The input text to be split.

    Returns:
    - List[str]: A list containing non-empty substrings obtained by splitting the input text.

    """
    split_text = re.split('\n \n', text)
    for i in range(len(split_text)):
        split_text[i] = split_text[i].replace("\n", "")

    return [i for i in split_text if i != ""]

chunked_text = split_text(text=pdf_text)
len(chunked_text)

2

In [6]:
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os

class GeminiEmbeddingFunction(EmbeddingFunction):
    """
    Custom embedding function using the Gemini AI API for document retrieval.

    This class extends the EmbeddingFunction class and implements the __call__ method
    to generate embeddings for a given set of documents using the Gemini AI API.

    Parameters:
    - input (Documents): A collection of documents to be embedded.

    Returns:
    - Embeddings: Embeddings generated for the input documents.
    """
    def __call__(self, input: Documents) -> Embeddings:
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        return genai.embed_content(model=model,
                                   content=input,
                                   task_type="retrieval_document",
                                   title=title)["embedding"]

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import chromadb
from typing import List
def create_chroma_db(documents:List, path:str, name:str):
    """
    Creates a Chroma database using the provided documents, path, and collection name.

    Parameters:
    - documents: An iterable of documents to be added to the Chroma database.
    - path (str): The path where the Chroma database will be stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - Tuple[chromadb.Collection, str]: A tuple containing the created Chroma Collection and its name.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for i, d in enumerate(documents):
        db.add(documents=d, ids=str(i))

    return db, name

db,name =create_chroma_db(documents=chunked_text, 
                          path="./", #replace with your path
                          name="rag_experiment2")

UniqueConstraintError: Collection rag_experiment2 already exists

In [None]:
def load_chroma_collection(path, name):
    """
    Loads an existing Chroma collection from the specified path with the given name.

    Parameters:
    - path (str): The path where the Chroma database is stored.
    - name (str): The name of the collection within the Chroma database.

    Returns:
    - chromadb.Collection: The loaded Chroma Collection.
    """
    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    return db

db=load_chroma_collection(path="./", name="rag_experiment2")
db

Collection(name=rag_experiment2)

In [13]:
def upload(documents:List, path:str, name:str):
    chroma_client = chromadb.PersistentClient(path=path)
    collection = chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())
    for i, d in enumerate(documents):
        collection.add(documents=d, ids=str(i))
    
pdf_text = load_pdf(file_path="./Detailed_Summary_of_Tax_Filing_Process_for_Freelancers_in_Pakistan.pdf")
print(pdf_text)
chunked_text = split_text(text=pdf_text)
upload(path='./', name="rag_experiment2", documents=chunked_text)
chunked_text

Detailed Summary of Tax Filing Process for Freelancers in Pakistan  
This guide details the steps involved in filing taxes in Pakistan as a freelancer using the Iris portal. It 
highlights the importance of PSEB registration for freelancers in the IT sector and offers alternative 
methods to locate previously withheld taxes . 
 
Steps:  
1. Access and Login:  Begin by accessing and logging in to the Iris portal, the online tax filing system 
for Pakistan.  
2. Select Tax Return and Period:  Navigate to the section for filing tax returns and choose "normal 
return." Here, you'll also specify the tax year applicable to your filing.  
3. Declare Freelancer Income:  Locate the section dedicated to "foreign sources." While it initially 
mentions "Agriculture," freelancers should select "foreign sources" within this section. Under 
"Foreign Other Sources Income/Loss," enter your total annual income from freelancing platf orms 
like Upwork.  
4. Calculate Tax Based on PSEB Registration:  This

Insert of existing embedding ID: 0
Add of existing embedding ID: 0
Insert of existing embedding ID: 1
Add of existing embedding ID: 1


['Detailed Summary of Tax Filing Process for Freelancers in Pakistan  This guide details the steps involved in filing taxes in Pakistan as a freelancer using the Iris portal. It highlights the importance of PSEB registration for freelancers in the IT sector and offers alternative methods to locate previously withheld taxes . ',
 'Steps:  1. Access and Login:  Begin by accessing and logging in to the Iris portal, the online tax filing system for Pakistan.  2. Select Tax Return and Period:  Navigate to the section for filing tax returns and choose "normal return." Here, you\'ll also specify the tax year applicable to your filing.  3. Declare Freelancer Income:  Locate the section dedicated to "foreign sources." While it initially mentions "Agriculture," freelancers should select "foreign sources" within this section. Under "Foreign Other Sources Income/Loss," enter your total annual income from freelancing platf orms like Upwork.  4. Calculate Tax Based on PSEB Registration:  This step i

In [None]:
def get_relevant_passage(query, db, n_results):
  passage = db.query(query_texts=[query], n_results=n_results)['documents'][0]
  return passage

#Example usage
relevant_text = get_relevant_passage(query="i am a freelancer on upwork earning 10k dollars annually, how much tax would i pay annually? and what are the steps to file tax in pakistan",db=db,n_results=5)
relevant_text

Number of requested results 5 is greater than number of elements in index 4, updating n_results = 4


['By following this comprehensive guide, freelancers and IT professionals can navigate the intricacies of tax filing with confidence and accuracy, ensuring compliance with all relevant regulations and minimizing the risk of errors or discrepancies.   ',
 "Tax filing can be a complex process, especially for freelancers and IT professionals. To ensure accuracy  and compliance, let's delve into the step -by-step procedure within the Iris portal. Firstly, upon accessing  the portal, locate the Declaration Tab. Here, users must select 'Return/Statement' and enter the specific tax period year applicable to their situation. For freelancers, the next crucial step involves navigating to the 'Foreign Sources' section. Within thi s subsection, declare all relevant income under 'Foreign Other Sources Income/Loss.' It's imperative to differentiate between tax rates based on PSEB (Pakistan Software Export Board) registration status. Registered freelancers enjoy a reduced tax rate of 0 .25%, while un

In [None]:
def make_rag_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  prompt = ("""
  PASSAGE: '{relevant_passage}'
  Above is the paragraph about tax for freelancers. User will possibly ask you to check how much tax he or she has to pay, calculate it by getting percentage from passage and make it possible to tell the user tax amount. Person can also seek guidance regarding filling filer form
  QUESTION: '{query}'

  ANSWER:
  """).format(query=query, relevant_passage=escaped)

  return prompt

In [None]:
import google.generativeai as genai
def generate_answer(prompt):
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel('gemini-1.0-pro-latest')
    answer = model.generate_content(prompt)
    return answer.text

In [None]:
def generate_answers(db,query):
    #retrieve top 3 relevant text chunks
    relevant_text = get_relevant_passage(query,db,n_results=2)
    print(relevant_text)
    prompt = make_rag_prompt(query, 
                             relevant_passage="".join(relevant_text)) # joining the relevant chunks to create a single passage
    answer = generate_answer(prompt)

    return answer

In [None]:
db=load_chroma_collection(path="./", #replace with path of your persistent directory
                          name="rag_experiment2") #replace with the collection name
query = "i am a freelancer on upwork earning 10,000 dollars annually, how much tax would i pay annually if im not registered with PSEB? and what are the steps to file tax in pakistan"
answer = generate_answers(db,query=query)
print(answer)

["Tax filing can be a complex process, especially for freelancers and IT professionals. To ensure accuracy  and compliance, let's delve into the step -by-step procedure within the Iris portal. Firstly, upon accessing  the portal, locate the Declaration Tab. Here, users must select 'Return/Statement' and enter the specific tax period year applicable to their situation. For freelancers, the next crucial step involves navigating to the 'Foreign Sources' section. Within thi s subsection, declare all relevant income under 'Foreign Other Sources Income/Loss.' It's imperative to differentiate between tax rates based on PSEB (Pakistan Software Export Board) registration status. Registered freelancers enjoy a reduced tax rate of 0 .25%, while unregistered individuals face a standard rate of 1%, effective from 2024 to 2026.  ", 'By following this comprehensive guide, freelancers and IT professionals can navigate the intricacies of tax filing with confidence and accuracy, ensuring compliance with