In [284]:
#!pip install pillow pdf2image python-docx numpy



In [1144]:
import os
import uuid
from PIL import Image
from pdf2image import convert_from_path
from docx import Document
import numpy as np

def file_to_images(file_path, output_dir="output_images"):
    """
    Convert any supported file into images and save them to a directory.

    Args:
        file_path (str): Path to the input file.
        output_dir (str): Directory to save the output images.
    
    Returns:
        list: List of image file paths.
    """
    # Ensure output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    file_extension = file_path.split('.')[-1].lower()
    output_files = []

    try:
        if file_extension == "pdf":
            # Convert PDF to images
            images = convert_from_path(file_path, dpi=300)
            for i, img in enumerate(images):
                uid = str(uuid.uuid4()).split("-")[-1]
                img_path = os.path.join(output_dir, f"page_{uid}.jpg")
                img.save(img_path, "JPEG")
                output_files.append(img_path)
        
        elif file_extension in ["doc", "docx"]:
            # Convert Word document to images
            doc = Document(file_path)
            text_content = "\n".join([p.text for p in doc.paragraphs])
            
            # Create an image for each page of text (approximation)
            img = Image.new("RGB", (800, 600), "white")
            from PIL import ImageDraw
            draw = ImageDraw.Draw(img)
            draw.multiline_text((10, 10), text_content, fill="black")
            img_path = os.path.join(output_dir, "document.jpg")
            img.save(img_path, "JPEG")
            output_files.append(img_path)
        
        elif file_extension in ["jpg", "jpeg", "png", "bmp", "tiff"]:
            # Handle image formats
            img = Image.open(file_path)
            img_path = os.path.join(output_dir, os.path.basename(file_path))
            img.save(img_path)  # Save the preprocessed image
            output_files.append(img_path)
        
        else:
            raise ValueError("Unsupported file type. Only PDF, DOC, DOCX, and image files are supported.")
    
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
    
    return output_files



In [1145]:
input_directory = "input_dir"
extensions = (".pdf", ".docx", ".txt", "jpg", "jpeg", "png", "bmp", "tiff")
def load_files(input_directory,domain_name):
    domain_specific_images = {}
    domain_specific_images[domain_name]={}
    for file in os.listdir(input_directory):
        print(file,"file going to load....")
        if file.endswith(extensions):
            file_path = os.path.join(input_directory, file)
            img=file_to_images(file_path)
            domain_specific_images[domain_name][file] = img
    return domain_specific_images        

In [1146]:
import warnings
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
domain_specific_images=load_files(input_directory,"finance")

lunch1.pdf file going to load....


ola2.png file going to load....
lunch2.pdf file going to load....
ola1.png file going to load....


In [1147]:
domain_specific_images

{'finance': {'lunch1.pdf': ['output_images/page_dd9185383e19.jpg'],
  'ola2.png': ['output_images/ola2.png'],
  'lunch2.pdf': ['output_images/page_ddb59e5e5ab7.jpg'],
  'ola1.png': ['output_images/ola1.png']}}

In [1148]:
from groq import Groq
import base64
import time

client = Groq()


# Function to encode the image
def encode_image(image_path):
  with open(image_path, "rb") as image_file:
    return base64.b64encode(image_file.read()).decode('utf-8')





def perform_OCR(domain_specific_images):
    text_from_images = {}
    text_from_images["metadata"]={}
    text_from_images["images"]={}
    for domain, files in domain_specific_images.items():
        for file in files.keys():
            print(file)
            for image in domain_specific_images[domain][file]:
                print(image)
                base64_image = encode_image(image)
                chat_completion = client.chat.completions.create(
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "You are an OCR extraction tool."
                            },
                            {
                                "type": "text",
                                "text": "Extract every character, word, and symbol exactly as it appears in the image."
                            },
                            {
                                "type": "text",
                                "text": "Do not refer to any historical memory or prior conversations—this is an independent chat."
                            },
                            {
                                "type": "text",
                                "text": "Do not repeat any values unless they appear multiple times in the image."
                            },
                            {
                                "type": "text",
                                "text": "Retain the original structure, including line breaks, paragraphs, tables, headers, footers, and any other formatting."
                            },
                            {
                                "type": "text",
                                "text": "Avoid any modifications, additions, or interpretations."
                            },
                            {
                                "type": "text",
                                "text": "Provide only the raw, unaltered text as shown, without repeating or omitting any information unless explicitly required by the image."
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}",
                                },
                            },
                        ],
                    }
                ],

                model="llama-3.2-11b-vision-preview",
                temperature=0,  # Set temperature to 0 for more deterministic responses
                top_p=1,
                stream=False,
                stop=None)


                print("image description -----------------> ",chat_completion.choices[0].message.content)
                time.sleep(50)
                text_from_images["metadata"]["domain"] = domain
                text_from_images["images"][image] = chat_completion.choices[0].message.content
                  
    return text_from_images           



In [1149]:
text_from_img=perform_OCR(domain_specific_images)

lunch1.pdf
output_images/page_dd9185383e19.jpg
image description ----------------->  The image shows a screenshot of an order confirmation page on a mobile device, with the text "Order Details" at the top. The page is divided into sections, including "Your Order", "Grand Total", and "Order Details".

*   **Order Details**
    *   A.K. Juice Center
        *   Nariman Point, Mumbai
    *   Download invoice
    *   Download summary
*   **This order was delivered**
    *   This order was delivered
*   **Your Order**
    *   Chikoo Milkshake
        *   Quantity: Full
        *   Item total: ₹140
        *   Taxes: ₹0.08
        *   Delivery charge: ₹0.08
        *   Donate ₹5 to Feeding India: ₹5.00
        *   Platform fee: ₹0.00
    *   Grand Total: ₹197.08
    *   Your total savings: ₹15
*   **Order Details**
    *   ORDER NUMBER: 6113265046
    *   PAYMENT: Paid: Using Upi (₹197.08)
    *   DATE: August 22, 2024 at 01:05 PM
    *   DELIVERY TO: Room number: 701, Envision hotel, arthus

In [1150]:
###clean the documents/page content if required

"""Description 
    Clean the input text by removing unnecessary whitespace, non-alphanumeric characters, 
    and trimming leading/trailing spaces.

    This function performs the following operations:
    1. Replaces multiple consecutive whitespace characters (spaces, newlines, tabs) 
       with a single space.
    2. Removes all non-alphanumeric characters, retaining only letters (a-z, A-Z), 
       numbers (0-9), and spaces.
    3. Strips leading and trailing spaces from the text.
"""

import re  # Import the regular expression module for text processing



def clean_text(text):
    # Remove extra whitespaces, newlines, and tabs by replacing all whitespace characters with a single space
    text = re.sub(r'\s+', ' ', text)
    
    # Define a regex pattern to match non-alphanumeric characters, but exclude valid symbols in numbers
    # This will match any character except letters, digits, spaces, periods, commas, slashes, dashes, and hashes
    text = re.sub(r'[^a-zA-Z0-9\s.,/#-]', '', text)
    
    # Remove leading and trailing spaces from the text
    text = text.strip()
    
    return text




# Clean the text from the extracted images
def extract_clean_text(text_from_images):
    for key,value in text_from_images["images"].items():
        # Clean the text
        text_from_images["images"][key] = clean_text(value)
    return text_from_images    
    

In [1151]:
text_from_images=extract_clean_text(text_from_img)
text_from_images

{'metadata': {'domain': 'finance'},
 'images': {'output_images/page_dd9185383e19.jpg': 'The image shows a screenshot of an order confirmation page on a mobile device, with the text Order Details at the top. The page is divided into sections, including Your Order, Grand Total, and Order Details.  Order Details  A.K. Juice Center  Nariman Point, Mumbai  Download invoice  Download summary  This order was delivered  This order was delivered  Your Order  Chikoo Milkshake  Quantity Full  Item total 140  Taxes 0.08  Delivery charge 0.08  Donate 5 to Feeding India 5.00  Platform fee 0.00  Grand Total 197.08  Your total savings 15  Order Details  ORDER NUMBER 6113265046  PAYMENT Paid Using Upi 197.08  DATE August 22, 2024 at 0105 PM  DELIVERY TO Room number 701, Envision hotel, arthus pondar road, near the metro station, pondar road, Mumbai, Maharashtra, India  DELIVERED TO Room number 701, Envision hotel, arthus pondar road, near the metro station, pondar road, Mumbai, Maharashtra, India The o

In [1152]:
import chromadb
from sentence_transformers import SentenceTransformer
import os


# Initialize Chroma client
client = chromadb.Client()

# Set up or create a collection in Chroma
collection_name = "image_text_extraction"
def load_collection(collection_name):
   collection = client.create_collection(collection_name) if collection_name not in [col.name for col in client.list_collections()] else client.get_collection(collection_name)
   return collection


In [1153]:
collection=load_collection("image_text_extraction")

In [1154]:
# Initialize the model to generate text embeddings
model_embd = SentenceTransformer("all-MiniLM-L6-v2")

In [1155]:
#Generate embeddings for all descriptions

import uuid
def Generate_embeddings_and_save_to_chromadb(model_embd,descriptions):
    #Insert data into the Chroma collection
    for key,value in descriptions["metadata"].items():
        print(key,value)
        print(descriptions["images"].keys())
        for k,v in descriptions["images"].items():
            #print("generated description---------->",k,v)
            embedding = model_embd.encode(v)
            #print("embedding_generated with length",len(embedding))
            collection.add(
                documents=[v],
                metadatas=[{key:value,"image_path":k}],
                embeddings=embedding,
                ids=[str(uuid.uuid4())]  # Use unique IDs for each chunk
            )
    return 1        

        

In [1156]:
Generate_embeddings_and_save_to_chromadb(model_embd,text_from_images)

domain finance
dict_keys(['output_images/page_dd9185383e19.jpg', 'output_images/ola2.png', 'output_images/page_ddb59e5e5ab7.jpg', 'output_images/ola1.png'])


1

In [1157]:
results = collection.get(
    where={
        "domain": {"$eq": "finance"}
    },
    include=["documents", "metadatas", "embeddings"]
)

In [1158]:
results

{'ids': ['0273cc88-ad6f-4d5a-b6cd-3bde43004e60',
  'c89d0a95-9a3d-401e-9262-7e935eed5b83',
  'b1235202-563e-47fc-b7a0-0f064688eec3',
  '1ee082db-8fe9-4330-83f8-e88a72cdfb37'],
 'embeddings': array([[-0.02682881,  0.009589  ,  0.03567362, ..., -0.0473114 ,
          0.03733041, -0.05703086],
        [ 0.00481167,  0.03125731, -0.00662304, ..., -0.01133677,
         -0.02686139, -0.04141881],
        [-0.01502887,  0.0667983 , -0.00836894, ..., -0.08391263,
          0.00020859,  0.02649617],
        [ 0.01140962,  0.03289916, -0.0247742 , ..., -0.03168653,
          0.00910755, -0.07547219]]),
 'documents': ['The image shows a screenshot of an order confirmation page on a mobile device, with the text Order Details at the top. The page is divided into sections, including Your Order, Grand Total, and Order Details.  Order Details  A.K. Juice Center  Nariman Point, Mumbai  Download invoice  Download summary  This order was delivered  This order was delivered  Your Order  Chikoo Milkshake  

In [1249]:
query="how much amonut i spent on travelling expense"

In [1250]:
def retrieve_documents(query,results,number_of_docs,domain):
    
    query_embedding = model_embd.encode(query)  
 
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=number_of_docs,
        where={"domain":domain}
                
    )
    
    return results  

In [1251]:
retrieved_data=retrieve_documents(query,results,2,"finance")
retrieved_data

{'ids': [['1ee082db-8fe9-4330-83f8-e88a72cdfb37',
   'c89d0a95-9a3d-401e-9262-7e935eed5b83']],
 'embeddings': None,
 'documents': [['The image shows a screenshot of an online payment page for a medical consultation with Dr. Munnalal Mishra. The page is divided into sections, including Ride Details, Bill Details, and Payment. Ride Details  Ride Details  MUNNALAL MISHRA  2.5 km  13 min  Prime SUV  Prime SUVWhite Ertiga  0639 PM  229, Ramnath Goenka Marg, Nariman Point, Mumbai, Maharashtra 400021, India  0703 PM  Bomy Hospital Neurology Centre, Bombay Hospital, 12, New Marine Lines, Marine Lines, Mumbai, Maharashtra, 400020, India Bill Details  Your Trip  Total Bill rounded  Includes 39.4 Taxes  Health Pack fee  Total Payable  Have queries Visit support for this ride.  Weve fulfilled our promise to take you to destination for pre-agreed Total Fare. Modifying the drop/route can change this fare. Payment  Paid by Cash  Paid by Cash The total bill for the ride is 558, which includes 39.4 tax

In [1252]:
def combine_documents(retrieved_data):
    combined_data=" ".join(retrieved_data["documents"][0])
    return combined_data    

In [1253]:
combined_documents=combine_documents(retrieved_data)
combined_documents

'The image shows a screenshot of an online payment page for a medical consultation with Dr. Munnalal Mishra. The page is divided into sections, including Ride Details, Bill Details, and Payment. Ride Details  Ride Details  MUNNALAL MISHRA  2.5 km  13 min  Prime SUV  Prime SUVWhite Ertiga  0639 PM  229, Ramnath Goenka Marg, Nariman Point, Mumbai, Maharashtra 400021, India  0703 PM  Bomy Hospital Neurology Centre, Bombay Hospital, 12, New Marine Lines, Marine Lines, Mumbai, Maharashtra, 400020, India Bill Details  Your Trip  Total Bill rounded  Includes 39.4 Taxes  Health Pack fee  Total Payable  Have queries Visit support for this ride.  Weve fulfilled our promise to take you to destination for pre-agreed Total Fare. Modifying the drop/route can change this fare. Payment  Paid by Cash  Paid by Cash The total bill for the ride is 558, which includes 39.4 taxes and a health pack fee of 8. The total payable amount is 566. The payment method is cash. The image shows a screenshot of a ride-s

In [1254]:
# from groq import Groq

# client = Groq()

# def generate_answer_based_on_query(client, model, user_input_text, user_query):
#     """
#     Function to process the input text and return an answer based on the user's query.
    
#     Parameters:
#     - client: The Groq client for making API requests.
#     - model: The model name to be used for generating the response.
#     - user_input_text: The user's input text (e.g., report, document).
#     - user_query: The query the user wants answered based on the input text.
    
#     Returns:
#     - response_text: The answer to the query generated by the model.
#     """
#     # Formulate the complete query by combining user input and query
#     full_query = f"Based on the following text, please answer the question: \n\n{user_input_text}\n\nQuestion: {user_query}"
    
#     # Create chat completion request using Groq client
    
#     chat_completion = client.chat.completions.create(
#             messages = [
#                 {
#                     "role": "system",
#                     "content": (
#                         "You are an assistant trained to read and understand text carefully, word by word. "
#                         "Analyze the provided text thoroughly and ensure that you answer the query with full accuracy. "
#                         "Do not skip or overlook any details in the text. "
#                         "Provide a complete answer based on the entire context provided in the input, and ensure all parts of the query are addressed. "
#                         "Do not add extra context, summaries, or elaborations—just provide the most accurate and precise answer based on the text and the user's query. "
#                         "Remember, do not refer to prior conversations or history; only the current input matters."
#                         f"{user_query}"
#                     )
#                 },
#                 {
#                     "role": "user",
#                     "content": full_query,  # Only the current query text
#                 }
#             ],
#     model=model,
#     temperature=0,
#     top_p=1,
#     stream=False,max_tokens=1024)

    
#     # Retrieve the response generated by the model
#     response_text = chat_completion.choices[0].message.content
    
#     return response_text


# # Example Usage
# # user_input_text = """
# # The financial report for Q4 shows an increase in revenue by 15% year-over-year, but also highlights some issues with expenses.
# # The company is planning to reduce operational costs to counterbalance rising expenses in the upcoming quarter.
# # """
# # user_query = "What is the revenue increase in Q4?"

# # model = "llama3-8b-8192"
# # final_answer = generate_answer_based_on_query(client, model, user_input_text, user_query)

# # print("Answer to the query:", final_answer)


In [1282]:
from groq import Groq

client = Groq()

def split_text(text, max_chunk_size):
    """
    Split text into smaller chunks based on a maximum character size.
    """
    return [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]

def summarize_text(client, model, text,user_query):
    """
    Summarizes a text chunk using the model.
    """
    summary_response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": f"You are a summarization assistant. Read the provided text word by word with careful attention to detail. Summarize the key points without losing important details, and ensure that the summary includes a direct answer to the following query: {user_query}"
            },
            {
                "role": "user",
                "content": text
            }
        ],

        model=model,
        temperature=0.3,
        max_tokens=512  # Adjust based on the desired summary length
    )
    return summary_response.choices[0].message.content

def generate_answer_based_on_query(client, model, user_input_text, user_query):
    """
    Process large input text by summarizing chunks if necessary, then answer the query.
    """
    # Define the maximum chunk size to avoid exceeding model token limits (approximate).
    max_chunk_size = 5000  # Adjust based on your model's input capacity.

    # If the text is too large, split it into chunks and summarize each chunk.
    print(len(user_input_text))
    if len(user_input_text) > max_chunk_size:
        print("hdsgggggggggggggggggggg")
        chunks = split_text(user_input_text, max_chunk_size)
        summaries = [summarize_text(client, model, chunk,user_query) for chunk in chunks]
        processed_text = " ".join(summaries)  # Combine summaries
    else:
        processed_text = user_input_text  # Use the original text if it's within the limit.

    print(processed_text,"texxxxxxxxxxxxxxxxxxxxxt")
    print("texttttttttttttttttt")
    # Formulate the full query using the summarized or original text.
    full_query = f"Based on the following text, please answer the question:\n\n{processed_text}\n\nQuestion: {user_query}"

    # Generate an answer using the combined summaries and user query.
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a focused assistant. Answer the query based solely on the provided text, without additional context or assumptions."
            },
            {
                "role": "user",
                "content": full_query,
            }
        ],
        model=model,
        temperature=0.5,
        max_tokens=1024
    )

    # Retrieve the response generated by the model
    response_text = chat_completion.choices[0].message.content
    
    return response_text

# Example usage
# user_input_text = """
# The financial report for Q4 shows an increase in revenue by 15% year-over-year, but also highlights some issues with expenses.
# The company is planning to reduce operational costs to counterbalance rising expenses in the upcoming quarter.
# """
# user_query = "What is the revenue increase in Q4?"

# model = "llama3-8b-8192"
# final_answer = generate_answer_based_on_query(client, model, user_input_text, user_query)

# print("Answer to the query:", final_answer)


In [1290]:

#retrieved_data=results['documents']
def generate_answer(client, model,retrieved_data,Query):

    #combined_answer=" ".join(retrieved_data) 

    final_result=generate_answer_based_on_query(client, model, retrieved_data, Query)

    print(f"final output is : {final_result}")

    return final_result  #,answer_chunks

In [1291]:
print(combined_documents)

The image shows a screenshot of an online payment page for a medical consultation with Dr. Munnalal Mishra. The page is divided into sections, including Ride Details, Bill Details, and Payment. Ride Details  Ride Details  MUNNALAL MISHRA  2.5 km  13 min  Prime SUV  Prime SUVWhite Ertiga  0639 PM  229, Ramnath Goenka Marg, Nariman Point, Mumbai, Maharashtra 400021, India  0703 PM  Bomy Hospital Neurology Centre, Bombay Hospital, 12, New Marine Lines, Marine Lines, Mumbai, Maharashtra, 400020, India Bill Details  Your Trip  Total Bill rounded  Includes 39.4 Taxes  Health Pack fee  Total Payable  Have queries Visit support for this ride.  Weve fulfilled our promise to take you to destination for pre-agreed Total Fare. Modifying the drop/route can change this fare. Payment  Paid by Cash  Paid by Cash The total bill for the ride is 558, which includes 39.4 taxes and a health pack fee of 8. The total payable amount is 566. The payment method is cash. The image shows a screenshot of a ride-sh

In [1292]:
len(combined_documents)

2170

In [1293]:
client = Groq()
model = "llama3-70b-8192"
Query=query
Answer=generate_answer(client, model,combined_documents,Query)


2170
The image shows a screenshot of an online payment page for a medical consultation with Dr. Munnalal Mishra. The page is divided into sections, including Ride Details, Bill Details, and Payment. Ride Details  Ride Details  MUNNALAL MISHRA  2.5 km  13 min  Prime SUV  Prime SUVWhite Ertiga  0639 PM  229, Ramnath Goenka Marg, Nariman Point, Mumbai, Maharashtra 400021, India  0703 PM  Bomy Hospital Neurology Centre, Bombay Hospital, 12, New Marine Lines, Marine Lines, Mumbai, Maharashtra, 400020, India Bill Details  Your Trip  Total Bill rounded  Includes 39.4 Taxes  Health Pack fee  Total Payable  Have queries Visit support for this ride.  Weve fulfilled our promise to take you to destination for pre-agreed Total Fare. Modifying the drop/route can change this fare. Payment  Paid by Cash  Paid by Cash The total bill for the ride is 558, which includes 39.4 taxes and a health pack fee of 8. The total payable amount is 566. The payment method is cash. The image shows a screenshot of a ri

In [15]:
import os
from groq import Groq
import chromadb
from sentence_transformers import SentenceTransformer
import base64
import time
import uuid
from PIL import Image
from pdf2image import convert_from_path
from docx import Document
import numpy as np
import re 

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["GROQ_API_KEY"] = "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"






class OCRWithRAG:
    def __init__(self,domain_name,input_directory,db_collection_name):
        self.domain_name=domain_name
        self.input_directory=input_directory
        self.db_client = chromadb.Client()
        self.groq_client = Groq()
        # Initialize the model to generate text embeddings
        self.model_embd = SentenceTransformer("all-MiniLM-L6-v2")
        self.collection=self.create_collection(db_collection_name)

    def create_collection(self,collection_name):
        collection = self.db_client.create_collection(collection_name) if collection_name not in [col.name for col in self.db_client.list_collections()] else self.db_client.get_collection(collection_name)
        return collection
    
    def file_to_images(self,file_path, output_dir="output_images"):
        """
        Convert any supported file into images and save them to a directory.

        Args:
            file_path (str): Path to the input file.
            output_dir (str): Directory to save the output images.
        
        Returns:
            list: List of image file paths.
        """
        # Ensure output directory exists
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        file_extension = file_path.split('.')[-1].lower()
        output_files = []

        try:
            if file_extension == "pdf":
                # Convert PDF to images
                images = convert_from_path(file_path, dpi=300)
                for i, img in enumerate(images):
                    uid = str(uuid.uuid4()).split("-")[-1]
                    img_path = os.path.join(output_dir, f"page_{uid}.jpg")
                    img.save(img_path, "JPEG")
                    output_files.append(img_path)
            
            elif file_extension in ["doc", "docx"]:
                # Convert Word document to images
                doc = Document(file_path)
                text_content = "\n".join([p.text for p in doc.paragraphs])
                
                # Create an image for each page of text (approximation)
                img = Image.new("RGB", (800, 600), "white")
                from PIL import ImageDraw
                draw = ImageDraw.Draw(img)
                draw.multiline_text((10, 10), text_content, fill="black")
                img_path = os.path.join(output_dir, "document.jpg")
                img.save(img_path, "JPEG")
                output_files.append(img_path)
            
            elif file_extension in ["jpg", "jpeg", "png", "bmp", "tiff"]:
                # Handle image formats
                img = Image.open(file_path)
                img_path = os.path.join(output_dir, os.path.basename(file_path))
                img.save(img_path)  # Save the preprocessed image
                output_files.append(img_path)
            
            else:
                raise ValueError("Unsupported file type. Only PDF, DOC, DOCX, and image files are supported.")
        
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
        
        return output_files


    
    def load_files(self):
        extensions = (".pdf", ".docx", ".txt", "jpg", "jpeg", "png", "bmp", "tiff")
        domain_specific_images = {}
        domain_specific_images[self.domain_name]={}
        for file in os.listdir(self.input_directory):
            print(file,"file going to load....")
            if file.endswith(extensions):
                file_path = os.path.join(input_directory, file)
                img=self.file_to_images(file_path)
                domain_specific_images[self.domain_name][file] = img
        return domain_specific_images  
    
    # Function to encode the image
    def encode_image(slef,image_path):
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def perform_OCR(self,domain_specific_images):
        text_from_images = {}
        text_from_images["metadata"]={}
        text_from_images["images"]={}
        for domain, files in domain_specific_images.items():
            for file in files.keys():
                print(file)
                for image in domain_specific_images[domain][file]:
                    print(image)
                    base64_image = self.encode_image(image)
                    chat_completion = self.groq_client.chat.completions.create(
                    messages = [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": "You are an OCR extraction tool."
                                },
                                {
                                    "type": "text",
                                    "text": "Extract every character, word, and symbol exactly as it appears in the image."
                                },
                                {
                                    "type": "text",
                                    "text": "Do not refer to any historical memory or prior conversations—this is an independent chat."
                                },
                                {
                                    "type": "text",
                                    "text": "Do not repeat any values unless they appear multiple times in the image."
                                },
                                {
                                    "type": "text",
                                    "text": "Retain the original structure, including line breaks, paragraphs, tables, headers, footers, and any other formatting."
                                },
                                {
                                    "type": "text",
                                    "text": "Avoid any modifications, additions, or interpretations."
                                },
                                {
                                    "type": "text",
                                    "text": "Provide only the raw, unaltered text as shown, without repeating or omitting any information unless explicitly required by the image."
                                },
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/jpeg;base64,{base64_image}",
                                    },
                                },
                            ],
                        }
                    ],

                    model="llama-3.2-11b-vision-preview",
                    temperature=0,  # Set temperature to 0 for more deterministic responses
                    top_p=1,
                    stream=False,
                    stop=None)
                    print("image description -----------------> ",chat_completion.choices[0].message.content)
                    time.sleep(50)
                    text_from_images["metadata"]["domain"] = domain
                    text_from_images["images"][image] = chat_completion.choices[0].message.content
        
        text_from_images_out=self.extract_clean_text(text_from_images)            
                    
        return text_from_images_out  

    def clean_text(self,text):
        # Remove extra whitespaces, newlines, and tabs by replacing all whitespace characters with a single space
        text = re.sub(r'\s+', ' ', text)
        
        # Define a regex pattern to match non-alphanumeric characters, but exclude valid symbols in numbers
        # This will match any character except letters, digits, spaces, periods, commas, slashes, dashes, and hashes
        text = re.sub(r'[^a-zA-Z0-9\s.,/#-]', '', text)
        
        # Remove leading and trailing spaces from the text
        text = text.strip()
        
        return text




    # Clean the text from the extracted images
    def extract_clean_text(self,text_from_images):
        for key,value in text_from_images["images"].items():
            # Clean the text
            text_from_images["images"][key] = self.clean_text(value)
        return text_from_images



    

    def Generate_embeddings_and_save_to_chromadb(self,descriptions):
        #Insert data into the Chroma collection
        for key,value in descriptions["metadata"].items():
            print(key,value)
            print(descriptions["images"].keys())
            for k,v in descriptions["images"].items():
                #print("generated description---------->",k,v)
                embedding = self.model_embd.encode(v)
                #print("embedding_generated with length",len(embedding))
                self.collection.add(
                    documents=[v],
                    metadatas=[{key:value,"image_path":k}],
                    embeddings=embedding,
                    ids=[str(uuid.uuid4())]  # Use unique IDs for each chunk
                )
        return 1 

   

                
        
        


In [10]:
input_directory="./input_dir"
domain="finance"
collection_name="finance"
RAGwithOCR=OCRWithRAG(domain,input_directory,collection_name)

In [13]:
load_files=RAGwithOCR.load_files()
print("loaded files:",load_files)

lunch1.pdf file going to load....
ola2.png file going to load....
lunch2.pdf file going to load....
ola1.png file going to load....
loaded files: {'finance': {'lunch1.pdf': ['output_images/page_6ce755919e3e.jpg'], 'ola2.png': ['output_images/ola2.png'], 'lunch2.pdf': ['output_images/page_cdc6e5fbde22.jpg'], 'ola1.png': ['output_images/ola1.png']}}


In [12]:
load_files

{'finance': {'lunch1.pdf': ['output_images/page_3d2b96b81439.jpg'],
  'ola2.png': ['output_images/ola2.png'],
  'lunch2.pdf': ['output_images/page_da389ea749d4.jpg'],
  'ola1.png': ['output_images/ola1.png']}}

In [16]:
ocr_inference=RAGwithOCR.perform_OCR(load_files)

lunch1.pdf
output_images/page_6ce755919e3e.jpg


image description ----------------->  The image shows a screenshot of an order confirmation page on a mobile device, with the text "Order Details" at the top. The page is divided into sections, including "Your Order", "Grand Total", and "Order Details".

*   **Order Details**
    *   A.K. Juice Center
        *   Nariman Point, Mumbai
    *   Download invoice
    *   Download summary
*   **This order was delivered**
    *   This order was delivered
*   **Your Order**
    *   Chikoo Milkshake
        *   Quantity: Full
        *   Item total: ₹140
        *   Taxes: ₹0.08
        *   Delivery charge: ₹0.08
        *   Donate ₹5 to Feeding India: ₹5.00
        *   Platform fee: ₹0.00
    *   Grand Total: ₹197.08
    *   Your total savings: ₹15
*   **Order Details**
    *   ORDER NUMBER: 6113265046
    *   PAYMENT: Paid: Using Upi (₹197.08)
    *   DATE: August 22, 2024 at 01:05 PM
    *   DELIVERY TO: Room number: 701, Envision hotel, arthus pondar road, near the metro station, pondar ro

In [17]:
ocr_inference

{'metadata': {'domain': 'finance'},
 'images': {'output_images/page_6ce755919e3e.jpg': 'The image shows a screenshot of an order confirmation page on a mobile device, with the text Order Details at the top. The page is divided into sections, including Your Order, Grand Total, and Order Details.  Order Details  A.K. Juice Center  Nariman Point, Mumbai  Download invoice  Download summary  This order was delivered  This order was delivered  Your Order  Chikoo Milkshake  Quantity Full  Item total 140  Taxes 0.08  Delivery charge 0.08  Donate 5 to Feeding India 5.00  Platform fee 0.00  Grand Total 197.08  Your total savings 15  Order Details  ORDER NUMBER 6113265046  PAYMENT Paid Using Upi 197.08  DATE August 22, 2024 at 0105 PM  DELIVERY TO Room number 701, Envision hotel, arthus pondar road, near the metro station, pondar road, Mumbai, Maharashtra, India  DELIVERED TO Room number 701, Envision hotel, arthus pondar road, near the metro station, pondar road, Mumbai, Maharashtra, India The o

In [23]:
create_and_save_embeddings=RAGwithOCR.Generate_embeddings_and_save_to_chromadb(ocr_inference)

domain finance
dict_keys(['output_images/page_6ce755919e3e.jpg', 'output_images/ola2.png', 'output_images/page_cdc6e5fbde22.jpg', 'output_images/ola1.png'])


In [36]:
class QAwithOCRRAG:
    def __init__(self,db_collection_name):
        self.db_client=chromadb.Client()
        self.llm_model="llama3-70b-8192"
        self.groq=Groq()
        self.collection = self.load_collection(db_collection_name)
        self.model_embd = SentenceTransformer("all-MiniLM-L6-v2")

    def load_collection(self,collection_name):
        collection = self.db_client.create_collection(collection_name) if collection_name not in [col.name for col in self.db_client.list_collections()] else self.db_client.get_collection(collection_name)
        return collection
        


    def retrieve_documents(self,query,number_of_docs,domain):
        query_embedding = self.model_embd.encode(query)  
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=number_of_docs,
            where={"domain":domain}
                    
        )
        
        return results

    def combine_documents(self,retrieved_data):
        combined_data=" ".join(retrieved_data["documents"][0])
        return combined_data


    def split_text(self,text, max_chunk_size):
        """
        Split text into smaller chunks based on a maximum character size.
        """
        return [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]

    def summarize_text(self,text,user_query):
        """
        Summarizes a text chunk using the model.
        """
        summary_response = self.groq.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": f"You are a summarization assistant. Read the provided text word by word with careful attention to detail. Summarize the key points without losing important details, and ensure that the summary includes a direct answer to the following query: {user_query}"
                },
                {
                    "role": "user",
                    "content": text
                }
            ],

            model=self.llm_model,
            temperature=0.3,
            max_tokens=512  # Adjust based on the desired summary length
        )
        return summary_response.choices[0].message.content

    def generate_answer_based_on_query(self,user_input_text, user_query):
        """
        Process large input text by summarizing chunks if necessary, then answer the query.
        """
        # Define the maximum chunk size to avoid exceeding model token limits (approximate).
        max_chunk_size = 5000  # Adjust based on your model's input capacity.

        # If the text is too large, split it into chunks and summarize each chunk.
        if len(user_input_text) > max_chunk_size:
            chunks = split_text(user_input_text, max_chunk_size)
            summaries = [self.summarize_text(client, model, chunk,user_query) for chunk in chunks]
            processed_text = " ".join(summaries)  # Combine summaries
        else:
            processed_text = user_input_text  # Use the original text if it's within the limit.

       
        # Formulate the full query using the summarized or original text.
        full_query = f"Based on the following text, please answer the question:\n\n{processed_text}\n\nQuestion: {user_query}"

        # Generate an answer using the combined summaries and user query.
        chat_completion = self.groq.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a focused assistant. Answer the query based solely on the provided text, without additional context or assumptions."
                },
                {
                    "role": "user",
                    "content": full_query,
                }
            ],
            model=self.llm_model,
            temperature=0.5,
            max_tokens=1024
        )

        # Retrieve the response generated by the model
        response_text = chat_completion.choices[0].message.content

        return response_text
    

    #retrieved_data=results['documents']
    def generate_answer(self,retrieved_data,Query):

        #combined_answer=" ".join(retrieved_data) 

        final_result=self.generate_answer_based_on_query(retrieved_data, Query)

        print(f"final output is : {final_result}")

        return final_result

                
        
        


In [37]:
collection_name = "image_text_extraction"
QAOCR=QAwithOCRRAG(collection_name)

In [None]:
query="How much total amount i spend on food orders"

In [42]:
domain="finance"
number_of_docs=2
query="How much total amount i spend on food orders"
retrieve_documents=QAOCR.retrieve_documents(query,number_of_docs,domain)

In [43]:
retrieve_documents

{'ids': [['d7b94b2e-dfbb-4f4e-80c2-622bb232eea0',
   'e05d0bb1-583f-4759-88e7-30d33a57e521']],
 'embeddings': None,
 'documents': [['The image shows a screenshot of an order confirmation page from a food delivery app, with the text Order Details at the top. The page displays the order details, including the order number, payment method, and delivery address. Here is the raw, unaltered text from the image Order Details  Order Number 6111550384  Payment Method Paid - Using Upi 542.03  Delivery Address Charcoal Eats - Biryani  Beyond, Kemps Corner, Mumbai Order Summary  Item Paneer Makhani Biryani Serves 1 New  Quantity 1  Price 399  Total 399 Grand Total 542.03 Order Details  Order Number 6111550384  Payment Method Paid - Using Upi 542.03  Delivery Address Charcoal Eats - Biryani  Beyond, Kemps Corner, Mumbai Order Summary  Item Paneer Makhani Biryani Serves 1 New  Quantity 1  Price 399  Total 399 Grand Total 542.03 Repeat Order This is the only text present in the image.',
   'The image

In [44]:
Answer=QAOCR.generate_answer(retrieve_documents,query)

final output is : Based on the provided text, the total amount spent on food orders is:

1. Order 1: Grand Total = 542.03
2. Order 2: Grand Total = 197.08

Total amount spent on food orders = 542.03 + 197.08 = 739.11


### Below is the Query to delete data from vector database - use this if you required

In [20]:
client = chromadb.Client()

# Set up or create a collection in Chroma
collection_name = "image_text_extraction"
def load_collection(collection_name):
   collection = client.create_collection(collection_name) if collection_name not in [col.name for col in client.list_collections()] else client.get_collection(collection_name)
   return collection

collection=load_collection(collection_name)

In [21]:
results = collection.delete(
    where={
        "domain": {"$eq": "finance"}  # Using $eq operator for equality check
    }
)

results


In [22]:
results = collection.get(
    where={
        "domain": {"$eq": "finance"}
    },
    include=["documents", "metadatas", "embeddings"]
)
results

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}