In [4]:
# Install the missing dotenv module. %pip (not !pip) installs into the running
# kernel's environment; the version is pinned so a re-run gets the same package.
%pip install -q python-dotenv==1.0.1

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [5]:
# Groq API client. %pip targets the running kernel's environment; the version
# is pinned so a re-run gets the same package.
%pip install -q groq==0.13.0

Collecting groq
  Downloading groq-0.13.0-py3-none-any.whl.metadata (13 kB)
Downloading groq-0.13.0-py3-none-any.whl (108 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.8/108.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.13.0


In [2]:
# PDF -> PIL image conversion. %pip targets the running kernel's environment;
# the version is pinned so a re-run gets the same package.
%pip install -q pdf2image==1.17.0

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0


In [8]:
# pdf2image shells out to the poppler command-line tools, which come from the
# poppler-utils system package. -y answers the confirmation prompt (the kernel
# has no interactive stdin) and -qq keeps the cell output short.
!apt-get update -qq
!apt-get install -y -qq poppler-utils

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,188 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,523 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,224 kB]
Get:13 http://archive.ub

In [9]:
# Explicit %pip magic instead of relying on automagic for a bare `pip ...` line.
# Versions are pinned where a known-good version is recorded in this notebook's outputs.
%pip install -q PyMuPDF==1.24.14 python-dotenv==1.0.1 groq==0.13.0 Pillow

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.14


In [10]:
import base64
import pdf2image
import os
from dotenv import load_dotenv
from groq import Groq

# Load key=value pairs from a local .env file (if one exists) into os.environ.
load_dotenv()

class PDFOCRProcessor:
    """Transcribe a PDF page by page using a vision model served by Groq.

    Each page is rendered to a JPEG with pdf2image, sent to the chat
    completions endpoint as a base64 data URL, and the returned text is
    written to one file per page.
    """

    def __init__(self, model_name="llama-3.2-90b-vision-preview", api_key=None):
        """
        Create the processor and its Groq client.

        Args:
            model_name (str): Identifier of the vision model to query.
            api_key (str | None): Groq API key. When omitted, the key is read
                from the GROQ_API_KEY environment variable.

        Raises:
            ValueError: If no API key is passed and GROQ_API_KEY is unset.
        """
        self.model_name = model_name

        # Secrets must never be committed in notebook source: take the key from
        # the caller or from the environment instead of a string literal.
        resolved_key = api_key or os.environ.get("GROQ_API_KEY")
        if not resolved_key:
            raise ValueError(
                "Groq API key not found: pass api_key=... or set the "
                "GROQ_API_KEY environment variable (e.g. in a .env file)."
            )
        self.client = Groq(api_key=resolved_key)

    def encode_image(self, image):
        """
        Read an image file from disk and encode its bytes as base64

        Args:
            image (str | os.PathLike): Path to the image file to encode

        Returns:
            str: Base64 encoded file contents
        """
        with open(image, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def perform_ocr(self, image_base64, temperature=1, max_tokens=1024, top_p=1):
        """
        Perform OCR on image using Llama 3.2 Vision

        Args:
            image_base64 (str): Base64 encoded JPEG image
            temperature (float): Sampling temperature
            max_tokens (int): Maximum tokens to generate
            top_p (float): Nucleus sampling parameter

        Returns:
            str: Extracted text from image ("" if the model returned no text)
        """
        # The data URL declares image/jpeg, so the payload is expected to be JPEG bytes.
        image_url = f"data:image/jpeg;base64,{image_base64}"

        prompt_text = (
            "You are an OCR assistant tasked with analyzing the provided image. "
            "Extract all visible text as accurately as possible. "
            "Provide only the transcription without any additional comments."
        )

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ]

        completion = self.client.chat.completions.create(
            model=self.model_name,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=top_p,
            stream=False,
            stop=None,
        )

        # The message content may be None; normalise to "" so callers can
        # always write or concatenate the result as text.
        return completion.choices[0].message.content or ""

    def process_pdf(self, pdf_path, output_dir='ocr_output'):
        """
        Process each page of PDF through OCR

        Args:
            pdf_path (str): Path to PDF file
            output_dir (str): Directory to save OCR results

        Returns:
            list: OCR results for each page
        """
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Convert PDF to images (one PIL image per page)
        images = pdf2image.convert_from_path(pdf_path)

        # Store OCR results
        ocr_results = []

        # Process each page (numbered from 1)
        for page_num, image in enumerate(images, 1):
            temp_image_path = os.path.join(output_dir, f'page_{page_num}.jpg')
            try:
                # Save temporary image
                image.save(temp_image_path, 'JPEG')

                # Encode image
                encoded_image = self.encode_image(temp_image_path)

                # Perform OCR
                page_text = self.perform_ocr(encoded_image)

                # Save page text
                text_output_path = os.path.join(output_dir, f'page_{page_num}_text.txt')
                with open(text_output_path, 'w', encoding='utf-8') as f:
                    f.write(page_text)
            finally:
                # Remove the temporary image even when OCR or the write fails,
                # so a failed run does not leave page JPEGs behind.
                if os.path.exists(temp_image_path):
                    os.remove(temp_image_path)

            # Store and print result
            print(f"Page {page_num} OCR Result:")
            print(page_text)
            print("-" * 50)

            ocr_results.append(page_text)

        return ocr_results

def main(pdf_path="/content/devops_pdf_data.pdf", combined_output_path='full_ocr_results.txt'):
    """
    OCR every page of a PDF and write all page texts into one combined file.

    Args:
        pdf_path (str): Path to the PDF to process. Defaults to the Colab
            upload location this notebook was written against.
        combined_output_path (str): File that receives the text of all pages,
            each preceded by a "--- Page N ---" header.
    """
    # Initialize OCR Processor
    ocr_processor = PDFOCRProcessor()

    # Process PDF (also writes one text file per page, see process_pdf)
    ocr_results = ocr_processor.process_pdf(pdf_path)

    # Combine all results into a single file, pages numbered from 1
    with open(combined_output_path, 'w', encoding='utf-8') as f:
        for page_num, result in enumerate(ocr_results, 1):
            f.write(f"--- Page {page_num} ---\n")
            f.write(result + "\n\n")

# Run the pipeline when this code is executed directly (as a script or as a
# notebook cell, where __name__ is "__main__"), but not when it is imported.
if __name__ == "__main__":
    main()

Page 1 OCR Result:
**PDC Assignment 2**

H-Prem 228-4161 855-50

(01) Blocked reports are sent to data nodes to NameNode periodically. Ensures that by the name node has an up-to-date view of block distribution across the cluster.

> Helps detect block loss or corruption

Headblocks are sent periodically by datanodes to the NameNode to indicate their health and availability.

> used to detect failed datanodes quickly

(02 1) Direct divides the file into blocks and requests NameNode to select Datanodes for block replication

(02 2) A pipeline of datanodes is formed with data flowing from direct to last datanode and then forwarded to next one.

(03) Each Datanode acknowledges when it receives data ensuring reliable transfer.

**Checkpoint Node:**

- Periodically renames fsimage into a new checkpoint

- This reduces the risk of losing remaining recent times and potential data loss.

**Backup Node:**

- Maintains a real-time synchronized namespace image with the NameNode.
------------------