In [3]:
import pandas as pd
df = pd.read_csv('table_1.csv')

# Preview only the last rows of the loaded table (DataFrame.tail() shows 5 rows by default)
df.tail()

Unnamed: 0.1,Unnamed: 0,US$ FY,FY,FY.1,FY.2,FY.3,FY.4,FY.5,FY.6,FY.7,...,Unnamed: 2,FY.10,FY.11,FY.12,FY.13,FY.14,FY.15,FY.16,FY.17,FY.18
48,,,,,,,,,,,...,,14.5,10.1,16.2,24.9,28.7,25.4,17.2,12.7,11.5
49,RONW (%)**,-15.0,13.2,13.1,12.7,15.1,16.9,16.4,16.5,13.5,...,,,,,,,,,,
50,,,,,,,,,,,...,,,,,,,,,,
51,ROCE (%)**,-13.3,12.7,10.6,12.0,13.5,13.6,13.5,13.0,12.8,...,,,,,,,,,,
52,,,,,,,,,,,...,,,,,,,,,,


In [None]:
import numpy as np
import torch

# Example sentences; sentence k is paired by position with row k of `vectors`.
sentences = [
    "It's such a beautiful day outside.",
    "The weather is amazing today.",
    "She programmed a chatbot using Python.",
    "The detective found a hidden clue.",
]

# Hand-placed 3-D coordinates standing in for embeddings
# (small enough to plot in Desmos 3D).
vectors = torch.tensor(
    [
        [10.0, 15.0, 20.0],   # Beautiful day outside
        [12.0, 16.0, 18.0],   # Amazing weather
        [5.0, -8.0, -3.0],    # Python chatbot
        [-15.0, -6.0, 8.0],   # Hidden clue
    ],
    dtype=torch.float32,
)

def consine_similarity(input_embedding, sentence_embeddings, eps=1e-8):
  """Cosine similarity between a query embedding and each row of a batch.

  Args:
    input_embedding: float tensor of shape (1, D) -- the query vector.
    sentence_embeddings: float tensor of shape (N, D) -- one row per sentence.
    eps: lower bound applied to every L2 norm before dividing, so an
      all-zero vector yields a similarity of 0 instead of NaN (0/0).

  Returns:
    Float tensor of shape (1, N); entry [0, j] is the cosine similarity
    between the query and sentence_embeddings[j].
  """
  # Row-wise L2 norms, floored at eps to avoid division by zero.
  query_norm = torch.clamp(torch.norm(input_embedding, dim=1, keepdim=True), min=eps)
  sentence_norms = torch.clamp(torch.norm(sentence_embeddings, dim=1, keepdim=True), min=eps)

  # Dot product of the query with every sentence row -> shape (1, N).
  prod = torch.matmul(input_embedding, sentence_embeddings.T)

  # Divide by each norm in turn; sentence_norms is transposed to (1, N) so it
  # broadcasts along the columns of prod.
  return prod / query_norm / sentence_norms.T
# Query point living in the same 3-D space as the hand-placed sentence vectors.
query = torch.tensor([[11, 14, 13]], dtype=torch.float32)
print(query.shape)

# Score the query against every sentence vector in one call.
similarity_matrix = consine_similarity(query, vectors)

# Echo each sentence next to the vector that represents it.
print("Vectors:")
for i in range(len(vectors)):
    vec = vectors[i]
    print(f"Sentence {i+1}: {sentences[i]} - Vector: {vec}")

print("\nCosine Similarity Matrix:")
print(similarity_matrix)

torch.Size([1, 3])
Vectors:
Sentence 1: It's such a beautiful day outside. - Vector: tensor([10., 15., 20.])
Sentence 2: The weather is amazing today. - Vector: tensor([12., 16., 18.])
Sentence 3: She programmed a chatbot using Python. - Vector: tensor([ 5., -8., -3.])
Sentence 4: The detective found a hidden clue. - Vector: tensor([-15.,  -6.,   8.])

Cosine Similarity Matrix:
tensor([[ 0.9771,  0.9946, -0.4399, -0.3648]])


In [None]:
import os
import tempfile
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

def ocr_pdf(pdf_path, output_folder=None, language='eng', dpi=300):
    """
    Extract text from a PDF using Tesseract OCR.

    Each page is rendered to a PNG in a temporary directory, passed through
    preprocess_image(), and then OCR'd with pytesseract.

    Args:
        pdf_path (str): Path to the PDF file
        output_folder (str, optional): Folder to save extracted text files
            (one ``page_<n>.txt`` per page). Created if missing. If None,
            doesn't save files.
        language (str, optional): Tesseract language code. Default is 'eng' for English.
                                 Use '+' to combine languages, e.g., 'eng+fra' for English and French.
        dpi (int, optional): DPI for PDF to image conversion. Higher values give better quality but slower processing.

    Returns:
        list: List of dictionaries ``{'page': int, 'text': str}``, one per page,
        with 1-based page numbers in document order
    """
    results = []

    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Rendered page images live in a temporary directory that is removed
    # when the block exits, so all OCR work happens inside it.
    with tempfile.TemporaryDirectory() as temp_dir:
        print(f"Converting PDF to images (DPI: {dpi})...")

        images = convert_from_path(
            pdf_path,
            dpi=dpi,
            output_folder=temp_dir,
            fmt='png',
            # os.cpu_count() may return None when the count is undeterminable;
            # fall back to a single worker in that case.
            thread_count=os.cpu_count() or 1
        )

        total_pages = len(images)
        print(f"Converted {total_pages} pages. Starting OCR process...")

        for page_num, page_image in enumerate(images, start=1):
            print(f"Processing page {page_num}/{total_pages}...")

            # Improve image quality for OCR
            prepared = preprocess_image(page_image)

            # Apply OCR to the image
            text = pytesseract.image_to_string(prepared, lang=language)

            # Save the extracted text to a file if output folder is specified
            if output_folder:
                output_file = os.path.join(output_folder, f"page_{page_num}.txt")
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.write(text)

            results.append({
                'page': page_num,
                'text': text
            })

    return results

def preprocess_image(image):
    """
    Prepare a page image for OCR.

    Currently the only step is a conversion to mode 'L' (grayscale).

    Args:
        image (PIL.Image): Original image

    Returns:
        PIL.Image: The result of ``image.convert('L')``
    """
    grayscale = image.convert('L')

    # Extension point: document-specific clean-up can be chained here, e.g.
    #   grayscale = grayscale.filter(ImageFilter.SHARPEN)   # sharpen
    #   grayscale = ImageOps.autocontrast(grayscale)        # boost contrast
    return grayscale

def _confident_word_elements(ocr_data):
    """Convert pytesseract's column-oriented DICT output into per-row element dicts, keeping only rows with non-empty text and positive confidence."""
    elements = []
    for j, raw_text in enumerate(ocr_data['text']):
        raw_conf = ocr_data['conf'][j]

        # Confidence may arrive as an int, a float, or a string such as
        # '96.5' depending on the pytesseract/Tesseract versions in use;
        # int('96.5') would raise, so go through float first.
        try:
            confidence = int(float(raw_conf))
        except (TypeError, ValueError):
            continue

        # Skip low/unknown-confidence rows and rows without visible text.
        if confidence <= 0 or not str(raw_text).strip():
            continue

        elements.append({
            'text': raw_text,
            'confidence': raw_conf,
            'bbox': {
                'x': ocr_data['left'][j],
                'y': ocr_data['top'][j],
                'width': ocr_data['width'][j],
                'height': ocr_data['height'][j]
            },
            'block_num': ocr_data['block_num'][j],
            'line_num': ocr_data['line_num'][j],
            'word_num': ocr_data['word_num'][j]
        })
    return elements


def ocr_pdf_with_bbox(pdf_path, language='eng', dpi=300):
    """
    Extract text with bounding boxes from a PDF using Tesseract OCR.

    Args:
        pdf_path (str): Path to the PDF file
        language (str, optional): Tesseract language code. Default is 'eng'.
        dpi (int, optional): DPI for PDF to image conversion. Higher values give better quality.

    Returns:
        list: One dictionary per page with keys ``'page'`` (1-based),
        ``'width'`` and ``'height'`` (of the preprocessed page image), and
        ``'elements'`` -- a list of dicts holding ``'text'``, ``'confidence'``,
        ``'bbox'`` (``x``, ``y``, ``width``, ``height``), ``'block_num'``,
        ``'line_num'`` and ``'word_num'`` for every row with non-empty text
        and positive confidence
    """
    results = []

    # Rendered page images live in a temporary directory that is removed
    # when the block exits, so all OCR work happens inside it.
    with tempfile.TemporaryDirectory() as temp_dir:
        images = convert_from_path(
            pdf_path,
            dpi=dpi,
            output_folder=temp_dir,
            fmt='png',
            # os.cpu_count() may return None when the count is undeterminable;
            # fall back to a single worker in that case.
            thread_count=os.cpu_count() or 1
        )

        for page_num, page_image in enumerate(images, start=1):
            # Improve image quality for OCR
            prepared = preprocess_image(page_image)

            # Get OCR data with bounding boxes
            ocr_data = pytesseract.image_to_data(
                prepared,
                lang=language,
                output_type=pytesseract.Output.DICT
            )

            results.append({
                'page': page_num,
                'width': prepared.width,
                'height': prepared.height,
                'elements': _confident_word_elements(ocr_data)
            })

    return results

def main(pdf_path="RIL-70-21.pdf", output_folder="ocr_output"):
    """
    Demonstrate both OCR entry points on a single PDF.

    Runs ocr_pdf() (writing per-page text files to ``output_folder``) and
    prints a per-page character count, then runs ocr_pdf_with_bbox() and
    prints how many pages were processed.

    Args:
        pdf_path (str, optional): PDF to process. Defaults to the sample file
            name used by this notebook.
        output_folder (str, optional): Destination folder for the text files
            written by ocr_pdf().

    Returns:
        None. Results are only printed, so calling this as the last line of a
        notebook cell does not echo the OCR output.
    """
    # Basic OCR
    print("Running OCR on PDF...")
    ocr_results = ocr_pdf(pdf_path, output_folder)

    # Print summary
    print("\nOCR Results Summary:")
    for page in ocr_results:
        print(f"Page {page['page']}: {len(page['text'])} characters extracted.")

    # Optionally, demonstrate the bounding box functionality
    print("\nExtract text with layout information...")
    layout_results = ocr_pdf_with_bbox(pdf_path)

    print(f"\nProcessed {len(layout_results)} pages with layout information.")
    print(f"Text files saved to {output_folder}")


# Run the OCR demo end to end: plain-text OCR first, then the bounding-box pass.
main()

In [3]:
from doctr.io import DocumentFile
from doctr.models import ocr_predictor


In [4]:

import time
# Time construction of the pretrained docTR predictor.
# NOTE(review): the recorded output shows model weights being downloaded, so a
# first run's timing presumably includes that download -- confirm.
start_time = time.time()
print("Loading OCR model...")
model = ocr_predictor(pretrained=True)
end_time = time.time()
print(f"Model loaded in {end_time - start_time:.2f} seconds")
# Load the PDF; the timer below covers both the load and the OCR pass.
start_doc_time = time.time()
doc = DocumentFile.from_pdf("scanned.pdf")
# Run the OCR predictor over the loaded document.
result = model(doc)
end_doc_time = time.time()
print(f"Document processing time: {end_doc_time - start_doc_time:.2f} seconds")
# Print the predictor's result object.
print(result)

Loading OCR model...
Downloading https://doctr-static.mindee.com/models?id=v0.8.1/fast_base-688a8b34.pt&src=0 to /Users/swastikagrawal/.cache/doctr/models/fast_base-688a8b34.pt


  0%|          | 0/65814772 [00:00<?, ?it/s]

Downloading https://doctr-static.mindee.com/models?id=v0.3.1/crnn_vgg16_bn-9762b0b0.pt&src=0 to /Users/swastikagrawal/.cache/doctr/models/crnn_vgg16_bn-9762b0b0.pt


  0%|          | 0/63286381 [00:00<?, ?it/s]

Model loaded in 8.28 seconds
Document(
  (pages): [
    Page(
      dimensions=(1660, 1190)
      (blocks): [Block(
        (lines): [
          Line(
            (words): [
              Word(value='SWASTI', confidence=0.35),
              Word(value='AG', confidence=0.9),
              Word(value='RAWAL', confidence=0.68),
            ]
          ),
          Line(
            (words): [
              Word(value='CMSC351', confidence=0.99),
              Word(value='Spring', confidence=0.99),
              Word(value='2025', confidence=1.0),
              Word(value='$0101,90201,90301)', confidence=0.57),
              Word(value='Homework', confidence=0.65),
              Word(value='7', confidence=1.0),
            ]
          ),
          Line(
            (words): [
              Word(value='Due', confidence=1.0),
              Word(value='WEDNESDAY', confidence=0.94),
              Word(value='Apr', confidence=1.0),
              Word(value='2,', confidence=0.99),
              