# OCR Test with Tesseract

## Requirements
1. **Install Tesseract OCR**: 
   - Install Tesseract: `winget install --id UB-Mannheim.TesseractOCR` 
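   - Add the Tesseract install directory to your system `PATH` environment variable so that `pytesseract` can find the `tesseract` executable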
2. **Python packages**: `pip install opencv-python pytesseract pillow`

In [1]:
import cv2
import pytesseract
from PIL import Image
import os

# Run OCR on a single test image: load it, binarise it, extract the text with
# Tesseract, print the result and save it to "Tests output/output_from_image.txt".
# Every failure is reported with a printed message instead of a traceback.
image_path = "./Tests files/test_image_only.png"

if not os.path.exists(image_path):
    print(f"Error: File '{image_path}' not found!")
    # The directory listing is only a debugging aid. Guard it so that a
    # missing input folder does not replace the friendly message with a
    # FileNotFoundError raised by os.listdir().
    input_dir = os.path.dirname(image_path)
    if os.path.isdir(input_dir):
        print("Available files in Tests files directory:")
        for entry in os.listdir(input_dir):
            print(f"  - {entry}")
    else:
        print(f"Directory '{input_dir}' does not exist.")
else:
    img = cv2.imread(image_path)

    # cv2.imread signals an unreadable file by returning None, not by raising.
    if img is None:
        print(f"Error: Could not load image from '{image_path}'")
        print("Make sure the file is a valid image format.")
    else:
        print(f"Successfully loaded image: {image_path}")
        print(f"Image shape: {img.shape}")

        # Preprocessing: grayscale -> 5x5 Gaussian blur -> adaptive threshold,
        # producing a binary image that is handed to Tesseract.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray = cv2.GaussianBlur(gray, (5, 5), 0)
        gray = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
        )

        # Keep the try body limited to the OCR call so that a failure while
        # saving the result is not misreported as an OCR error.
        try:
            text = pytesseract.image_to_string(gray)
        except pytesseract.TesseractNotFoundError:
            print("ERROR: Tesseract is not installed or not in PATH!")
            print("\nTo install Tesseract:")
            print("1. Download from: https://github.com/UB-Mannheim/tesseract/wiki")
            print("2. Or use chocolatey: choco install tesseract")
            print("3. Or use conda: conda install -c conda-forge tesseract")
            print("4. Make sure to add Tesseract to your system PATH")
            print("5. Or run the cell above to auto-configure the path")
        except Exception as e:
            print(f"OCR Error: {e}")
        else:
            print("Extracted text:")
            print(text)

            output_dir = "Tests output"
            output_file = os.path.join(output_dir, "output_from_image.txt")
            try:
                if not os.path.isdir(output_dir):
                    # exist_ok avoids a crash if the folder appears between
                    # the check above and this call.
                    os.makedirs(output_dir, exist_ok=True)
                    print(f"Created directory: {output_dir}")

                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(text)
            except OSError as e:
                print(f"Error: Could not save text to '{output_file}': {e}")
            else:
                print(f"Text saved to '{output_file}'")

Successfully loaded image: ./Tests files/test_image_only.png
Image shape: (206, 1067, 3)
Extracted text:
Experiment-07

Roll No: A3-754

Aim: To study & implement Part-of-Speech (POS) tagging using the Viterbi Algorithm in Hidden Markov
Models (HMM)

Text saved to 'Tests output\output_from_image.txt'


In [1]:
import pdfplumber

# Extract the text layer of every page of a PDF with pdfplumber and write it,
# one page after another (each followed by a newline), to a UTF-8 text file.
import os  # local import so this cell also runs on its own in a fresh kernel

pdf_path = "./Tests files/test_text_only.pdf"
output_txt = "./Tests output/output_from_text.txt"

with pdfplumber.open(pdf_path) as pdf:
    # extract_text() may return None (older pdfplumber releases) or an empty
    # string for a page without a text layer, e.g. a scanned page. Treat both
    # as empty text instead of failing on `None + "\n"`.
    page_texts = [page.extract_text() or "" for page in pdf.pages]

# Each page's text is terminated by a newline, exactly as before.
full_text = "".join(f"{page_text}\n" for page_text in page_texts)

# Make sure the output folder exists; open(..., "w") does not create it.
os.makedirs(os.path.dirname(output_txt), exist_ok=True)

with open(output_txt, "w", encoding="utf-8") as f:
    f.write(full_text)

print(f"Text extracted and saved to {output_txt}")

Text extracted and saved to ./Tests output/output_from_text.txt
