In [None]:
!pip install pytesseract opencv-python-headless matplotlib tqdm

In [None]:
!apt-get update
!apt-get install -y tesseract-ocr tesseract-ocr-san
!pip install pytesseract opencv-python-headless matplotlib tqdm

# Import necessary libraries
import pytesseract
import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm
from pytesseract import Output

# Set up Tesseract executable path: tell pytesseract which binary to invoke.
# NOTE(review): presumably this is where the apt-get install above places
# the tesseract binary — confirm on the target environment.
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'

# Function to perform OCR on an image
def ocr_image(image_path):
    """Run Tesseract OCR (language ``san``) on a single image file.

    Args:
        image_path: Path of the image file to recognise.

    Returns:
        A ``(text, data)`` tuple. ``text`` is the string produced by
        ``pytesseract.image_to_string`` for the image as opened by PIL;
        ``data`` is the dictionary produced by
        ``pytesseract.image_to_data`` (``Output.DICT``) for the OpenCV
        grayscale conversion of the same file.

    Raises:
        OSError: If OpenCV cannot read the image at ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the path rather than with an opaque cvtColor error.
        raise OSError(f"Could not read image: {image_path}")
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Use the PIL image as a context manager so its file handle is released
    # as soon as the text has been extracted.
    with Image.open(image_path) as pil_image:
        text = pytesseract.image_to_string(pil_image, lang='san')

    data = pytesseract.image_to_data(gray, output_type=Output.DICT, lang='san')
    return text, data

# Function to load and preprocess images
def load_and_preprocess_images(folder, size=(128, 128)):
    """Load every ``.tif`` image in *folder* as a resized grayscale array.

    Files are visited in sorted filename order so the result is
    reproducible (``os.listdir`` makes no ordering guarantee). Files that
    OpenCV cannot read are skipped.

    Args:
        folder: Directory to scan (not recursive).
        size: Target ``(width, height)`` passed to ``cv2.resize``.
            Defaults to ``(128, 128)``.

    Returns:
        A NumPy array of shape ``(n_images, height, width)``. When no image
        could be loaded, an empty array of shape ``(0, height, width)`` is
        returned so downstream code still sees a 3-D array.
    """
    width, height = size
    images = []
    for filename in sorted(os.listdir(folder)):
        if not filename.lower().endswith('.tif'):
            continue
        img = cv2.imread(os.path.join(folder, filename), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # Unreadable or corrupt file: skip it, as before.
            continue
        images.append(cv2.resize(img, (width, height)))

    if not images:
        # Keep the (n, height, width) layout even when nothing was loaded.
        return np.empty((0, height, width), dtype=np.uint8)
    return np.array(images)

# Dataset location; load the resized grayscale images from it and scale
# the pixel values by 1/255 as float32.
folder_path = '/kaggle/input/sans-sample/sans_sample'
images = load_and_preprocess_images(folder_path).astype('float32') / 255.0

# Function to display an image with its OCR result
def display_image_with_ocr(image_path):
    """Show an image in grayscale with its OCR text as the figure title.

    Args:
        image_path: Path of the image file to display and recognise.
    """
    grayscale = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # Only the recognised text is needed here; the box data is discarded.
    text = ocr_image(image_path)[0]

    figure_size = (10, 6)
    plt.figure(figsize=figure_size)
    plt.imshow(grayscale, cmap='gray')
    plt.axis('off')
    plt.title(f"OCR Result: {text}")
    plt.show()

# Run OCR over every .tif file in the folder and collect one
# (filename, text, data) tuple per image in ocr_results.
image_files = list(
    filter(lambda name: name.lower().endswith('.tif'), os.listdir(folder_path))
)
ocr_results = []

for filename in tqdm(image_files, desc="Processing Images"):
    image_path = os.path.join(folder_path, filename)
    text, data = ocr_image(image_path)
    result = (filename, text, data)
    ocr_results.append(result)

# Function to visualize OCR results
def visualize_ocr(image_path, ocr_data):
    """Display an image with a green rectangle around every OCR box.

    Args:
        image_path: Path of the image file to draw on.
        ocr_data: Dictionary with parallel ``'left'``, ``'top'``,
            ``'width'`` and ``'height'`` sequences, one entry per box (the
            layout returned by ``pytesseract.image_to_data`` with
            ``Output.DICT``).

    Raises:
        OSError: If OpenCV cannot read the image at ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None on failure; report the offending path
        # instead of failing later inside cv2.rectangle.
        raise OSError(f"Could not read image: {image_path}")

    boxes = zip(
        ocr_data['left'],
        ocr_data['top'],
        ocr_data['width'],
        ocr_data['height'],
    )
    for x, y, w, h in boxes:
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

    # OpenCV loads images as BGR, matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Display the annotated image
    plt.figure(figsize=(12, 12))
    plt.imshow(image)
    plt.axis('off')
    plt.show()

# For each processed image: print the recognised text, then show the image
# with its OCR boxes drawn on top.
for filename, text, data in ocr_results:
    print(f"OCR Result for {filename}:\n{text}\n")
    image_path = os.path.join(folder_path, filename)
    visualize_ocr(image_path, data)