In [1]:
import os
from PIL import Image, ImageEnhance, ImageFilter
import json
import pytesseract  # For OCR
from transformers import BlipProcessor, BlipForConditionalGeneration
from spellchecker import SpellChecker
import re

# Tell pytesseract where the Tesseract executable lives. This is a hard-coded
# Windows install path, so it must be adjusted on machines where Tesseract is
# installed elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Loaded (processor, model) pairs keyed by checkpoint name, so the weights are
# loaded once per process instead of once per image.
_BLIP_CACHE = {}


def _load_blip(checkpoint="Salesforce/blip-image-captioning-large"):
    """Return the (processor, model) pair for *checkpoint*, loading it on first use."""
    if checkpoint not in _BLIP_CACHE:
        _BLIP_CACHE[checkpoint] = (
            BlipProcessor.from_pretrained(checkpoint),
            BlipForConditionalGeneration.from_pretrained(checkpoint),
        )
    return _BLIP_CACHE[checkpoint]


def describe_image(image_path, num_captions=3):
    """Generate multiple captions for an image with the BLIP captioning model.

    Args:
        image_path: Path of the image file to caption.
        num_captions: Number of captions to return.

    Returns:
        A list of ``num_captions`` decoded caption strings.
    """
    # Reuse the already-loaded processor/model; loading them per call made
    # every image pay the full checkpoint load again.
    processor, model = _load_blip()

    # Open the image inside a context manager so the file handle is released
    # as soon as the RGB copy has been made.
    with Image.open(image_path) as source_image:
        raw_image = source_image.convert('RGB')

    # Turn the image into model-ready PyTorch tensors.
    inputs = processor(raw_image, return_tensors="pt")

    # Beam search with twice as many beams as requested captions, returning
    # the `num_captions` best sequences.
    outputs = model.generate(
        **inputs,
        max_length=100,
        num_return_sequences=num_captions,
        num_beams=num_captions * 2,
    )

    # Decode each generated token sequence into plain text.
    return [processor.decode(output, skip_special_tokens=True) for output in outputs]

def preprocess_image(image_path):
    """
    Prepare an image for OCR and return the processed PIL image.

    The image is loaded as RGB, reduced to grayscale, contrast-boosted by a
    factor of 2.0, smoothed, and finally thresholded at 128 into a 1-bit
    black-and-white image.
    """
    def to_black_or_white(level):
        # Pixels darker than 128 become black, everything else white.
        return 0 if level < 128 else 255

    # Load as RGB first, then collapse to a single grayscale channel.
    grayscale = Image.open(image_path).convert('RGB').convert('L')

    # Double the contrast.
    boosted = ImageEnhance.Contrast(grayscale).enhance(2.0)

    # Light smoothing to reduce noise before thresholding.
    smoothed = boosted.filter(ImageFilter.SMOOTH)

    # Binarize into mode '1' (pure black and white).
    return smoothed.point(to_black_or_white, '1')

def postprocess_text(text):
    """Spell-correct the ASCII words of *text* and rejoin them with single spaces.

    The text is split on whitespace, so line breaks and repeated spaces in the
    input are collapsed to single spaces in the result.

    Tokens containing any non-ASCII character (e.g. Devanagari words from the
    Hindi/Nepali OCR output) are passed through unchanged: the spell checker
    is created with its default dictionary, which cannot validate them and
    could otherwise swap them for unrelated dictionary words.

    Args:
        text: Raw text, typically straight from OCR.

    Returns:
        The corrected text as a single space-separated string.
    """
    spell = SpellChecker()
    corrected_words = []
    for word in text.split():
        if not word.isascii():
            # Not a candidate for the default-dictionary spell check.
            corrected_words.append(word)
            continue
        # `correction` may return None when it has no suggestion; keep the
        # original token in that case.
        corrected_words.append(spell.correction(word) or word)
    return ' '.join(corrected_words)

# def clean_extracted_text(text):
#     """
#     Clean the extracted text using regex and basic post-processing.
#     """
#     # Remove unwanted characters
#     text = re.sub(r'[^\w\s\.\-/]', ' ', text)  # Keep alphanumeric, spaces, dots, hyphens, and slashes
#     text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
#     return text.strip()

def extract_text_from_image(image_path):
    """
    Run Tesseract OCR on an image and return the spell-checked text.

    The image is preprocessed first, then recognised with the English, Hindi
    and Nepali language packs (`eng+hin+nep`). Any failure is printed and
    reported to the caller as None instead of being raised.
    """
    # Tesseract options: OCR engine mode 3, page segmentation mode 6,
    # languages eng+hin+nep.
    tesseract_options = r'--oem 3 --psm 6 -l eng+hin+nep'
    try:
        raw_text = pytesseract.image_to_string(
            preprocess_image(image_path),
            config=tesseract_options,
        )
        # Spell-check the recognised text before handing it back.
        return postprocess_text(raw_text)
    except Exception as e:
        print(f"Error extracting text from {image_path}: {e}")
        return None

def process_images_in_folder(folder_path, output_file):
    """Caption and OCR every image in a folder and save the results as JSON.

    Files in ``folder_path`` whose extension is .png, .jpg, .jpeg, .bmp or
    .gif (in any letter case) are processed in sorted name order. A failure
    on one image is printed and that image is skipped; a failure while
    writing the output file is printed as well. Nothing is raised for either.

    Args:
        folder_path: Directory to scan for images (not recursive).
        output_file: Path of the JSON file to write. It receives
            ``{"images": [...]}`` with one entry per processed image holding
            ``filename``, ``path``, ``description`` and ``extracted_text``.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    outputs = {
        'images': []
    }
    # Compare extensions case-insensitively so '.PNG' or '.Jpeg' are not
    # silently skipped, and sort so the output order is reproducible
    # (os.listdir makes no ordering guarantee).
    photos = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(image_extensions)
    )
    for photo in photos:
        image_path = os.path.join(folder_path, photo)
        try:
            # Generate image description using BLIP
            captions = describe_image(image_path)
            description = '\n'.join(captions)
            print("this is description", description)

            # Extract text from the image using OCR
            extracted_text = extract_text_from_image(image_path)

            # Store the results; OCR returning None or an empty string is
            # recorded as "No text found".
            image_data = {
                'filename': photo,  # photo: 1.jpg
                'path': image_path,
                'description': description,
                'extracted_text': extracted_text if extracted_text else "No text found"
            }

            outputs['images'].append(image_data)
            print(f"Description and text extracted for {photo}")

        except Exception as e:
            # Best effort: report the failure and carry on with the next image.
            print(f"Error processing {photo}: {e}")

    print("This is output", outputs)

    try:
        # ensure_ascii=False keeps non-ASCII OCR text readable in the file.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(outputs, f, indent=2, ensure_ascii=False)

    except Exception as e:
        print(f"Error saving descriptions to {output_file}: {e}")
        
# Folder that is scanned for images, and the JSON file the results are
# written to (both relative to the current working directory).
folder_path = './images'
output_file = 'output.json'
# Process images and save descriptions
# NOTE(review): process_images_in_folder only prints save errors rather than
# raising, so the final "saved" message is shown even if writing failed.
print("=== Processing Images ===")
process_images_in_folder(folder_path, output_file)
print(f"Descriptions saved to {output_file}")

  from .autonotebook import tqdm as notebook_tqdm


=== Processing Images ===
this is description a close up of a piece of paper with a picture of a man on it
a close up of a document with a picture of a man on it
a close up of a paper with a picture of a man on it
Description and text extracted for image1.jpg
This is output {'images': [{'filename': 'image1.jpg', 'path': './images\\image1.jpg', 'description': 'a close up of a piece of paper with a picture of a man on it\na close up of a document with a picture of a man on it\na close up of a paper with a picture of a man on it', 'extracted_text': 'नेपाल सरकार हनन 8 अर्थ मन्त्रालय 4 आन्तरिक राजश्व विभाग स्थायी लेखा नम्बर pan दर्ता प्रमाण पत्र सास्ती en 2 to स्थासी लेखा amy ce i to ER रिता axe the eye a a so all Pray दिन feet साल कारोबारको नाज नेपाल काइल्डरनेश pee प्रा.लि : i करदालोको to प्राइभेट मिमिटेड did to ad ~ be बाई i i पकनाजोम i drama 58 या, son करदाताको दस्तबत i अधिक दक्लझत a छारद्राताले पालखा जर्तापर्ले फर्सटजहूरु:. बार see it fee माय my Som em it i ne i fen Ref me 22525 552 fun