In [1]:
import pandas as pd
import torch
from PIL import Image
import easyocr
from tqdm.auto import tqdm
import gc
import os

def setup_ocr():
    """Build the EasyOCR reader used for text extraction.

    Sets the ``CUDA_VISIBLE_DEVICES`` environment variable to ``'0'`` and
    then constructs an English-only ``easyocr.Reader`` with ``gpu=True``.

    Returns:
        The newly constructed ``easyocr.Reader`` instance.
    """
    # Expose only device index 0 to CUDA before the reader is created.
    os.environ.update(CUDA_VISIBLE_DEVICES='0')
    return easyocr.Reader(['en'], gpu=True)

def process_batch(image_paths, reader, batch_size=64, pbar=None):
    """Run OCR over image paths and return one text string per path.

    Images are read one at a time with ``reader.readtext``; ``batch_size``
    only controls how often the CUDA cache is released and the garbage
    collector is run (once after every ``batch_size`` images).

    Args:
        image_paths: Sequence of image locations handed to ``reader.readtext``.
        reader: Object exposing ``readtext(path)`` that returns an iterable of
            detections whose element at index 1 is the recognized text.
        batch_size: Number of images handled between memory clean-ups. Must
            be at least 1.
        pbar: Optional progress bar; ``pbar.update(1)`` is called once per
            image, whether or not reading it succeeded.

    Returns:
        A list with the same length and order as ``image_paths``. Each entry
        is the space-joined text detected in that image, or ``''`` when
        processing that image raised an exception.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A negative step would make the range below empty and silently drop
    # every image, so reject it (and zero) up front with a clear message.
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )

    results = []

    for start in range(0, len(image_paths), batch_size):
        for img_path in image_paths[start:start + batch_size]:
            try:
                detections = reader.readtext(img_path)
                # Each detection carries its recognized text at index 1.
                full_text = ' '.join(detection[1] for detection in detections)
            except Exception as e:
                # Best effort: report the failure and keep the output aligned
                # with the input by recording an empty string for this image.
                print(f"Error processing {img_path}: {str(e)}")
                full_text = ''
            results.append(full_text)

            if pbar is not None:
                pbar.update(1)

        # Release cached GPU memory and collect garbage between chunks.
        torch.cuda.empty_cache()
        gc.collect()

    return results

def ocr_dataframes(df1, df2, image_path_col='image_path', batch_size=64):
    """Add an ``ocr_text`` column to two DataFrames of image paths.

    A single OCR reader is created with ``setup_ocr`` and shared by both
    frames, and one progress bar spans the combined row count. Both frames
    are modified in place.

    Args:
        df1: First DataFrame; must contain the ``image_path_col`` column.
        df2: Second DataFrame; must contain the ``image_path_col`` column.
        image_path_col: Name of the column holding the image paths.
        batch_size: Forwarded to ``process_batch``.

    Returns:
        The tuple ``(df1, df2)``, each now carrying an ``ocr_text`` column.
    """
    print("Initializing EasyOCR...")
    ocr_reader = setup_ocr()

    # Frames are handled in order, each announced by its own banner.
    stages = (
        ("\nProcessing first DataFrame...", df1),
        ("\nProcessing second DataFrame...", df2),
    )
    image_count = len(df1) + len(df2)

    # One bar covers both frames so progress reflects the whole job.
    with tqdm(total=image_count, desc="Total Progress", position=0) as master_pbar:
        for banner, frame in stages:
            print(banner)
            paths = frame[image_path_col].tolist()
            frame['ocr_text'] = process_batch(
                paths, ocr_reader, batch_size, master_pbar
            )

    return df1, df2


# Example usage
if __name__ == "__main__":
    # Image folders and metadata tables of the Kaggle dataset.
    train_images = "/kaggle/input/visual-taxonomy/train_images/"
    test_images = "/kaggle/input/visual-taxonomy/test_images/"
    train_df = pd.read_csv("/kaggle/input/visual-taxonomy/train.csv")
    test_df = pd.read_csv("/kaggle/input/visual-taxonomy/test.csv")

    # Derive each image path from its id: the id is zero-padded to six
    # characters and given a ".jpg" extension inside the matching folder.
    for frame, folder in ((train_df, train_images), (test_df, test_images)):
        frame["image_path"] = frame["id"].apply(
            lambda x, folder=folder: os.path.join(folder, f"{str(x).zfill(6)}.jpg")
        )

    # No category filter is applied: the full train and test frames are used.
    df1, df2 = train_df, test_df

    # Run OCR over both frames.
    df1_processed, df2_processed = ocr_dataframes(df1, df2)

    # Write each processed frame to its own CSV file, without the index.
    for processed, out_csv in (
        (df1_processed, 'df1_with_ocr.csv'),
        (df2_processed, 'df2_with_ocr.csv'),
    ):
        processed.to_csv(out_csv, index=False)

Initializing EasyOCR...


Total Progress:   0%|          | 0/100418 [00:00<?, ?it/s]


Processing first DataFrame...

Processing second DataFrame...
