In [1]:
import os
import json
import pandas as pd
import numpy as np
import cv2
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T
import torch
import matplotlib.pyplot as plt
from PIL import Image
import pytesseract
from sklearn.metrics import precision_recall_fscore_support
from Levenshtein import distance as levenshtein_distance
from ultralytics import YOLO
#import easyocr
import logging
import pytesseract
import editdistance
from tabulate import tabulate
import random

In [2]:
import os
import pandas as pd

# Paths
image_folder = 'trained_images'  # Folder where you uploaded images
original_csv_file = 'sub_annot.csv'  # Original annot.csv file
filtered_csv_file = 'annot.csv'  # Path for the new filtered CSV

# Load the original annotation CSV file
annot_df = pd.read_csv(original_csv_file)

# Get the list of image files currently present in the folder
image_files_in_folder = set(os.listdir(image_folder))

# Filter annotations based on available images in the folder
available_image_ids = set([os.path.splitext(img)[0] for img in image_files_in_folder if img.endswith(('.jpg', '.png'))])

# Filter the annotations DataFrame to only include available image IDs
filtered_annot_df = annot_df[annot_df['image_id'].isin(available_image_ids)]

# Save the filtered annotations to a new CSV file
filtered_annot_df.to_csv(filtered_csv_file, index=False)

# Print some stats and a success message
print(f"Total images in the folder: {len(available_image_ids)}")
print(f"Total images with annotations: {len(filtered_annot_df['image_id'].unique())}")
print(f"Filtered annotations saved to: {filtered_csv_file}")


FileNotFoundError: [Errno 2] No such file or directory: 'sub_annot.csv'

In [2]:
# Constants
DATA_DIR = 'trained_images'
ANNOT_PATH = 'annot.csv'
IMAGE_OUTPUT_DIR = 'dataset/images'
LABEL_OUTPUT_DIR = 'dataset/labels'
SUBSET_RATIO = 0.8

# Prepare directories
def prepare_directories(image_output_dir, label_output_dir):
    os.makedirs(image_output_dir, exist_ok=True)
    os.makedirs(label_output_dir, exist_ok=True)
    print(f"Directories created at {image_output_dir} and {label_output_dir}")

# Create directories for images and labels
prepare_directories(IMAGE_OUTPUT_DIR, LABEL_OUTPUT_DIR)


Directories created at dataset/images and dataset/labels


In [3]:
# Function to split and reduce dataset size
def reduce_dataset_size(image_ids, subset_ratio=0.02):
    train_ids, test_ids = train_test_split(image_ids, test_size=0.03, random_state=42)
    train_ids, val_ids = train_test_split(train_ids, test_size=0.2, random_state=42)
    small_train_ids, _ = train_test_split(train_ids, test_size=1-subset_ratio, random_state=42)
    small_val_ids, _ = train_test_split(val_ids, test_size=1-subset_ratio, random_state=42)
    small_test_ids, _ = train_test_split(test_ids, test_size=1-subset_ratio, random_state=42)
    return small_train_ids, small_val_ids, small_test_ids

# Load and split data
annot_df = pd.read_csv(ANNOT_PATH)
image_ids = annot_df['image_id'].unique()
small_train_ids, small_val_ids, small_test_ids = reduce_dataset_size(image_ids, SUBSET_RATIO)

In [4]:
# Function to convert bounding boxes to YOLO format
def convert_to_yolo_format(size, box):
    dw = 1. / size[0]
    dh = 1. / size[1]
    x = (box[0] + box[2]) / 2.0
    y = (box[1] + box[3]) / 2.0
    w = box[2] - box[0]
    h = box[3] - box[1]
    x = max(0, min(1, x * dw))
    y = max(0, min(1, y * dh))
    w = max(0, min(1, w * dw))
    h = max(0, min(1, h * dh))
    return (x, y, w, h)
    
# Copy images and create label files in YOLO format
def process_images_and_labels(image_ids, annot_df):
    for image_id in image_ids:
        img_path = os.path.join(DATA_DIR, f'{image_id}.jpg')
        img = cv2.imread(img_path)
        h, w, _ = img.shape
        
        new_img_path = os.path.join(IMAGE_OUTPUT_DIR, f'{image_id}.jpg')
        cv2.imwrite(new_img_path, img)
        
        records = annot_df[annot_df['image_id'] == image_id]
        boxes = records['bbox'].apply(json.loads).tolist()
        boxes = [[b[0], b[1], b[0] + b[2], b[1] + b[3]] for b in boxes]
        
        label_file_path = os.path.join(LABEL_OUTPUT_DIR, f'{image_id}.txt')
        with open(label_file_path, 'w') as f:
            for box in boxes:
                yolo_box = convert_to_yolo_format((w, h), box)
                f.write(f'0 {yolo_box[0]} {yolo_box[1]} {yolo_box[2]} {yolo_box[3]}\n')

In [10]:
# Process images and labels, only run when the dataset folder is not already in place.
#process_images_and_labels(image_ids, annot_df)


Premature end of JPEG file
Premature end of JPEG file
Premature end of JPEG file


In [6]:
# Create the text files listing paths to images
def create_text_file(image_ids, file_path):
    with open(file_path, 'w') as f:
        for image_id in image_ids:
            img_path = os.path.join(os.getcwd(), IMAGE_OUTPUT_DIR, f'{image_id}.jpg')
            f.write(f'{img_path}\n')

In [7]:
create_text_file(small_train_ids, 'dataset/train.txt')
create_text_file(small_val_ids, 'dataset/val.txt')
create_text_file(small_test_ids, 'dataset/test.txt')

print("Text files created successfully.")

Text files created successfully.


In [8]:
class TextOCRDataset(Dataset):
    def __init__(self, image_ids, annot_df, data_dir, transforms=None, target_size=(640, 640)):
        self.image_ids = image_ids
        self.annot_df = annot_df
        self.data_dir = data_dir
        self.transforms = transforms
        self.target_size = target_size

    def __getitem__(self, idx):
        image_id = self.image_ids[idx]
        img_path = os.path.join(self.data_dir, f'{image_id}.jpg')
        img = Image.open(img_path).convert("RGB")
        original_width, original_height = img.size

        # Apply transformations
        if self.transforms:
            img = self.transforms(img)

        # Rescale bounding boxes to match the resized image dimensions
        records = self.annot_df[self.annot_df['image_id'] == image_id]
        boxes = records['bbox'].apply(json.loads).tolist()
        boxes = [[b[0] * self.target_size[0] / original_width, 
                  b[1] * self.target_size[1] / original_height,
                  (b[0] + b[2]) * self.target_size[0] / original_width,
                  (b[1] + b[3]) * self.target_size[1] / original_height] for b in boxes]

        target = {
            'boxes': torch.as_tensor(boxes, dtype=torch.float32),
            'labels': torch.ones((len(boxes),), dtype=torch.int64)
        }
        return img, target

    def __len__(self):
        return len(self.image_ids)

# Data Transforms
transform = T.Compose([
    T.Resize((640, 640)),
    T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
#    T.RandomHorizontalFlip(),
    T.ToTensor()
])


In [9]:
# Data Loaders
small_train_dataset = TextOCRDataset(small_train_ids, annot_df, DATA_DIR, transforms=transform)
val_dataset = TextOCRDataset(small_val_ids, annot_df, DATA_DIR, transforms=transform)
test_dataset = TextOCRDataset(small_test_ids, annot_df, DATA_DIR, transforms=transform)

def collate_fn(batch):
    return tuple(zip(*batch))

small_train_loader = DataLoader(small_train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)


In [10]:
# Print dataset sizes to verify reduction
print(f"Training dataset size: {len(small_train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Training dataset size: 1892
Validation dataset size: 473
Test dataset size: 73


In [10]:
def visualize_data(dataset, idx=0, color=(0, 255, 0), thickness=2, figsize=(10, 10)):
    img, target = dataset[idx]
    boxes = target['boxes'].cpu().numpy()
    img = (img.permute(1, 2, 0).numpy() * 255).astype(np.uint8)
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    img = np.ascontiguousarray(img)

    # Draw bounding boxes
    for box in boxes:
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness)

    # Display the image with bounding boxes using matplotlib
    plt.figure(figsize=figsize)
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

# Visualize some training data
visualize_data(small_train_dataset, idx=0)
visualize_data(small_train_dataset, idx=1)

OverflowError: Python integer -14 out of bounds for uint8

In [20]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")
else:
    print("GPU is not available, using CPU.")

GPU is not available, using CPU.


In [None]:
# Train YOLOv3

model = YOLO('yolov3u.pt')
model.train(data='config.yaml', epochs=10, imgsz=320, batch=5, device='cpu')

New https://pypi.org/project/ultralytics/8.3.23 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.3.20 🚀 Python-3.11.10 torch-2.3.1+cu118 CPU (Intel Xeon 2.00GHz)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov3u.pt, data=config.yaml, epochs=10, time=None, patience=100, batch=5, imgsz=320, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, 

[34m[1mtrain: [0mScanning /home/student/dataset/labels... 1892 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1892/1892 [0m

[34m[1mtrain: [0mNew cache created: /home/student/dataset/labels.cache



[34m[1mval: [0mScanning /home/student/dataset/labels... 473 images, 0 backgrounds, 0 corrupt: 100%|██████████| 473/473 [00:0[0m

[34m[1mval: [0mNew cache created: /home/student/dataset/labels.cache





Plotting labels to runs/detect/train/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 84 weight(decay=0.0), 91 weight(decay=0.0005078125), 90 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 320 train, 320 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train[0m
Starting training for 10 epochs...
Closing dataloader mosaic

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/10         0G      2.637      1.898      1.392         44        320: 100%|██████████| 379/379 [21:51<00:
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 48/48 [

                   all        473      20174      0.255      0.136      0.101     0.0366






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/10         0G       2.65      1.766      1.381         96        320: 100%|██████████| 379/379 [21:15<00:
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 48/48 [

                   all        473      20174      0.107      0.151     0.0345     0.0123






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       3/10         0G      2.578      1.696      1.345         54        320: 100%|██████████| 379/379 [21:04<00:
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 48/48 [

                   all        473      20174      0.295      0.168      0.123      0.046






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       4/10         0G      2.469      1.612      1.307         70        320: 100%|██████████| 379/379 [20:57<00:
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 48/48 [

                   all        473      20174      0.339      0.178      0.155     0.0611






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       5/10         0G       2.36      1.492      1.265         28        320: 100%|██████████| 379/379 [20:53<00:
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 48/48 [

                   all        473      20174       0.41       0.21      0.195     0.0848






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       6/10         0G      2.269      1.412      1.225        188        320: 100%|██████████| 379/379 [21:21<00:
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 48/48 [01:30<00:00,  1.89s/it]

                   all        473      20174      0.422      0.237       0.22     0.0973






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       7/10         0G      2.154      1.342      1.192        107        320: 100%|██████████| 379/379 [21:34<00:00,  3.42s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 48/48 [01:29<00:00,  1.86s/it]

                   all        473      20174      0.433      0.244      0.224      0.102






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       8/10         0G      2.117      1.293      1.173         27        320: 100%|██████████| 379/379 [21:28<00:
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 48/48 [

                   all        473      20174      0.446      0.255      0.233      0.106






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       9/10         0G      2.029      1.237      1.147         48        320: 100%|██████████| 379/379 [21:27<00:
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 48/48 [

                   all        473      20174      0.481      0.262      0.254      0.116






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      10/10         0G      1.966      1.184      1.138         85        320:  66%|██████▌   | 249/379 [14:17<07:

In [12]:
# Set the Tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'

# Load the trained model with the best weights
model = YOLO('C:\Project\\vscode\SSY340-text-extraction-project/best.pt')

# Define the function to draw bounding boxes and labels on the image
def draw_boxes(image, boxes, confidences, class_ids, class_names):
    extracted_texts = []
    for box, confidence, class_id in zip(boxes, confidences, class_ids):
        x1, y1, x2, y2 = [int(coord) for coord in box]
        
        # Extract the region of interest (ROI) for OCR
        roi = image[y1:y2, x1:x2]
        
        # Use Tesseract to extract text from the ROI
        extracted_text = pytesseract.image_to_string(roi, config='--psm 6').strip()  # Adjust psm mode as needed
        
        label = f"{extracted_text} {confidence:.2f}"
        extracted_texts.append((extracted_text, confidence))
        
        # Draw the bounding box
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Draw the label
        font_scale = 0.5
        font = cv2.FONT_HERSHEY_SIMPLEX
        (text_width, text_height) = cv2.getTextSize(label, font, fontScale=font_scale, thickness=1)[0]
        text_offset_x = x1
        text_offset_y = y1 - 5
        box_coords = ((text_offset_x, text_offset_y), (text_offset_x + text_width + 2, text_offset_y - text_height - 2))
        cv2.rectangle(image, box_coords[0], box_coords[1], (0, 255, 0), cv2.FILLED)
        cv2.putText(image, label, (text_offset_x, text_offset_y), font, fontScale=font_scale, color=(0, 0, 0), thickness=1)

    return image, extracted_texts

# Define the function to calculate CER and WER
def calculate_error_rates(reference, hypothesis):
    cer = editdistance.eval(reference, hypothesis) / len(reference) if len(reference) > 0 else 0
    wer = editdistance.eval(reference.split(), hypothesis.split()) / len(reference.split()) if len(reference.split()) > 0 else 0
    return cer, wer

# Define the function to perform inference, extract text, and calculate error rates
def infer_and_show(model, image_path, reference_text):
    # Load the image
    image = cv2.imread(image_path)
    assert image is not None, f"Error: Unable to load image at {image_path}"

    # Perform inference
    results = model(image)
    
    # Extract the results
    boxes = results[0].boxes.xyxy.cpu().numpy()  # Bounding boxes in xyxy format
    confidences = results[0].boxes.conf.cpu().numpy()  # Confidence scores
    class_ids = results[0].boxes.cls.cpu().numpy()  # Class IDs
    class_names = results[0].names  # Class names

    # Draw bounding boxes and labels on the image, and extract texts
    annotated_image, extracted_texts = draw_boxes(image, boxes, confidences, class_ids, class_names)

    # Show the image
    cv2.imshow('Inference', annotated_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    # Print the extracted texts in a table format
    print("\nExtracted Texts:")
    headers = ["Extracted Text", "Confidence"]
    print(tabulate(extracted_texts, headers=headers, tablefmt="pretty"))

    # Calculate CER and WER
    extracted_text_combined = " ".join(text[0] for text in extracted_texts)
    cer, wer = calculate_error_rates(reference_text, extracted_text_combined)
    print(f"\nCharacter Error Rate (CER): {cer:.2f}")
    print(f"Word Error Rate (WER): {wer:.2f}")

# Path to the image you want to perform inference on
#image_path = 'TextOCR/train_val_images/train_images/0af5d7dc935d0563.jpg'
image_path = 'TextOCR/train_val_images/train_images/0af5d7dc935d0563.jpg'
# Reference text for calculating CER and WER (this should be the ground truth text for the image)
reference_text = "your ground truth text here"

# Perform inference, extract text, and calculate error rates
infer_and_show(model, image_path, reference_text)




0: 256x320 14 texts, 292.9ms
Speed: 0.0ms preprocess, 292.9ms inference, 0.0ms postprocess per image at shape (1, 3, 256, 320)

Extracted Texts:
+----------------+---------------------+
| Extracted Text |     Confidence      |
+----------------+---------------------+
|      NEWS      | 0.7008996605873108  |
|    WERLD'S     | 0.6780672669410706  |
|                | 0.6735836267471313  |
|      woop      | 0.6520718336105347  |
|     SATUF      | 0.6499629616737366  |
|     GREEN.     | 0.5873026251792908  |
|                | 0.5679686069488525  |
|    Evening     | 0.5316756367683411  |
|       UU       | 0.4927869737148285  |
|     AORST      | 0.47362199425697327 |
|    HOCKER’     | 0.44166478514671326 |
| om A Se eee =  | 0.4334736466407776  |
|                | 0.3711823523044586  |
|     Con Og     | 0.26255112886428833 |
|  ~—caariisoy   |                     |
+----------------+---------------------+

Character Error Rate (CER): 3.07
Word Error Rate (WER): 3.40


In [4]:
NUM_SAMPLES = 2
# Set the Tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'

# Load the trained model with the best weights
model = YOLO('best.pt')

# Define the function to draw bounding boxes and labels on the image
def draw_boxes(image, boxes, confidences, class_ids, class_names):
    extracted_texts = []
    for box, confidence, class_id in zip(boxes, confidences, class_ids):
        x1, y1, x2, y2 = [int(coord) for coord in box]
        
        # Extract the region of interest (ROI) for OCR
        roi = image[y1:y2, x1:x2]
        
        # Use Tesseract to extract text from the ROI
        extracted_text = pytesseract.image_to_string(roi, config='--psm 6').strip()  # Adjust psm mode as needed
        
        label = f"{extracted_text} {confidence:.2f}"
        extracted_texts.append(extracted_text)
        
        # Draw the bounding box
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Draw the label
        font_scale = 0.5
        font = cv2.FONT_HERSHEY_SIMPLEX
        (text_width, text_height) = cv2.getTextSize(label, font, fontScale=font_scale, thickness=1)[0]
        text_offset_x = x1
        text_offset_y = y1 - 5
        box_coords = ((text_offset_x, text_offset_y), (text_offset_x + text_width + 2, text_offset_y - text_height - 2))
        cv2.rectangle(image, box_coords[0], box_coords[1], (0, 255, 0), cv2.FILLED)
        cv2.putText(image, label, (text_offset_x, text_offset_y), font, fontScale=font_scale, color=(0, 0, 0), thickness=1)

    return image, extracted_texts

# Define the function to calculate CER and WER
def calculate_error_rates(reference, hypothesis):
    cer = editdistance.eval(reference, hypothesis) / len(reference) if len(reference) > 0 else 0
    wer = editdistance.eval(reference.split(), hypothesis.split()) / len(reference.split()) if len(reference.split()) > 0 else 0
    return cer, wer

# Define the function to perform inference, extract text, and calculate error rates for a single image
def infer_and_show(model, image_path):
    # Load the image
    image = cv2.imread(image_path)
    assert image is not None, f"Error: Unable to load image at {image_path}"

    # Perform inference
    results = model(image)
    
    # Extract the results
    boxes = results[0].boxes.xyxy.cpu().numpy()  # Bounding boxes in xyxy format
    confidences = results[0].boxes.conf.cpu().numpy()  # Confidence scores
    class_ids = results[0].boxes.cls.cpu().numpy()  # Class IDs
    class_names = results[0].names  # Class names

    # Draw bounding boxes and labels on the image, and extract texts
    annotated_image, extracted_texts = draw_boxes(image, boxes, confidences, class_ids, class_names)

    # Combine extracted texts
    extracted_text_combined = " ".join(extracted_texts)

    return annotated_image, extracted_text_combined

# Define the function to process the test dataset
def process_test_dataset(model, test_ids, annot_df, num_samples=3):
    # Initialize lists to store CER and WER for each image
    cer_list = []
    wer_list = []
    images_to_display = []

    # Randomly select sample images for display
    sample_ids = random.sample(list(test_ids), num_samples)

    # Process each image in the test dataset
    for image_id in test_ids:
        img_path = os.path.join(DATA_DIR, f'{image_id}.jpg')
        reference_text = " ".join(annot_df[annot_df['image_id'] == image_id]['utf8_string'].tolist())

        # Perform inference and extract text
        annotated_image, extracted_text = infer_and_show(model, img_path)
        
        # Calculate CER and WER
        cer, wer = calculate_error_rates(reference_text, extracted_text)
        cer_list.append(cer)
        wer_list.append(wer)
        
        # Store the image to display later if it's in the sample_ids
        if image_id in sample_ids:
            images_to_display.append((annotated_image, image_id))

        # Print the extracted text and error rates for each image
        print(f"\nImage: {image_id}")
        print(f"Reference Text: {reference_text}")
        print(f"Extracted Text: {extracted_text}")
        print(f"Character Error Rate (CER): {cer:.2f}")
        print(f"Word Error Rate (WER): {wer:.2f}")
		    # Display the sample images
    fig, axes = plt.subplots(1, num_samples, figsize=(15, 5))
    for ax, (image, image_id) in zip(axes, images_to_display):
        ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        ax.set_title(f"Image ID: {image_id}")
        ax.axis('off')
    plt.show()

    # Calculate and print overall CER and WER
    overall_cer = sum(cer_list) / len(cer_list) if cer_list else 0
    overall_wer = sum(wer_list) / len(wer_list) if wer_list else 0

    # Print the final results in a table format
    headers = ["Metric", "Value"]
    results = [
        ["Overall Character Error Rate (CER)", f"{overall_cer:.2f}"],
        ["Overall Word Error Rate (WER)", f"{overall_wer:.2f}"]
    ]

    print("\nFinal Results:")
    print(tabulate(results, headers=headers, tablefmt="pretty"))

# Process the test dataset and calculate error rates
process_test_dataset(model, small_test_ids, annot_df, num_samples=NUM_SAMPLES)

NameError: name 'small_test_ids' is not defined

In [16]:
import logging
# Set the Tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

model1 = YOLO('runs/detect/train2/weights/best.pt')

def recognize_text(model, dataset, conf_threshold=0.25):
    detected_texts = []  # Initialize list to store detected texts
    
    for img_index, (img, _) in enumerate(dataset):
        # Convert the tensor to a numpy array if necessary
        img_np = img.permute(1, 2, 0).numpy()  # Assuming img is a PyTorch tensor
        logging.info(f"Processing image {img_index} with shape: {img_np.shape}")
        
        # Perform detection using the model
        results = model(img_np)  # Call the model directly
        
        # Log the results
        logging.info(f"Image {img_index} results: {results}")

        # Check if there are detections
        if not results or len(results[0].boxes.xyxy) == 0:
            logging.warning(f"No detections for image {img_index}")
            detected_texts.append([])  # Append empty list for this image
            continue  # Skip to the next image

        # Parse the results
        bboxes = results[0].boxes.xyxy.numpy()  # Get bounding boxes (x1, y1, x2, y2)
        logging.info(f"Detected bounding boxes for image {img_index}: {bboxes}")

        texts = []
        for bbox in bboxes:  # Iterate through each detected bounding box
            text = extract_text_from_bbox(img_np, bbox)
            if text:  # Only append if text is not empty
                texts.append(text)
        
        detected_texts.append(texts)
    
    return detected_texts