# Explore here

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
import os
import opendatasets as od
import zipfile
import tensorflow as tf
from pathlib import Path
import shutil
import random
import hashlib
import cv2
from matplotlib import pyplot as plt
import re
import ultralytics
from ultralytics import YOLO

First of all, we download the files.

In [None]:
# Kaggle URL of the CAPTCHA object-detection dataset
dataset = "https://www.kaggle.com/datasets/youthamj/captchaobjectdetection"
# Options forwarded to opendatasets: target folder and the force flag
download_options = {"data_dir": "../data/raw/", "force": True}
od.download(dataset, **download_options)

Now we count the number of files downloaded.

In [None]:
# Folder where the downloaded dataset is expected to live
data_dir = "../data/raw/captchaobjectdetection"

# Count only the regular files placed directly inside the folder
raw_folder = Path(data_dir)
n_files = sum(entry.is_file() for entry in raw_folder.iterdir())
print(n_files)

Then, we calculate the number of images (.png extension) and text files (.txt extension)

In [None]:
# Print the number of images (.png) first, then the number of text files (.txt)
for pattern in ("*.png", "*.txt"):
    print(len(list(Path(data_dir).glob(pattern))))

We can observe that the number of images and text files differs by one: there are 200,001 files in total, 100,000 images and 100,001 text files. Therefore, we have one extra text file besides those associated with each image.

The extra file is all_sequences.txt, which contains the relationship between the name of each image/text file and the characters the image contains.

We will use this file to associate the characters in each row with the class numbers used in the YOLO files.

The content of the text files can be duplicated since there could be two different images with the same characters to decipher, so we only check there are no duplicates in our images.

In [None]:
def hashfile(file_path, chunk_size=1 << 20):
    """
    Computes the SHA-256 hex digest of a file's contents.

    The file is read in fixed-size chunks so memory use stays bounded
    regardless of how large the file is.

    Parameters:
    file_path (str or Path): Path of the file to hash.
    chunk_size (int): Number of bytes read per iteration (default 1 MiB).

    Returns:
    str: Hexadecimal SHA-256 digest of the file's bytes.
    """
    hasher = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops on the empty bytes object returned at EOF
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()


# Detect byte-identical images by comparing their SHA-256 digests.
# Digests already seen live in a set, so each membership test is O(1), and
# the DataFrames are built once at the end instead of being grown row by row
# (the row-by-row version is quadratic in the number of images).
seen_hashes = set()
unique_rows = []
duplicate_rows = []

for filename in Path(data_dir).glob("*.png"):
    file_hash = hashfile(filename)
    if file_hash in seen_hashes:
        # Same content as an earlier image: record it as a duplicate
        duplicate_rows.append([filename, file_hash])
    else:
        seen_hashes.add(file_hash)
        unique_rows.append([filename, file_hash])

# First occurrence of every digest / every later repetition of a digest
hashes = pd.DataFrame(unique_rows, columns=["filename", "hash"])
hashes_dup = pd.DataFrame(duplicate_rows, columns=["filename", "hash"])

if hashes_dup.empty:
    print("No duplicates in the dataset.")
else:
    print(hashes_dup)



Let's then check an image and its bounding boxes, to see that everything is as expected in the YOLO files.

Now, we check as an example a random image and its bounding box, to check the values are defined as expected.

In [None]:
def yolo_box_to_pixels(x_center, y_center, width, height, image_width, image_height):
    """
    Converts one normalized YOLO box into pixel corner points.

    Parameters:
    x_center (float): Normalized x coordinate of the box center.
    y_center (float): Normalized y coordinate of the box center.
    width (float): Normalized box width.
    height (float): Normalized box height.
    image_width (int): Image width in pixels.
    image_height (int): Image height in pixels.

    Returns:
    tuple: ((x0, y0), (x1, y1)), the top-left and bottom-right corners in
    pixels, truncated to integers.
    """
    x0 = x_center - width / 2
    x1 = x_center + width / 2
    y0 = y_center - height / 2
    y1 = y_center + height / 2
    return (
        (int(x0 * image_width), int(y0 * image_height)),
        (int(x1 * image_width), int(y1 * image_height)),
    )


def plot_img(img_dir, txt_dir):
    """
    Shows an image with the bounding boxes listed in its YOLO label file.

    Parameters:
    img_dir (str): Path to the image file.
    txt_dir (str): Path to the label file; each non-blank line is read as
    "class x_center y_center width height" with normalized coordinates.

    Returns:
    None: The annotated image is displayed with matplotlib.

    Raises:
    FileNotFoundError: If OpenCV cannot read the image.
    """
    image = cv2.imread(img_dir)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {img_dir}")
    image_height, image_width = image.shape[:2]

    with open(txt_dir, 'r', encoding='utf-8') as f:
        # Blank lines carry no box, so they are skipped
        text_list = [list(map(float, line.split())) for line in f if line.strip()]

    for row in text_list:
        # Every box uses its OWN width/height (fields 3 and 4 of its row)
        start_point, end_point = yolo_box_to_pixels(*row[1:5], image_width, image_height)
        # cv2.rectangle draws in place on `image`
        cv2.rectangle(image, start_point, end_point, color=(255, 0, 0), thickness=2)

    # NOTE: no BGR->RGB conversion is applied here, so matplotlib shows the
    # image with the red and blue channels swapped relative to the file.
    plt.imshow(image)

In [None]:
# Image/label pair used as the visual example
sample_id = "10084"
img_dir = f"{data_dir}/{sample_id}.png"
txt_dir = f"{data_dir}/{sample_id}.txt"

plot_img(img_dir, txt_dir)


And, most importantly, we extract the relationship between the characters of every image and the classes of the YOLO file associated with it (the basis for building the classes of our dataset.yaml file).

In [None]:
files_all = "../data/raw/captchaobjectdetection/all_sequences.txt"

# Each line of all_sequences.txt is split on "," and read below as
# "<file stem>,<characters>".
chars_per_img = []
with open(files_all, "r", encoding="utf-8") as fa:
    chars_per_img = [line.strip().split(",") for line in fa]

# Index the sequences by file stem once, so the per-file lookup is a dict
# access instead of a scan over the whole list for each of the label files.
chars_by_stem = {}
for pair in chars_per_img:
    if len(pair) >= 2:
        # setdefault keeps the first occurrence of a stem
        chars_by_stem.setdefault(pair[0], pair[1])

files = "../data/raw/captchaobjectdetection/"
rows = []

for file in Path(files).glob("*.txt"):
    if file.is_file() and file.name != "all_sequences.txt":
        with open(file, "r", encoding="utf-8") as f:
            # First token of every non-blank line is the YOLO class id
            values = [line.split()[0] for line in f if line.strip()]

        chars = chars_by_stem.get(file.stem, "")
        if len(values) != len(chars):
            raise ValueError(
                f"{file.name}: {len(values)} label rows but {len(chars)} characters"
            )

        # The i-th label row is paired with the i-th character of the sequence
        rows.extend(zip(chars, values))

# "char" holds the character and "class" the YOLO class id
char_files = pd.DataFrame(rows, columns=["char", "class"])

# One row per distinct (char, class) pair with its count, ordered by character
print(char_files.value_counts().reset_index().sort_values("char", ascending=True))

We noticed that some characters are missing: iIlLoO01
This makes sense, since these characters are very easy to confuse with one another.

Finally, it's time to create the folders that separate the text files from the images, and then split them into 3 subsets (train, validation and test) in a 60-20-20 ratio.

In [None]:
# Target folders for the images and the label files; exist_ok=True lets this
# cell be re-run without raising FileExistsError once the folders exist.
os.makedirs("../data/raw/images", exist_ok=True)
os.makedirs("../data/raw/labels", exist_ok=True)

In [None]:
# Use the data_dir VARIABLE (the downloaded dataset folder), not the literal
# string "data_dir". The glob results are materialized with list() because
# the folder is modified while it is being iterated.
for im in list(Path(data_dir).glob("*.png")):
    shutil.move(im, os.path.join("../data/raw/images", im.name))

for im in list(Path(data_dir).glob("*.txt")):
    # all_sequences.txt is not a label file: it stays in the dataset folder,
    # so there is no need to move it out and back again.
    if im.name == "all_sequences.txt":
        continue
    shutil.move(im, os.path.join("../data/raw/labels", im.name))

In [None]:
# One train/val/test folder under both images/ and labels/.
# exist_ok=True keeps the cell re-runnable when the folders already exist.
for kind in ("images", "labels"):
    for subset in ("train", "val", "test"):
        os.makedirs(f"../data/raw/{kind}/{subset}", exist_ok=True)

In [None]:
origin = "../data/raw/images"
destiny_train = "../data/raw/images/train"
destiny_val = "../data/raw/images/val"
destiny_test = "../data/raw/images/test"

# os.listdir gives no ordering guarantee, so the names are sorted first:
# together with the fixed seed below, the split is identical on every run.
files = sorted(f for f in os.listdir(origin) if os.path.isfile(os.path.join(origin, f)))

# 20% of the files go to validation and another 20% to test
size_to_move = int(len(files) * (20 / 100))

# Private seeded generator: reproducible shuffle that leaves the global
# random state untouched.
random.Random(42).shuffle(files)

files_to_move_val = files[:size_to_move]
files_to_move_test = files[size_to_move:2 * size_to_move]
# Everything that is left forms the training subset
files_to_move_train = files[2 * size_to_move:]

for subset_files, destination in (
    (files_to_move_val, destiny_val),
    (files_to_move_test, destiny_test),
    (files_to_move_train, destiny_train),
):
    for file in subset_files:
        shutil.move(os.path.join(origin, file), os.path.join(destination, file))

In [12]:
# Image folder of each subset
destiny_train_im = "../data/raw/images/train"
destiny_val_im = "../data/raw/images/val"
destiny_test_im = "../data/raw/images/test"

# Folder holding the label files, and the label folder of each subset
origin_txt = "../data/raw/labels"
destiny_train_txt = "../data/raw/labels/train"
destiny_val_txt = "../data/raw/labels/val"
destiny_test_txt = "../data/raw/labels/test"


def _file_stems(folder):
    """Return the extension-less names of the regular files found in folder."""
    stems = set()
    for entry in os.listdir(folder):
        if os.path.isfile(os.path.join(folder, entry)):
            stems.add(os.path.splitext(entry)[0])
    return stems


im_train = _file_stems(destiny_train_im)
im_val = _file_stems(destiny_val_im)
im_test = _file_stems(destiny_test_im)

# Subsets are checked in this order; the first one containing the stem wins
subset_targets = (
    (im_train, destiny_train_txt),
    (im_val, destiny_val_txt),
    (im_test, destiny_test_txt),
)

for file in os.listdir(origin_txt):
    path = os.path.join(origin_txt, file)

    # Sub-folders (train/val/test) are not label files
    if not os.path.isfile(path):
        continue

    root, _ = os.path.splitext(file)

    # If the name matches an image of a subset, move the label to that subset
    for stems, target in subset_targets:
        if root in stems:
            shutil.move(path, os.path.join(target, file))
            break

Let's check the folders have the number of files expected.

In [None]:
# Number of entries in each image folder (train, val, test) ...
for folder in (destiny_train_im, destiny_val_im, destiny_test_im):
    print(len(os.listdir(folder)))

# ... followed by the number of entries in each label folder
for folder in (destiny_train_txt, destiny_val_txt, destiny_test_txt):
    print(len(os.listdir(folder)))

Let's begin with the YOLO model.

In [None]:
# Ultralytics publishes the YOLO11 weights without the "v" in the file name
# (yolo11n/s/m/l/x.pt); "yolov11x.pt" is not one of the released assets.
model = YOLO("yolo11x.pt")

First of all, we train the model with our dataset.

In [None]:
# Training settings passed to the model as keyword arguments
train_args = {
    "data": "./dataset.yaml",
    "epochs": 50,
    "imgsz": 640,
    "batch": -1,
    "device": 0,
}
results = model.train(**train_args)

When we try to train the model, we can see the following warnings:

val: WARNING  ..\captcha-processor\data\raw\images\val\9998.png: ignoring corrupt image/label: non-normalized or out of bounds coordinates [     1.0333]
train: WARNING  ..\captcha-processor\data\raw\images\train\99960.png: ignoring corrupt image/label: non-normalized or out of bounds coordinates [     1.0333]

It seems that thousands of the YOLO files have at least one value greater than 1, which is not valid because the values must be normalized.

Let's copy some of these images and text files into a working folder (../data/interim), so we can see what's happening and what we can do.

In [None]:
# Image/label pair inspected in the interim folder
sample = "../data/interim/10193"
img_dir = sample + ".png"
txt_dir = sample + ".txt"

plot_img(img_dir, txt_dir)

Let's compare the results with the normalized YOLO file (replacing any value greater than one with one).

In [None]:
# Same image, now paired with the "_normalized" label file
sample = "../data/interim/10193"
img_dir = f"{sample}.png"
txt_dir = f"{sample}_normalized.txt"

plot_img(img_dir, txt_dir)

We can notice that the rectangles stay the same, but the values in the YOLO file are now valid, since none of them falls outside the 0 to 1 range.

So, the key now is to replace every value greater than one with one, in every YOLO text file.

The first step is to create a backup of the original text files.

In [None]:
# Copy the whole labels tree into the dataset folder before it is modified;
# dirs_exist_ok=True allows copying into a destination that already exists.
labels_source = "../data/raw/labels/"
backup_target = "../data/raw/captchaobjectdetection"
shutil.copytree(labels_source, backup_target, dirs_exist_ok=True)

And then, modify the files to normalize values.

In [None]:
def normalize_label_line(line):
    """
    Clamps the coordinate fields of one YOLO label line to a maximum of 1.

    Parameters:
    line (str): One line of a label file: a class id followed by numeric fields.

    Returns:
    str: The line with single spaces between fields and every field after
    the first replaced by "1" when its value is greater than 1. A blank line
    yields an empty string. No trailing newline is included.

    Raises:
    ValueError: If a field after the first is not a number.
    """
    values = line.split()
    if not values:
        # Blank line: nothing to clamp (indexing values[0] would fail)
        return ""
    clamped = ["1" if float(x) > 1 else x for x in values[1:]]
    return " ".join([values[0]] + clamped)


# Named labels_dir rather than `dir` to avoid shadowing the builtin
labels_dir = "../data/raw/labels"

for file in Path(labels_dir).rglob("*.txt"):
    with open(file, "r+", encoding="utf-8") as f:
        lines = f.readlines()
        # Rewrite the file in place from the beginning
        f.seek(0)
        f.writelines(normalize_label_line(line) + "\n" for line in lines)
        # Drop whatever is left of the old content past the new end
        f.truncate()

Let's try again the YOLO training, in order to see if our changes normalizing the txt file values have solved the problem.

In [None]:
# Same training settings as before, now run against the clamped label files
train_args = {
    "data": "./dataset.yaml",
    "epochs": 50,
    "imgsz": 640,
    "batch": -1,
    "device": 0,
}
results = model.train(**train_args)

Then, we save the model in onnx format

In [None]:
# Export the model in ONNX format
export_format = "onnx"
model.export(format=export_format)

and validate it.

In [None]:
# Validate the model against the dataset described in dataset.yaml
val_args = {"data": "./dataset.yaml"}
results_val = model.val(**val_args)

Finally, we create some functions that process the image using the YOLO model and show the boxes and the classes it detects.

In [None]:
def capcha_prediction(final_results, names):
    """
    Builds the CAPTCHA text from detection results, reading each result from left to right.

    Parameters:
    final_results (list): Detection results; each item exposes `boxes.xyxy` and `boxes.cls`.
    names (list or dict): Lookup, indexable by integer class id, giving the character of each class.

    Returns:
    str: The characters of every result concatenated, each result ordered by the first
    coordinate of its boxes.
    """
    characters = []
    for result in final_results:
        xyxy = result.boxes.xyxy
        cls_ids = result.boxes.cls

        # Pair each box with its class and order the pairs by the box's first coordinate
        left_to_right = sorted(zip(xyxy, cls_ids), key=lambda pair: pair[0][0])

        characters.extend(f"{names[int(cls_id)]}" for _, cls_id in left_to_right)

    return "".join(characters)


def captcha_boxes_prediction(final_results, image_path):
    """
    Draws bounding boxes around detected CAPTCHA characters and shows the image.

    Parameters:
    final_results (list): Detection results; each item exposes `boxes.xyxy`.
    image_path (str): Path to the image file.

    Returns:
    None: The annotated image is displayed with matplotlib, not returned.

    Raises:
    FileNotFoundError: If OpenCV cannot read the image.
    """
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    for result in final_results:
        for box in result.boxes.xyxy:
            x0, y0, x1, y1 = map(int, box)  # Convert to integers
            # cv2.rectangle draws in place on `image`
            cv2.rectangle(image, (x0, y0), (x1, y1), (255, 255, 0), 2)

    # Convert `image` itself, so the plot also works when nothing was detected
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

Let's check it out!

In [3]:
# Load the weights stored at this path (presumably the best checkpoint of
# training run "train2", judging by the path; verify the run folder)
model_final = YOLO("./runs/detect/train2/weights/best.pt")

In [4]:
# Image from the test subset used for the prediction example
test_image_path = "../data/raw/images/test/65.png"

In [None]:
# Run the loaded model on the example image and keep its detection results
results = model_final(test_image_path)

In [None]:
# Print the text decoded from the detections, using the model's class names
print(capcha_prediction(results, model_final.names))

In [None]:
# Show the example image with a rectangle drawn around every detection
captcha_boxes_prediction(results,test_image_path)

It works!!!