# Download Dataset

In [1]:
# Download ICDAR 2003 dataset
# unzip -q Dataset/icdar2003.zip -d dataset

# **Prepare dataset to train for text detection**

In [2]:
import xml.etree.ElementTree as ET

word_xml_path = "SceneTrialTrain/words.xml"
tree = ET.parse(word_xml_path)
root = tree.getroot()

print(type(root))
print(root)

<class 'xml.etree.ElementTree.Element'>
<Element 'tagset' at 0x104180c20>


In [3]:
for img in root:
    print("Image name", img[0].text)
    print("Image size ", img[1].attrib["x"], img[1].attrib["y"])
    
    

Image name apanar_06.08.2002/IMG_1261.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1263.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1265.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1269.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1281.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1282.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1283.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1284.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1285.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1286.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1288.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1289.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1290.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1291.JPG
Image size  1600 1200
Image name apanar_06.08.2002/IMG_1292.JPG
Image size  1598 922
Image name apanar_06.08.2002/IMG_1293.JPG

In [4]:
img = root[0]

for im in img.findall('taggedRectangles') :
    for bb in im.findall('taggedRectangle'):
        print(bb.attrib['x'])
        print(bb[0].text)

174.0
self
512.0
adhesive
184.0
address
863.0
labels
72.0
36
247.0
89m
792.0
cls
200.0
250
473.0
on
684.0
a
806.0
roll


In [5]:
import os
def extract_data_from_xml(root_dir):
    xml_path = os.path.join(root_dir, "words.xml")
    tree = ET.parse(xml_path)
    root = tree.getroot()
    
    image_path = []
    image_size = []
    image_label = []
    image_bb = []

    for img in root:
        bbs_of_img = []
        labels_of_img = []

        image_path.append(img[0].text)
        image_size.append((int(img[1].attrib['x']), 
                           int(img[1].attrib['y'])))
        
        for bbs in img.findall('taggedRectangles'):
            for bb in bbs:
                
                if not bb[0].text.isalnum():
                    continue
                if "é" in bb[0].text.lower() or "ñ" in bb[0].text.lower():
                    continue

                bbs_of_img.append(
                    [
                        float(bb.attrib['x']),
                        float(bb.attrib['y']),
                        float(bb.attrib['width']),
                        float(bb.attrib['height'])
                    ]
                )
                labels_of_img.append(bb[0].text.lower())
        image_label.append(labels_of_img)
        image_bb.append(bbs_of_img)
            
    return image_path, image_size, image_label, image_bb

In [6]:
data_dir = "SceneTrialTrain"
image_path, image_size, image_label, image_boxes = extract_data_from_xml(data_dir)

print("Image path: ", image_path[0])
print("Image size: ", image_size[0])
print("Image label: ", image_label[0])
print("Image boxes: ", image_boxes[0])

Image path:  apanar_06.08.2002/IMG_1261.JPG
Image size:  (1600, 1200)
Image label:  ['self', 'adhesive', 'address', 'labels', '36', '89m', 'cls', '250', 'on', 'a', 'roll']
Image boxes:  [[174.0, 392.0, 274.0, 195.0], [512.0, 391.0, 679.0, 183.0], [184.0, 612.0, 622.0, 174.0], [863.0, 599.0, 446.0, 187.0], [72.0, 6.0, 95.0, 87.0], [247.0, 2.0, 197.0, 88.0], [792.0, 0.0, 115.0, 81.0], [200.0, 848.0, 228.0, 139.0], [473.0, 878.0, 165.0, 109.0], [684.0, 878.0, 71.0, 106.0], [806.0, 844.0, 218.0, 141.0]]


# **Convert to yolo format**

In [7]:
def convert_to_yolo_format(image_paths, image_sizes, image_bboxes):
    class_id = 0
    yolo_data = []

    for image_path, image_size, image_bbox in zip(image_paths, image_sizes, image_bboxes):
        yolo_label = []
        w, h = image_size

        for bb in image_bbox:
            x, y, width, height = bb
            center_x = (x + width / 2) / w
            center_y = (y + height /2) / h
            width = width / w
            height = height / h

            label = f"{class_id} {center_x} {center_y} {width} {height}"
            yolo_label.append(label)
        
        yolo_data.append((image_path, yolo_label))

    return yolo_data

yolo_data = convert_to_yolo_format(image_path, image_size, image_boxes)     
print(yolo_data)
            

[('apanar_06.08.2002/IMG_1261.JPG', ['0 0.194375 0.40791666666666665 0.17125 0.1625', '0 0.5321875 0.40208333333333335 0.424375 0.1525', '0 0.309375 0.5825 0.38875 0.145', '0 0.67875 0.5770833333333333 0.27875 0.15583333333333332', '0 0.0746875 0.04125 0.059375 0.0725', '0 0.2159375 0.03833333333333333 0.123125 0.07333333333333333', '0 0.5309375 0.03375 0.071875 0.0675', '0 0.19625 0.7645833333333333 0.1425 0.11583333333333333', '0 0.3471875 0.7770833333333333 0.103125 0.09083333333333334', '0 0.4496875 0.7758333333333334 0.044375 0.08833333333333333', '0 0.571875 0.7620833333333333 0.13625 0.1175']), ('apanar_06.08.2002/IMG_1263.JPG', ['0 0.5346875 0.6008333333333333 0.529375 0.14333333333333334']), ('apanar_06.08.2002/IMG_1265.JPG', ['0 0.566875 0.21875 0.31375 0.0675', '0 0.56 0.22375 0.3375 0.025833333333333333', '0 0.5709375 0.28458333333333335 0.255625 0.0575', '0 0.5771875 0.33708333333333335 0.165625 0.060833333333333336', '0 0.7378125 0.8020833333333334 0.035625 0.015833333333

# **Define save data function by yolo format**

In [8]:
import shutil
def save_data(data, src_image_dir, save_image_dir):
    os.makedirs(save_image_dir, exist_ok=True)

    os.makedirs(os.path.join(save_image_dir, "images"), exist_ok=True)
    os.makedirs(os.path.join(save_image_dir, "labels"), exist_ok=True)

    for image_path, yolo_data in data:
        shutil.copy(
            os.path.join(src_image_dir, image_path),
            os.path.join(save_image_dir, "images")
        )

        image_name = os.path.basename(image_path)
        image_name = os.path.splitext(image_name)[0]
        with open(os.path.join(save_image_dir, "labels" , f"{image_name}.txt"), 'w') as f:
            for label in yolo_data:
                f.write(f"{label}\n")




# **Split data to trainset, valset and testset**

In [9]:
from sklearn.model_selection import train_test_split

seed = 0
val_size = 0.2
test_size = 0.125
is_shuffle = True

train, test = train_test_split(
    yolo_data,
    test_size=val_size,
    random_state=seed,
    shuffle=is_shuffle
)

test, val = train_test_split(
    test,
    test_size=test_size,
    random_state=seed,
    shuffle=is_shuffle
)



save_image_path = "../datasets/yolo_data"

save_train_path = os.path.join(save_image_path, "train")
save_val_path = os.path.join(save_image_path, "val")
save_test_path = os.path.join(save_image_path, "test")


save_data(train, data_dir, save_train_path)
save_data(val, data_dir, save_val_path)
save_data(test, data_dir, save_test_path)


# **Create yaml file**

In [10]:
import yaml

data_yaml = {
    "path" : "./yolo_data",
    "train" : "train/images",
    "val" : "val/images",
    "test" : "test/images",
    "nc" : 1,
    "names" : ["text"],
}

yaml_yolo = os.path.join(save_image_path, "data.yml")
print(yaml_yolo)
with open(yaml_yolo, 'w') as f:
    yaml.dump(data_yaml, f, default_flow_style=False)

../datasets/yolo_data/data.yml


# **Prepare datasets to train for text recognition**

In [32]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

In [46]:
def split_bounding_boxes(image_paths, image_labels, image_bbs, root_dir, save_dir):
    os.makedirs(save_dir, exist_ok=True)
    count = 0
    labels = []
    for image_path, image_label, image_bb in zip(image_paths, image_labels, image_bbs):
        image = Image.open(os.path.join(root_dir, image_path))

        for label, bb in zip(image_label, image_bb):
            image_cropped = image.crop((bb[0], bb[1], bb[0] + bb[2], bb[1] + bb[3]))
            print(np.mean(image_cropped))
            print(image_cropped.size)

            if np.mean(image_cropped) < 35 or np.mean(image_cropped) > 220:
                continue

            if image_cropped.size[0] < 10 or image_cropped.size[1] < 10:
                continue
            
            file_name = f"{count:06d}.jpg"
            image_cropped.save(os.path.join(save_dir, file_name))
            new_path = os.path.join(save_dir, file_name)
            label = new_path + "\t" + label
            labels.append(label)

            count += 1
    print(f"Create {count} images")

    with open(os.path.join(save_dir, "labels.txt"), 'w') as f:
        for label in labels:
            f.write(f"{label}\n")
    


split_bounding_boxes(image_paths=image_path, image_labels=image_label, image_bbs=image_boxes, root_dir=data_dir, save_dir="dataset_text_recognition")

179.74366460789818
(274, 195)
174.3938906191737
(679, 183)
168.40684788902442
(622, 174)
160.04178956539812
(446, 187)
200.70364992942126
(95, 87)
229.7518843254884
(197, 88)
238.98457684737878
(115, 81)
208.23240355084354
(228, 139)
205.55492540079695
(165, 109)
197.1346886349544
(71, 106)
210.4765328475069
(218, 141)
172.5817728782845
(847, 172)
167.66635515550965
(502, 81)
169.0295499800876
(540, 31)
155.38473713428533
(409, 69)
161.01171706728698
(265, 73)
160.42351492767006
(57, 19)
135.28183421516755
(63, 15)
155.9980694980695
(111, 14)
159.51071428571427
(70, 16)
152.62282282282283
(74, 15)
128.28710089399743
(145, 18)
128.42938468992247
(64, 43)
123.4662568306011
(61, 40)
175.6451133407655
(138, 52)
176.71166276346605
(350, 61)
160.43034722222222
(64, 75)
176.21172516803585
(103, 52)
164.99279388647648
(363, 59)
183.9180614394518
(374, 77)
183.0517496853814
(644, 102)
178.4957264957265
(99, 26)
187.4148148148148
(85, 27)
184.22317188983857
(54, 26)
179.11131313131312
(132, 25)


-----
$$ Prepare  data  finish$$