In [1]:
# Mount Google Drive into the Colab filesystem; the dataset and output paths used
# by the cells below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


*English below*

## Regras de Validação (PT)

1. **Número com mais dígitos (somente números)**  
   - Deve conter **apenas dígitos** (`text.isdigit()`).
   - Deve ser o texto numérico com **mais dígitos** entre os candidatos.

2. **Alfanumérico (letras e números)**  
   - Deve conter **ao menos uma letra** e **ao menos um número**.
   - Exemplo: `"A123456"`, `"12AB34"`.

3. **Número com menos dígitos (somente números)**  
   - Deve conter **apenas dígitos** (`text.isdigit()`).
   - Deve ser o texto numérico com **menos dígitos** entre os candidatos restantes (após selecionar o da linha 2).

4. **LOCAL**  
   - Deve conter um **hífen (`-`)**.
   - **Não** deve conter `"DETRAN"`.
   - **Não** deve conter dígitos.

5. **DATA EMISSÃO**  
   - Deve estar no formato `dd/mm/aaaa`.  
   - Expressão regular: `\d{2}/\d{2}/\d{4}`.

6. **ASSINATURA DO EMISSOR**  
   - Deve estar **abaixo da metade da imagem** (`y > image_height // 2`).
   - Deve conter **apenas letras** (sem dígitos ou hífens).
   - Pode conter espaços.

7. **OBSERVAÇÕES**  
   - Deve estar **acima da metade da imagem** (`y <= image_height // 2`).

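8. **Candidato mais próximo**  
   - Para os campos de valor único (LOCAL e DATA EMISSÃO), quando mais de um candidato passa na validação, selecionar o **mais próximo** com base na menor **distância euclidiana** entre a origem `(x, y)` do candidato e a do rótulo do campo. Se nenhum candidato for válido, apenas o rótulo é gravado.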

---

## Validation Rules (EN)

1. **Number with most digits (only numbers)**  
   - Must contain **only digits** (`text.isdigit()`).
   - Should be the numeric text with the **most digits** among candidates.

2. **Alphanumeric (letters and numbers)**  
   - Must contain **at least one letter** and **at least one digit**.
   - Example: `"A123456"`, `"12AB34"`.

3. **Number with fewer digits (only numbers)**  
   - Must contain **only digits** (`text.isdigit()`).
   - Should be the numeric text with the **fewest digits** among remaining candidates (after picking line 2).

4. **LOCAL**  
   - Must contain a **hyphen (`-`)**.
   - **Must not** contain `"DETRAN"`.
   - **Must not** contain digits.

5. **DATA EMISSÃO (ISSUE DATE)**  
   - Must follow the format `dd/mm/yyyy`.  
   - Regex: `\d{2}/\d{2}/\d{4}`.

6. **ASSINATURA DO EMISSOR (ISSUER'S SIGNATURE)**  
   - Must be **below half the image** (`y > image_height // 2`).
   - Must contain **only letters** (no digits or hyphens).
   - Can include spaces.

7. **OBSERVAÇÕES (OBSERVATIONS)**  
   - Must be **above half the image** (`y <= image_height // 2`).

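8. **Closest candidate**  
   - For the single-value fields (LOCAL, DATA EMISSÃO), when more than one candidate passes validation, select the **closest** one based on the **shortest Euclidean distance** between the candidate's `(x, y)` origin and that of the field label. If no candidate is valid, only the label is written.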



In [None]:
import math
import os
import re

# Field labels searched for among the OCR boxes. sort_by_label walks this list
# in order, so it also fixes the order in which the fields are emitted.
label_list = [
    'LOCAL',
    'DATA EMISSÃO',
    'ASSINATURA DO EMISSOR',
    'OBSERVAÇÕES',
]

def convert_to_tuples(conteudo):
    """Parse annotation text into ``(x, y, width, height, transcription)`` tuples.

    The first line is always skipped (NOTE(review): presumably a column
    header -- confirm against the dataset files). Two row layouts are handled:

    * Rows starting with ``[``: brackets are stripped, the first four
      comma-separated values are read as ``x1, y1, x2, y2`` and converted to
      ``x, y, width, height``; the last comma-separated piece is the
      transcription.
    * Plain rows ``x, y, width, height, transcription``.

    Plain rows are split on the first four ``', '`` separators only, so a
    transcription that itself contains ``', '`` is kept whole instead of the
    row being dropped for having too many fields.

    Args:
        conteudo: Full text of one annotation file.

    Returns:
        List of ``(x, y, width, height, transcription)`` tuples in file order.

    Raises:
        ValueError: If a bracketed row, or a plain row with exactly five
            fields, has a non-integer coordinate.
        IndexError: If a bracketed row has fewer than four values.
    """
    boxes = []

    for line in conteudo.strip().split('\n')[1:]:
        if line.startswith('['):
            parts = line.replace('[', '').replace(']', '').split(', ')
            x = int(parts[0])
            y = int(parts[1])
            # The bracketed layout stores opposite corners, not a size.
            width = int(parts[2]) - x
            height = int(parts[3]) - y
            # NOTE(review): only the last piece is kept, so a bracketed row
            # whose transcription contains ', ' is truncated. Left as-is
            # because the number of coordinate fields in this layout is not
            # visible from here.
            boxes.append((x, y, width, height, parts[-1]))
            continue

        # maxsplit=4 keeps any further ', ' inside the transcription.
        fields = line.split(', ', 4)
        if len(fields) != 5:
            continue

        transcription = fields[4]
        try:
            x, y, width, height = (int(value) for value in fields[:4])
        except ValueError:
            if ', ' in transcription:
                # Rows with more than five pieces used to be dropped; keep
                # dropping them when their leading fields are not coordinates.
                continue
            raise
        boxes.append((x, y, width, height, transcription))

    return boxes

def validate_transcription(label, transcription, width, height, y, image_height):
    """Tell whether ``transcription`` is an acceptable value for field ``label``.

    The text is stripped of surrounding whitespace before checking.

    * ``'LOCAL'``: contains a hyphen, does not contain ``'DETRAN'`` and has no
      digit characters.
    * ``'DATA EMISSÃO'``: is exactly ``dd/dd/dddd`` (two, two and four digits).
    * ``'ASSINATURA DO EMISSOR'``: ``y`` is strictly below half the image
      height and the text, ignoring spaces, is purely alphabetic.
    * ``'OBSERVAÇÕES'``: ``y`` is at or above half the image height.
    * Any other label: always accepted.

    ``width`` and ``height`` are accepted but not used by any rule.

    Returns:
        bool: True when the text satisfies the rule for ``label``.
    """
    text = transcription.strip()
    half_height = image_height // 2
    contains_digit = any(ch.isdigit() for ch in text)

    if label == 'LOCAL':
        if '-' not in text or 'DETRAN' in text:
            return False
        return not contains_digit

    if label == 'DATA EMISSÃO':
        return re.fullmatch(r'\d{2}/\d{2}/\d{4}', text) is not None

    if label == 'ASSINATURA DO EMISSOR':
        if y <= half_height:
            return False
        if not text.replace(" ", "").isalpha():
            return False
        return '-' not in text and not contains_digit

    if label == 'OBSERVAÇÕES':
        return y <= half_height

    return True

def calculate_distance(box1, box2):
    """Return the Euclidean distance between the ``(x, y)`` origins of two boxes.

    Only the first two items of each box are read; sizes and text are ignored.
    """
    dx = box2[0] - box1[0]
    dy = box2[1] - box1[1]
    return math.sqrt(dx ** 2 + dy ** 2)

def sort_by_label(label_list, boxes):
    """Arrange boxes as a sequence of ``label, value(s)`` groups.

    For every label in ``label_list`` (in order) whose text matches a box
    exactly, the label box is emitted followed by its value box(es):

    * ``'ASSINATURA DO EMISSOR'``: every valid candidate, ordered by ``y``.
    * ``'OBSERVAÇÕES'``: every valid candidate, ordered by ``(y, x)``.
    * any other label: the single valid candidate nearest to the label box,
      or nothing if there is none.

    A candidate is a box whose text is not itself a label, has not already
    been emitted as a value, and passes ``validate_transcription`` for the
    label. The image height used for validation is estimated as the largest
    ``y + height`` among the boxes. Labels with no matching box, and boxes
    never selected as a value, are left out of the result.

    Args:
        label_list: Label texts to look for, in output order.
        boxes: ``(x, y, width, height, transcription)`` tuples.

    Returns:
        List of boxes in output order (empty when ``boxes`` is empty).
    """
    ordered = []
    if not boxes:
        return ordered

    used_texts = set()
    seen_labels = set()

    # Fields that take every valid candidate, with the key that orders them.
    multi_value_order = {
        'ASSINATURA DO EMISSOR': lambda b: b[1],
        'OBSERVAÇÕES': lambda b: (b[1], b[0]),
    }

    image_height = max(top + box_height for (_, top, _, box_height, _) in boxes)

    for label in label_list:
        # A label repeated in label_list is only handled the first time.
        if label in seen_labels:
            continue

        header = next((b for b in boxes if b[4] == label), None)
        if header is None:
            continue

        seen_labels.add(label)
        ordered.append(header)

        eligible = []
        for b in boxes:
            text = b[4]
            if text in label_list or text in used_texts:
                continue
            if validate_transcription(label, text, b[2], b[3], b[1], image_height):
                eligible.append(b)

        if label in multi_value_order:
            chosen = sorted(eligible, key=multi_value_order[label])
        elif eligible:
            chosen = [min(eligible, key=lambda b: calculate_distance(header, b))]
        else:
            chosen = []

        for b in chosen:
            ordered.append(b)
            used_texts.add(b[4])

    return ordered

def _select_code_boxes(boxes):
    """Pick the boxes that are written first in the output file.

    Returns a 3-tuple ``(longest_digits, first_alphanumeric, second_digits)``
    where each item is a box or ``None``:

    * ``longest_digits``: the digit-only box with the most characters;
    * ``first_alphanumeric``: the first box mixing letters and digits;
    * ``second_digits``: the digit-only box ranked second by length.

    NOTE(review): the notebook's rule 3 asks for the digit-only text with the
    *fewest* digits. Taking the second-longest is the same thing only when
    there are exactly two digit-only boxes -- confirm which one is intended.
    """
    alphanum_box = None
    digit_boxes = []

    for box in boxes:
        text = box[4].strip()
        if text.isdigit():
            digit_boxes.append(box)
        elif (
            alphanum_box is None
            and any(c.isalpha() for c in text)
            and any(c.isdigit() for c in text)
        ):
            alphanum_box = box

    # Stable sort: boxes of equal length keep their file order.
    digit_boxes.sort(key=lambda b: len(b[4].strip()), reverse=True)
    longest = digit_boxes[0] if len(digit_boxes) > 0 else None
    second = digit_boxes[1] if len(digit_boxes) > 1 else None
    return longest, alphanum_box, second


def save_sorted_to_file(new_content, file_name,
                        output_folder='/content/drive/MyDrive/news_gt_back'):
    """Sort the boxes of one annotation file and write them to ``output_folder``.

    The output starts with the code boxes chosen by ``_select_code_boxes``
    (those that exist, in that order), followed by the label/value sequence
    from ``sort_by_label`` with the code boxes removed. Each box is written
    as ``x, y, width, height, transcription`` on its own line.

    Args:
        new_content: Text of the (already filtered) annotation file.
        file_name: Path of the source file; only its base name without
            extension is used to name the output ``.txt`` file.
        output_folder: Directory that receives the output file; created if
            missing. Defaults to the folder this notebook has always used.
    """
    boxes = convert_to_tuples(new_content)
    ordered_result = sort_by_label(label_list, boxes)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    stem = os.path.splitext(os.path.basename(file_name))[0]
    file_name_output = os.path.join(output_folder, f'{stem}.txt')

    code_boxes = [b for b in _select_code_boxes(boxes) if b is not None]

    # The code boxes go first, so drop them from the label sequence if they
    # were also selected there as a field value.
    for b in code_boxes:
        if b in ordered_result:
            ordered_result.remove(b)

    final_result = code_boxes + ordered_result

    # Pin the encoding so the output does not depend on the platform locale
    # (the transcriptions contain non-ASCII characters).
    with open(file_name_output, 'w', encoding='utf-8') as f:
        for x, y, width, height, transcription in final_result:
            f.write(f"{x}, {y}, {width}, {height}, {transcription}\n")

    print(f"File saved in: {file_name_output}")



def process_folder(folder='/content/drive/MyDrive/BD/BID Dataset/CNH_Verso'):
    """Filter and re-sort every ``.txt`` annotation file found in ``folder``.

    Each file is read as latin1, lines containing any of the ignored phrases
    are removed, and the remaining text is handed to ``save_sorted_to_file``
    together with the source path.

    Args:
        folder: Directory holding the input ``.txt`` files. Defaults to the
            dataset folder this notebook has always used. Only files directly
            inside it are processed.
    """
    # Lines containing any of these substrings are dropped before sorting.
    ignore_list = [
        'PROIBIDO PLASTIFICAR',
        'ASSINATURA DO PORTADOR',
        'DEPARTAMENTO NACIONAL DE TRÂNSITO',
        'DETRAN',
    ]

    for file in os.listdir(folder):
        if not file.endswith('.txt'):
            continue

        file_path = os.path.join(folder, file)

        with open(file_path, 'r', encoding='latin1') as f:
            kept_lines = [
                line for line in f
                if not any(ignore in line for ignore in ignore_list)
            ]

        save_sorted_to_file(''.join(kept_lines), file_path)

# Run the batch over the dataset folder; prints one "File saved in" line per file.
process_folder()


File saved in: /content/drive/MyDrive/news_gt_back/00010519_gt_ocr.txt
File saved in: /content/drive/MyDrive/news_gt_back/00010528_gt_ocr.txt
File saved in: /content/drive/MyDrive/news_gt_back/00010520_gt_ocr.txt
File saved in: /content/drive/MyDrive/news_gt_back/00010522_gt_ocr.txt
File saved in: /content/drive/MyDrive/news_gt_back/00010525_gt_ocr.txt
File saved in: /content/drive/MyDrive/news_gt_back/00010523_gt_ocr.txt
File saved in: /content/drive/MyDrive/news_gt_back/00010534_gt_ocr.txt
File saved in: /content/drive/MyDrive/news_gt_back/00010533_gt_ocr.txt
File saved in: /content/drive/MyDrive/news_gt_back/00010530_gt_ocr.txt
File saved in: /content/drive/MyDrive/news_gt_back/00010532_gt_ocr.txt
File saved in: /content/drive/MyDrive/news_gt_back/00010531_gt_ocr.txt
File saved in: /content/drive/MyDrive/news_gt_back/00010535_gt_ocr.txt
File saved in: /content/drive/MyDrive/news_gt_back/00010541_gt_ocr.txt
File saved in: /content/drive/MyDrive/news_gt_back/00010542_gt_ocr.txt
File s

In [3]:
import os

# Folder the sorting step writes its output files to.
folder_path = '/content/drive/MyDrive/news_gt_back/'
# Names (not full paths) of the .txt files directly inside that folder.
txt_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]

# Sanity check: report how many .txt files the folder now holds.
print(f"Total de arquivos .txt: {len(txt_files)}")


Total de arquivos .txt: 3600
