In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the dataset and
# output paths used by the cells below can be reached.
from google.colab import drive
drive.mount('/content/drive')

*English below*

Aqui estão as regras de validação para cada campo, sem justificativas, e as regras de `append` também incluídas:

### Regras de Validação

1. **PERMISSÃO**:
   - Deve ser `"PERMISSÃO"` ou estar vazio.

2. **ACC**:
   - Deve ser `"AUTORIZADO"` ou estar vazio.

3. **VÁLIDA EM TODO**:
   - Deve ser um número (`isdigit()`) e `width < height`.

4. **Nº REGISTRO**:
   - Deve ser um número (`isdigit()`) e `width > height`.

5. **CPF**:
   - Deve seguir o formato `xxx.xxx.xxx-xx` (regex: `\d{3}\.\d{3}\.\d{3}-\d{2}`).

6. **CAT. HAB.**:
   - Deve ter no máximo duas letras e não pode estar na lista de estados brasileiros.

7. **1ª HABILITAÇÃO, DATA NASCIMENTO, VALIDADE**:
   - Deve conter uma data no formato `dd/mm/aaaa` (regex: `\d{2}/\d{2}/\d{4}`).

8. **DOC. IDENTIDADE/ÓRG EMISSOR/UF**:
   - Deve ser um número, ou duas letras de um estado brasileiro, ou ter 3 ou 4 caracteres, ou conter letras e números.

9. **FILIAÇÃO**:
   - Deve conter apenas letras (espaços são permitidos), não pode conter a palavra `"PERMISSÃO"` e deve ter mais de 3 caracteres.

### Regras de Append

1. **FILIAÇÃO**:
   - Todas as transcrições válidas são adicionadas.

2. **DOC. IDENTIDADE/ÓRG EMISSOR UF** ou **DOC. IDENTIDADE/ÓRG EMISSOR/UF**:
   - Se o valor contiver letras e números, o candidato mais próximo é adicionado.
   - Caso contrário, até 3 transcrições válidas são adicionadas.

3. **Outros Candidatos**:
   - O candidato válido mais próximo é adicionado, se não estiver duplicado.

Here are the validation rules for each field, without justifications, including the append rules:

## Validation Rules:

1. **PERMISSÃO**:
   - Must be `"PERMISSÃO"` or empty.

2. **ACC**:
   - Must be `"AUTORIZADO"` or empty.

3. **VÁLIDA EM TODO**:
   - Must be a number (`isdigit()`) and `width < height`.

4. **Nº REGISTRO**:
   - Must be a number (`isdigit()`) and `width > height`.

5. **CPF**:
   - Must follow the format `xxx.xxx.xxx-xx` (regex: `\d{3}\.\d{3}\.\d{3}-\d{2}`).

6. **CAT. HAB.**:
   - Must have a maximum of two letters and cannot be in the list of Brazilian states.

7. **1ª HABILITAÇÃO, DATA NASCIMENTO, VALIDADE**:
   - Must contain a date in the format `dd/mm/yyyy` (regex: `\d{2}/\d{2}/\d{4}`).

8. **DOC. IDENTIDADE/ÓRG EMISSOR/UF**:
   - Must be a number, or two letters of a Brazilian state, or have 3 or 4 characters, or contain both letters and numbers.

9. **FILIAÇÃO**:
   - Must contain only letters (spaces are allowed), must not contain the word `"PERMISSÃO"`, and must have more than 3 characters.

## Append Rules

1. **FILIAÇÃO**:
   - All valid transcriptions are added.

2. **DOC. IDENTIDADE/ÓRG EMISSOR UF** or **DOC. IDENTIDADE/ÓRG EMISSOR/UF**:
   - If the value contains both letters and numbers, the closest candidate is added.
   - Otherwise, up to 3 valid transcriptions are added.

3. **Other Candidates**:
   - The closest valid candidate is added if not duplicated.


In [None]:
import math
import os
import re

# Field labels looked up in the annotations, in the order they are processed.
label_list = [
    'NOME',
    'CPF',
    'PERMISSÃO',
    'ACC',
    'CAT. HAB.',
    '1ª HABILITAÇÃO',
    'VALIDADE',
    'Nº REGISTRO',
    'VÁLIDA EM TODO',
    'DATA NASCIMENTO',
    # Three spellings of the identity-document label.
    'DOC. IDENTIDADE/ÓRG EMISSOR/UF',
    'DOC. IDENTIDADE/ÓRG EMISSOR UF',
    'DOC IDENTIDADE/ÓRG EMISSOR/UF',
    'FILIAÇÃO',
]

# Two-letter codes of the Brazilian federative units.
brazilian_states = (
    "AC AL AP AM BA CE DF ES GO "
    "MA MT MS MG PA PB PR PE PI "
    "RJ RN RS RO RR SC SP SE TO"
).split()

def convert_to_tuples(conteudo):
    """Parse annotation text into ``(x, y, width, height, transcription)`` tuples.

    The first line of *conteudo* is skipped (it is treated as a header).
    Two line layouts are accepted:

    * ``x, y, width, height, transcription``
    * a line starting with ``[``: every ``[`` and ``]`` on the line is
      removed, the first four numbers are read as ``x1, y1, x2, y2`` and
      converted to a width/height by subtraction.

    Each line is split on at most the first four ``', '`` separators, so a
    transcription that itself contains ``', '`` is kept whole instead of
    being truncated (bracket layout) or dropped (plain layout).

    Args:
        conteudo: Full annotation text, one box per line after the header.

    Returns:
        A list of ``(x, y, width, height, transcription)`` tuples with
        integer coordinates. Plain-layout lines with fewer than five fields
        are ignored.

    Raises:
        ValueError: If a coordinate field is not an integer.
    """
    boxes = []

    for line in conteudo.strip().split('\n')[1:]:
        if line.startswith('['):
            # Alternative layout: corner coordinates wrapped in brackets.
            parts = line.replace('[', '').replace(']', '').split(', ', 4)
            x = int(parts[0])
            y = int(parts[1])
            width = int(parts[2]) - x
            height = int(parts[3]) - y
            boxes.append((x, y, width, height, parts[-1]))
            continue

        # Original layout: four integers followed by the transcription.
        values = line.split(', ', 4)
        if len(values) == 5:
            x, y, width, height = (int(value) for value in values[:4])
            boxes.append((x, y, width, height, values[4]))

    return boxes


# Every spelling of the identity-document label that appears in the annotations.
_DOC_ID_LABELS = (
    'DOC. IDENTIDADE/ÓRG EMISSOR UF',
    'DOC. IDENTIDADE/ÓRG EMISSOR/UF',
    'DOC IDENTIDADE/ÓRG EMISSOR/UF',
)


def validate_transcription(label, transcription, width, height):
    """Tell whether *transcription* is an acceptable value for the field *label*.

    Rules per label:

    * ``PERMISSÃO``: text equals ``"PERMISSÃO"``.
    * ``ACC``: text equals ``"AUTORIZADO"``.
    * ``VÁLIDA EM TODO``: digits only and ``width < height``.
    * ``Nº REGISTRO``: digits only and ``width > height``.
    * ``CPF``: starts with ``ddd.ddd.ddd-dd``.
    * ``CAT. HAB.``: at most two characters and not a Brazilian state code.
    * ``1ª HABILITAÇÃO`` / ``DATA NASCIMENTO`` / ``VALIDADE``: starts with
      ``dd/dd/dddd``.
    * identity-document labels (all three spellings): digits only, or a state
      code, or 3-4 characters long, or letters and digits mixed (only
      alphanumerics and whitespace).
    * ``FILIAÇÃO``: letters and spaces only, longer than 3 characters and not
      containing ``"PERMISSÃO"``.

    Any other label accepts every transcription.

    Args:
        label: Field label being filled.
        transcription: Candidate text.
        width: Width of the candidate box.
        height: Height of the candidate box.

    Returns:
        ``True`` when the candidate satisfies the rule for *label*.
    """
    if label == 'PERMISSÃO':
        return transcription == "PERMISSÃO"
    elif label == 'ACC':
        return transcription == "AUTORIZADO"
    elif label == 'VÁLIDA EM TODO':
        return transcription.isdigit() and width < height
    elif label == 'Nº REGISTRO':
        return transcription.isdigit() and width > height
    elif label == 'CPF':
        # re.match anchors only at the start, so trailing text is tolerated.
        return bool(re.match(r'\d{3}\.\d{3}\.\d{3}-\d{2}', transcription))
    elif label == 'CAT. HAB.':
        # NOTE(review): only the length is checked here, not that the
        # characters are letters - confirm this is intended.
        return len(transcription) <= 2 and transcription not in brazilian_states
    elif label in ['1ª HABILITAÇÃO', 'DATA NASCIMENTO', 'VALIDADE']:
        return bool(re.match(r'\d{2}/\d{2}/\d{4}', transcription))
    elif label in _DOC_ID_LABELS:
        # The dot-less spelling used to fall through to ``return True`` and
        # therefore accepted any text as an identity-document value.
        has_letter = any(c.isalpha() for c in transcription)
        has_digit = any(c.isdigit() for c in transcription)
        return (
            transcription.isdigit() or
            (len(transcription) == 2 and transcription in brazilian_states) or
            len(transcription) == 3 or
            len(transcription) == 4 or
            (has_letter and has_digit and all(c.isalnum() or c.isspace() for c in transcription))
        )
    elif label == 'FILIAÇÃO':
        return transcription.replace(" ", "").isalpha() and "PERMISSÃO" not in transcription and len(transcription) > 3
    return True

def calculate_distance(box1, box2):
    """Return the Euclidean distance between the (x, y) origins of two boxes.

    Only the first two items of each box are read; any remaining items
    (width, height, transcription) are ignored.
    """
    delta_x = box2[0] - box1[0]
    delta_y = box2[1] - box1[1]
    return math.sqrt(delta_x ** 2 + delta_y ** 2)

def sort_by_label(label_list, boxes):
    """Build an ordered list pairing each label box with the value box(es) chosen for it.

    Labels are visited in the order given by ``label_list``. For each label,
    the first box whose text equals the label is looked up; when one exists,
    value boxes are selected from ``boxes`` according to the label:

    * ``'VÁLIDA EM TODO'``: paired with a nine-digit box when one is available;
      when there are exactly two such label boxes and no ``'VALIDADE'`` box,
      the upper one is re-emitted as ``'VALIDADE'`` followed by the nearest date.
    * ``'PERMISSÃO'``: when exactly two boxes carry that text, both are emitted
      ordered by ``y``.
    * ``'FILIAÇÃO'``: the label followed by every valid candidate.
    * identity-document labels: the label followed by the nearest valid
      candidate (if any valid candidate mixes letters and digits) or by up to
      three valid candidates.
    * any other label: the label followed by the nearest valid candidate.

    Args:
        label_list: Label texts to look for; also used to exclude label boxes
            from the candidate pool.
        boxes: Tuples indexed as ``(x, y, width, height, text)``.

    Returns:
        A list of box tuples. If nothing was emitted but a
        ``'DATA NASCIMENTO'`` label box was seen, the list holds only that box.
    """
    result = []
    # Texts already consumed as values, so they are not offered to later labels.
    added_transcriptions = set()
    added_labels = set()
    birth_date = None
    first_validates_in_all = None
    second_validates_in_all = None
    validates_in_all_boxes = [box for box in boxes if box[4] == 'VÁLIDA EM TODO']
    validity_box = next((box for box in boxes if box[4] == 'VALIDADE'), None)
    mirror_value = None

    # Two 'VÁLIDA EM TODO' boxes and no 'VALIDADE' box: keep them ordered by y
    # so the upper one can stand in for the missing 'VALIDADE' label below.
    if len(validates_in_all_boxes) == 2 and not validity_box:
        first_validates_in_all, second_validates_in_all = sorted(validates_in_all_boxes, key=lambda box: box[1])

    for label in label_list:
        box_label = next((box for box in boxes if box[4] == label and label not in added_labels), None)
        if box_label:
            added_labels.add(label)

            # Unused nine-digit texts.
            # NOTE(review): the last condition compares box[0] with box[1]
            # (x against y) - confirm this is the intended geometric test.
            candidates_number = []
            for box in boxes:
                if re.match(r'^\d{9}$', box[4]) and box[4] not in added_transcriptions and (box[0] < box[1] or box[0] == -1):
                    candidates_number.append(box)

            # mirror_value is never reset, so it keeps the last value found
            # for any earlier label.
            if candidates_number:
                mirror_value = candidates_number[0]

            if label == 'DATA NASCIMENTO':
                birth_date = box_label

            if label == 'VÁLIDA EM TODO':
                if mirror_value:
                   result.append(box_label)
                   result.append(mirror_value)
                   added_transcriptions.add(mirror_value[4])

                # NOTE(review): whenever candidates_number is non-empty,
                # mirror_value is set above, so this branch looks unreachable.
                elif candidates_number and not second_validates_in_all:
                    candidate_next_number = min(candidates_number, key=lambda box: calculate_distance(box_label, box))
                    result.append(candidate_next_number)
                    added_transcriptions.add(candidate_next_number[4])

            # A new if/elif/else chain starts here: for 'VÁLIDA EM TODO' the
            # generic ``else`` below still runs when this condition is false.
            if not validity_box and first_validates_in_all and second_validates_in_all and label == 'VÁLIDA EM TODO':
                candidates_date = [
                    box for box in boxes
                    if re.match(r'\d{2}/\d{2}/\d{4}', box[4]) and box[4] not in added_transcriptions
                ]
                if candidates_date:
                    candidate_next_date = min(candidates_date, key=lambda box: calculate_distance(first_validates_in_all, box))
                    # Re-emit the upper 'VÁLIDA EM TODO' box under the text 'VALIDADE'.
                    result.append((first_validates_in_all[0], first_validates_in_all[1], first_validates_in_all[2], first_validates_in_all[3], 'VALIDADE'))
                    result.append(candidate_next_date)
                    # NOTE(review): the literal 'VALIDADE' is recorded here,
                    # not the date text that was just emitted - confirm.
                    added_transcriptions.add('VALIDADE')

            elif label == 'PERMISSÃO':
                candidate = [box for box in boxes if box[4] == "PERMISSÃO" and box[4] not in added_transcriptions]
                if len(candidate) == 2:
                    candidate_ordenados = sorted(candidate, key=lambda box: box[1])
                    result.extend(candidate_ordenados)
                    added_transcriptions.update(box[4] for box in candidate_ordenados)
            else:
                # Candidate pool: boxes that are neither labels nor already used.
                candidate = [box for box in boxes if box[4] not in label_list and box[4] not in added_transcriptions]
                # NOTE(review): box_label is a tuple while the other values
                # added to added_transcriptions are box texts, so the second
                # test appears to be always true.
                if candidate and box_label not in added_transcriptions:
                    valid_candidates = [box for box in candidate if validate_transcription(label, box[4], box[2], box[3])]

                    if label == 'FILIAÇÃO' and valid_candidates:
                        # The emitted values are not added to added_transcriptions here.
                        result.append(box_label)
                        result.extend(valid_candidates)
                    elif label in ['DOC. IDENTIDADE/ÓRG EMISSOR UF', 'DOC. IDENTIDADE/ÓRG EMISSOR/UF', 'DOC IDENTIDADE/ÓRG EMISSOR/UF'] and valid_candidates:
                        added_line = False
                        for candidates in valid_candidates:
                              # First candidate mixing letters and digits triggers the
                              # single-value path; the value emitted is the nearest of
                              # all valid candidates, not necessarily this one.
                              if any(c.isalpha() for c in candidates[4]) and any(c.isdigit() for c in candidates[4]):
                                  valid_candidates = [box for box in valid_candidates if box[4] not in added_transcriptions]
                                  if valid_candidates and not added_line:
                                      candidates_next = min(valid_candidates, key=lambda box: calculate_distance(box_label, box))
                                      result.append(box_label)
                                      result.append(candidates_next)
                                      added_transcriptions.add(candidates_next[4])
                                      added_line = True
                                      break
                        if not added_line:
                              # No mixed letters/digits candidate: keep up to three values.
                              valid_candidates = [box for box in valid_candidates if box[4] not in added_transcriptions][:3]
                              result.append(box_label)
                              result.extend(valid_candidates)
                              added_transcriptions.update(box[4] for box in valid_candidates)
                    elif valid_candidates:
                        valid_candidates = [box for box in valid_candidates if box[4] not in added_transcriptions]
                        if valid_candidates:
                            candidates_next = min(valid_candidates, key=lambda box: calculate_distance(box_label, box))
                            result.append(box_label)
                            result.append(candidates_next)
                            added_transcriptions.add(candidates_next[4])

    # Nothing was paired: fall back to the 'DATA NASCIMENTO' label box alone, if seen.
    if not result and birth_date:
        result.append(birth_date)

    return result

def save_sorted_to_file(new_content, file_name, output_folder='/content/drive/MyDrive/news_gt'):
    """Reorder the boxes in *new_content* and write them to a text file.

    The content is parsed with ``convert_to_tuples``, ordered with
    ``sort_by_label`` and written one box per line as
    ``x, y, width, height, transcription``. The output file keeps the base
    name of *file_name* with a ``.txt`` extension.

    Args:
        new_content: Annotation text (header line first).
        file_name: Path or name of the source file; only its base name is used.
        output_folder: Destination directory, created when missing.
    """
    boxes = convert_to_tuples(new_content)
    ordered_result = sort_by_label(label_list, boxes)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    stem = os.path.splitext(os.path.basename(file_name))[0]
    file_name_output = os.path.join(output_folder, f'{stem}.txt')

    # Write UTF-8 explicitly: the labels contain accented characters, and the
    # files are read back as UTF-8 by the label audit.
    with open(file_name_output, 'w', encoding='utf-8') as f:
        f.writelines(
            f"{x}, {y}, {width}, {height}, {transcription}\n"
            for x, y, width, height, transcription in ordered_result
        )

    print(f"File saved in: {file_name_output}")

def process_folder(folder='/content/drive/MyDrive/BD/BID Dataset/CNH_Frente'):
    """Reorder every ``.txt`` annotation file found directly in *folder*.

    Each file is read as Latin-1, lines containing one of the fixed document
    header phrases are dropped, and the remaining text is handed to
    ``save_sorted_to_file`` together with the source path.

    Args:
        folder: Directory holding the annotation files.
    """
    # Header phrases that are not field data; built once, not per file.
    ignore_list = [
        'REPÚBLICA FEDERATIVA DO BRASIL', 'MINISTÉRIO DAS CIDADES',
        'DEPARTAMENTO NACIONAL DE TRÂNSITO', 'CARTEIRA NACIONAL DE HABILITAÇÃO',
        'O TERRITÓRIO NACIONAL', 'REPÚBLICA FEDERATIVA DO BRA',
    ]

    # Sorted so runs are reproducible; os.listdir order is arbitrary.
    txt_files = sorted(f for f in os.listdir(folder) if f.endswith('.txt'))

    for file in txt_files:
        file_path = os.path.join(folder, file)

        with open(file_path, 'r', encoding='latin1') as f:
            filtered_lines = [
                line for line in f
                if not any(ignore in line for ignore in ignore_list)
            ]

        save_sorted_to_file(''.join(filtered_lines), file_path)

# Run the conversion over the whole dataset folder when this cell executes.
process_folder()

In [None]:
import os

# Folder holding the reordered annotation files to audit.
folder_txt = '/content/drive/MyDrive/news_gt'

# For every known label: the file codes where it was found / not found.
present = dict((label, []) for label in label_list)
absent = dict((label, []) for label in label_list)

# Collect the distinct fifth-column values of an annotation file.
def check_labels(arquivo_txt):
    """Return the set of fifth-column texts found in *arquivo_txt*.

    The file is read as UTF-8. Each line is stripped and split on ``','``;
    only lines yielding exactly five fields contribute, and the fifth field
    is stripped of surrounding whitespace before being collected.
    """
    with open(arquivo_txt, 'r', encoding='utf-8') as handle:
        rows = (raw.strip().split(',') for raw in handle)
        return {fields[4].strip() for fields in rows if len(fields) == 5}

# Derive the file code: the text before the first '_' of a '*_gt_ocr.txt' name.
def extract_code(file_name):
    """Return the part of *file_name* before its first underscore.

    Names that do not end with ``'_gt_ocr.txt'`` yield ``None``.
    """
    if not file_name.endswith('_gt_ocr.txt'):
        return None
    code, _, _ = file_name.partition('_')
    return code

# Audit every .txt file in the folder: record, per label, which file codes
# contain the label and which do not. Sorted so the report is reproducible.
for file_name in sorted(os.listdir(folder_txt)):
    if not file_name.endswith('.txt'):
        continue

    file_path = os.path.join(folder_txt, file_name)
    labels_found = check_labels(file_path)
    file_code = extract_code(file_name)
    if file_code is None:
        # Not a '*_gt_ocr.txt' name: report the file name itself rather than None.
        file_code = file_name

    for label in label_list:
        if label == 'DOC. IDENTIDADE/ÓRG EMISSOR/UF':
            # Any spelling of the identity-document label counts for this entry.
            found = (
                'DOC. IDENTIDADE/ÓRG EMISSOR/UF' in labels_found
                or 'DOC. IDENTIDADE/ÓRG EMISSOR UF' in labels_found
                or 'DOC IDENTIDADE/ÓRG EMISSOR/UF' in labels_found
            )
        else:
            found = label in labels_found

        if found:
            present[label].append(file_code)
        else:
            absent[label].append(file_code)

# Per-label summary of the audit.
for label in label_list:
    print(f"Label '{label}':")
    print(f"  Total number of files that have the label: {len(present[label])}")
    print(f"    File codes: {present[label] if present[label] else 'No file contains this label.'}")
    print(f"  Total number of files missing the label: {len(absent[label])}")
    print(f"    File codes: {absent[label] if absent[label] else 'No files are missing this label.'}")
    print("-" * 50)
