# Text Post Processing


### Setting up


In [1]:
import cv2
import pytesseract
import re
from collections import defaultdict
from datetime import datetime
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelWithLMHead,
    T5Tokenizer,
    T5ForConditionalGeneration,
)

  from .autonotebook import tqdm as notebook_tqdm


### Preprocessing the Text File

### - Removing duplicates

### - sorting lines (if they have spatial or logical sequence)

### - Merging fragments into sentences or paragraphs

### - Handling common OCR errors, like misrecognized characters


In [None]:
def clean_text_line(line):
    """clean a single line of text by removing unwanted characters."""
    return re.sub(r"[^a-zA-Z0-9.,!? ]", "", line.strip())


def preprocess_text_file(input_file, output_file):
    # preprocess the collected text into clean, deduplicated sentences
    with open(input_file, "r") as file:
        lines = file.readlines()

    # extract and clean text
    text_lines = [clean_text_line(line.split("] ", 1)[-1]) for line in lines]

    # remove duplicates and empty lines
    unique_lines = list(filter(None, sorted(set(text_lines))))

    # save processed lines to the output files
    with open(output_file, "w") as file:
        file.write("\n".join(unique_lines))

    return unique_lines


# preprocess_text_file("detected_text.txt", "final_output.txt")

In [None]:
def merge_fragments_with_local_maxima(fragments):
    """Simple rule-based merging of overlapping fragments."""
    """
    Stitch fragments by prioritizing the longest fragments and avoiding redundancy.
    """

    # Group fragments by their starting word
    fragment_groups = defaultdict(list)
    for fragment in fragments:
        starting_word = fragment.split()[0] if fragment else ""
        fragment_groups[starting_word].append(fragment)

    stitched_sentence = []
    for _, group in fragment_groups.items():
        # Sort fragments in the group by length (descending)
        largest_fragment = max(group, key=len)

        # Add the largest fragment if it's not redundant
        if not any(largest_fragment in stitched for stitched in stitched_sentence):
            stitched_sentence.append(largest_fragment)

    # Join the selected fragments into a coherent sentence
    return " ".join(stitched_sentence)

In [None]:
# Example fragments
fragments = [
    "ARE HEALED",
    "ARE HEALED OR",
    "ARE YOU",
    "ARE YOU HEALED OR",
    "JUST ISOLATED WITH NO",
    "JUST ISOLATED",
    "ONE TO TRIGGER YOU?",
    "ONE TO",
]


with open("final_output.txt", "r") as file:
    fragments = file.readlines()


final_sentence = merge_fragments_with_local_maxima(fragments)
# print("Final Sentence:", final_sentence)

summarizer = pipeline("summarization", device=0)

summary = summarizer(final_sentence, max_length=52,
                     min_length=25, do_sample=False)

print("Summary:", summary[0]["summary_text"])

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Summary:  The Daily Meditations with Darilyn Amick will feature daily Meditations from around the world . Amick is the author of a book called The Meditations of the World .


In [None]:
# Example list of fragments
# fragments = ["ARE HEALED", "ARE YOU HEALED", "ISOLATED WITH NO", "ONE TO TRIGGER YOU?"]
fragments = [
    "ARE HEALED",
    "ARE HEALED OR",
    "ARE YOU",
    "ARE YOU HEALED OR",
    "JUST ISOLATED WITH NO",
    "JUST ISOLATED",
    "ONE TO TRIGGER YOU?",
    "ONE TO",
]

# Flatten and split each fragment into words
flattened_list = [word for fragment in fragments for word in fragment.split()]

# Print the result
print(flattened_list)

['ARE', 'HEALED', 'ARE', 'HEALED', 'OR', 'ARE', 'YOU', 'ARE', 'YOU', 'HEALED', 'OR', 'JUST', 'ISOLATED', 'WITH', 'NO', 'JUST', 'ISOLATED', 'ONE', 'TO', 'TRIGGER', 'YOU?', 'ONE', 'TO']


In [None]:
fragments = [
    "ARE HEALED",
    "ARE HEALED OR",
    "ARE OR",
    "ARE YOU",
    "ARE YOU HEALED",
    "ARE YOU HEALED OR",
    "HEALED",
    "HEALED OR",
    "HEALEO",
    "IBOLATED",
    "ISOLATED",
    "ISOLATED WITH",
    "ISOLATED WITH NO",
    "JUST ISOLATED",
    "JUST ISOLATED WITH",
    "JUST ISOLATED WITH NO",
    "JUST NO",
    "JUST WITH NO",
    "ONE TO",
    "ONE TO TRIGGER",
    "ONE TO TRIGGER YOU?",
    "ONE TO YOU?",
    "ONE TRIGGER YOU?",
    "ONE YOU?",
    "TO TRIGGER",
    "TO TRIGGER YOU?",
    "TRIGGER",
    "TRIGGER YOU?",
    "TRIGOER",
    "WITH NO",
    "WITHNG",
    "WITHNO",
    "YOU HEALED",
    "YOU HEALED OR",
]

In [None]:
# fragments = [
#     "ARE HEALED",
#     "ARE HEALED OR",
#     "ARE YOU",
#     "ARE YOU HEALED OR",
#     "JUST ISOLATED WITH NO",
#     "JUST ISOLATED",
#     "ONE TO TRIGGER YOU?",
# ]

In [None]:
flattened_string = " ".join(fragments)
print(flattened_string)

ARE HEALED ARE HEALED OR ARE OR ARE YOU ARE YOU HEALED ARE YOU HEALED OR HEALED HEALED OR HEALEO IBOLATED ISOLATED ISOLATED WITH ISOLATED WITH NO JUST ISOLATED JUST ISOLATED WITH JUST ISOLATED WITH NO JUST NO JUST WITH NO ONE TO ONE TO TRIGGER ONE TO TRIGGER YOU? ONE TO YOU? ONE TRIGGER YOU? ONE YOU? TO TRIGGER TO TRIGGER YOU? TRIGGER TRIGGER YOU? TRIGOER WITH NO WITHNG WITHNO YOU HEALED YOU HEALED OR


## Other models


In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    "mrm8488/t5-base-finetuned-common_gen")


model = AutoModelWithLMHead.from_pretrained(
    "mrm8488/t5-base-finetuned-common_gen")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [65]:
def gen_sentence(words, max_length=55):
    input_text = words
    features = tokenizer([input_text], return_tensors="pt")

    output = model.generate(
        input_ids=features["input_ids"],
        attention_mask=features["attention_mask"],
        max_length=max_length,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)


words = "tree plant ground hole dig"

gen_sentence(words)

'digging a hole in the ground to plant trees'

In [None]:
# removing duplicates
unique_flattened_string = " ".join(dict.fromkeys(flattened_string.split()))
print(unique_flattened_string)

ARE HEALED OR YOU HEALEO IBOLATED ISOLATED WITH NO JUST ONE TO TRIGGER YOU? TRIGOER WITHNG WITHNO


In [67]:

final_answer = gen_sentence(unique_flattened_string)
print(final_answer)

ARE YOU HEALLED OR YOU ARE ISOLATED WITH NO JUST A TRIGGER TO USE?


In [63]:
from pybraille import convertText

braille_text = convertText(final_answer)
print(braille_text)
print(type(braille_text))

⠠⠁⠠⠗⠠⠑ ⠠⠽⠠⠕⠠⠥ ⠠⠓⠠⠑⠠⠁⠠⠇⠠⠇⠠⠑⠠⠙ ⠠⠕⠠⠗ ⠠⠽⠠⠕⠠⠥ ⠠⠁⠠⠗⠠⠑ ⠠⠊⠠⠎⠠⠕⠠⠇⠠⠁⠠⠞⠠⠑⠠⠙ ⠠⠺⠠⠊⠠⠞⠠⠓ ⠠⠝⠠⠕ ⠠⠚⠠⠥⠠⠎⠠⠞ ⠠⠁ ⠠⠞⠠⠗⠠⠊⠠⠛⠠⠛⠠⠑⠠⠗ ⠠⠞⠠⠕ ⠠⠥⠠⠎⠠⠑⠦
<class 'str'>
