In [2]:
import transformers
import torch
from transformers import AutoTokenizer
import gzip
import json
from random import randint
import unicodedata
import re

#from eval_metrics import calculate_metrics
from Bio.Align import PairwiseAligner

In [None]:
from eval_metrics import calculate_metrics
import optuna
import datetime as dt 

In [3]:
def align(input_text, output_text):
    aligner = PairwiseAligner()
    aligner.mode = 'global'
    aligner.target_end_gap_score = 0.0
    aligner.query_end_gap_score = 0.0
    aligner.open_gap_score = -1  
    aligner.extend_gap_score = -0.5 # trying out stuff
    alignments = aligner.align(input_text, output_text)
    alignment = alignments[0]
    return alignment

def read_data(fname, max_examples=None):
    examples = []
    with open(fname, "rt", encoding="utf-8") as f: #gzip.
        for line in f:
            example = json.loads(line)
            examples.append(example)
            if max_examples and len(examples) >= max_examples:
               break
    return examples

def sliding_window(tokens, window_size, prompt_size=0):
    start = randint(prompt_size, max(prompt_size, tokens["input_ids"].size()[1] - window_size))
    truncated_tokens = tokens["input_ids"][:,start:start+window_size]
    truncated_attention_mask = tokens["attention_mask"][:,start:start+window_size]
    return { "input_ids" : truncated_tokens.to(device), "attention_mask" : truncated_attention_mask.to(device)}


#examples = read_data("/scratch/project_2000539/jenna/ocr-correction/by_page_dev_slim.jsonl.gz")

#outputs = read_data("/scratch/project_2005072/cassandra/ocr-postcorrection-lm/out.jsonl")
#tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat", padding_side="left")
#tokenizer.pad_token = tokenizer.eos_token


In [50]:
def extract_aligned_text(text_for_extraction, alignment, with_indices=False, i_start=0, i_end=0):       #extract the matching part of the text based of alignment
    #print(str(alignment).split('\n'))
    base_text, alignment_string, query_text = alignment[0], craft_alignment_string(alignment), alignment[1]
    
    if not with_indices:
        matches = list(re.finditer(r'\|+', alignment_string))
        
        last_match = max(matches, key=lambda m: m.end())
        first_match = min(matches, key=lambda m: m.start())
        start, end = first_match.start(), last_match.end()
    elif with_indices:
        start, end = i_start, i_end

    character_start = base_text[start]
    while character_start=="-":
        character_start = base_text[start+1]
        start+=1
    character_end = base_text[end]
    while character_end=="-":
        character_end = base_text[end-1]
        end-=1
        
    start_total = 0
    end_total = 0

    spaces_count_in_aligned = 0                    #taking into account the leading spaces added by the alignment (sometimes?)
    for char in base_text.replace("-", ""):        #doesn't matter that we replace with nothing, it's temporary and we only look at the leading spaces
        if char == ' ':
            spaces_count_in_aligned += 1
        else:
            break

    spaces_count_in_extraction_text = 0
    for char in text_for_extraction:
        if char == ' ':
            spaces_count_in_extraction_text += 1
        else:
            break

    diff = spaces_count_in_aligned - spaces_count_in_extraction_text
    #print("diff in spaces:" , diff)

    diff=max(diff, 0)
    start_total = base_text[diff:start+1].count(character_start)  #trying to get the rank of a character
    end_total = base_text[diff:end+1].count(character_end)    
        
    start_count, end_count = 0, 0
    index_start, index_end = 0, 0

    #print("character_start:",character_start, "start_total:", start_total,
     #     "character_end:", character_end, "end_total:", end_total)

    for i in range(len(text_for_extraction)):          #basically just counting how many times the characters appear to try and find the position it was in the original text based on rank
        if start_count==start_total&end_count==end_total:
            break
        if end_count<end_total:
            if text_for_extraction[i]==character_end:
                end_count+=1
                index_end=i
        if start_count<start_total:
            if text_for_extraction[i]==character_start:
                start_count+=1
                index_start=i

    if end_count!=end_total:     #happens when the ending character is "-". we just then return the whole text. Should not happend anymore 
        index_end = -1
    #print("startcount", start_count)
    #aligned_text = base_text[index_start:index_end]
    return index_start, index_end 

def find_longest_sequence_of_ones(smoothed_scores_list):  #gpt inspired function
    max_length = 0
    current_length = 0
    start_index = 0
    best_start_index = -1
    best_end_index = -1
    
    for i, value in enumerate(smoothed_scores_list):
        if value == 1:
            if current_length == 0:
                start_index = i 
            current_length += 1
        else:
            if current_length > max_length:
                max_length = current_length
                best_start_index = start_index
                best_end_index = i - 1
            current_length = 0

    #check if the longest sequence ends at the last element
    if current_length > max_length:
        max_length = current_length
        best_start_index = start_index
        best_end_index = len(data) - 1

    #print(f"The longest sequence of 1s starts at index {best_start_index} and ends at index {best_end_index}.")
    
    return best_start_index, best_end_index

def craft_alignment_string(alignment):   #function to overcome formatting discrepancies
    aligned_sequence1 = alignment[0]
    aligned_sequence2 = alignment[1]
    
    alignment_string=""
    
    for idx, char in enumerate(aligned_sequence1):
        if char==aligned_sequence2[idx]:
            alignment_string+="|"
        elif char=="-" or aligned_sequence2[idx]=="-":
            alignment_string+="-"
        else:
            alignment_string+="."

    return alignment_string


def smoothing_window(text_for_extraction, alignment, smoothing_range=10, minimal_score=0.8):  #averaging the surrounding scores to try and spot the "dense" places of alignment
    base_text, alignment_string, query_text = alignment[0], craft_alignment_string(alignment), alignment[1]

    n = len(alignment_string)
    left_side_smoothing_scores = [0 for i in range(n)]
    right_side_smoothing_scores = [0 for i in range(n)]

    #going from left to right
    for i in range(n):
        window = alignment_string[i-smoothing_range:i]
        score = window.count("|")/smoothing_range
        left_side_smoothing_scores[i] = score

    #going from right to left 
    for i in reversed(range(n)):
        window = alignment_string[i:i+smoothing_range]
        score = window.count("|")/smoothing_range
        right_side_smoothing_scores[i] = score

    smoothed_scores = [max(left_side_smoothing_scores[i], right_side_smoothing_scores[i]) for i in range(n)]
    is_good_enough = []
    for elt in smoothed_scores:
        if elt>=minimal_score:
            is_good_enough.append(1)
        else:
            is_good_enough.append(0)

    start_smooth_alignment, end_smooth_alignment = find_longest_sequence_of_ones(is_good_enough)

    #extracting the text for evaluation
    start, end = extract_aligned_text(text_for_extraction, alignment, with_indices=True, i_start=start_smooth_alignment, i_end=end_smooth_alignment)

    return start, end

def normalize(text):
    text = " ".join(text.replace("\n", " ").split())
    return unicodedata.normalize("NFKC", text)


In [32]:
a = """ as to composers in the fifteenth century; for not only the madrigals that were invented after the new notation were at that time printed, but many of the old ones were made to assume this more per- sect form, and, therefore, are preserved even to this day. "; Su7mer is Icumen, a celebrated madrigal for fix voices, the manuscript of which is now in the British Museum, was composed about 1460. SKEL- TON, in the reign of HENRY the seventh, wrote songs, which were composed in parts by CORNISI1, and many others might be mentioned. FRANCHINUS, who wrote a work which wa~ printed at MILAN, gives some of the first examples for the improvement of musical notation, but these characters were cut out in blocks; the Gernans, however, improved upon this practise, and that art seems to have arrived to something like perfection about the year 1500, so that this improvement seemed ready for the use it was pur to afterwards in ENGLAND; but it came to no perfection till about 1560, when a very indultrious man, of the name of JOHN DAY, published the Church Service in four and three parts. IIis labours were a good deal ac- celerated by STERIIOLD) and HOPKINS; who, in addition to the novelty of introducing their New Version of the Psalms, brought forth the Cantiones of "'ALLIS and BIRD, two names of fulIicicnt con- YOKERS, who, in their turn, introduced the notation of the modern music, which was afterwards used by the English, and which is now in general use. The first book of this notation was published in 1558, and the last in 1578. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1585, contains the harmonies of the psalms, and the fifth, which was published in 1590, contains the music of the com- positions of the two above-mentioned authors. The first book of the New Version of the Psalms was published in 1562, and the last in 1575. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1578, contains the harmonies of the psalms, and the fifth, which was published in 1585, contains the music of the compositions of STERIOLD and HOPKINS. The sixth book, which was published in 1590, contains the music of the compositions of ALLIS and BIRD. The first book of the New Version of the Psalms was published in 1562, and the last in 1575. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1578, contains the harmonies of the psalms, and the fifth, which was published in 1585, contains the music of the compositions of STERIOLD and HOPKINS. The sixth book, which was published in 1590, contains the music of the compositions of ALLIS and BIRD. The first book of the New Version of the Psalms was published in 1562, and the last in 1575. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1578, contains the harmonies of the psalms, and the fifth, which was published in 1585, contains the music of the compositions of STERIOLD and HOPKINS. The sixth book, which was published in 1590, contains the music of the compositions of ALLIS and BIRD. The first book of the New Version of the Psalms was published in 1562, and the last in 1575. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1578, contains the harmonies of the psalms, and the fifth, which was published in 1585, contains the music of the compositions of STERIOLD and HOPKINS. The sixth book, which was published in 1590, contains the music of the compositions of ALLIS and BIRD. The first book of the New Version of the Psalms was published in 1562, and the last in 1575. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1578, contains the harmonies of the psalms, and the fifth, which was published in 1585, contains the music of the compositions of STERIOLD and HOPKINS. The sixth book, which was published in 1590, contains the music of the compositions of ALLIS and BIRD. The first book of the New Version of the Psalms was published in 1562, and the last in 1575. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1578, contains the harmonies of the psalms, and the fifth, which was published in 1585, contains the music of the compositions of STERIOLD and HOPKINS. The sixth book, which was published in 1590, contains the music of the compositions of ALLIS and BIRD. The first book of the New Version of the Psalms was published in 1562, and the last in 1575. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1578, contains the harmonies of the psalms, and the fifth, which was published in 1585, contains the music of the compositions of STERIOLD and HOPKINS. The sixth book, which was published in 1590, contains the music of the compositions of ALLIS and BIRD. The first book of the New Version of the Psalms was published in 1562, and the last in 1575. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1578, contains the harmonies of the psalms, and the fifth, which was published in 1585, contains the music of the compositions of STERIOLD and HOPKINS. The sixth book, which was published in 1590, contains the music of the compositions of ALLIS and BIRD. The first book of the New Version of the Psalms was published in 1562, and the last in 1575. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1578, contains the harmonies of the psalms, and the fifth, which was published in 1585, contains the music of the compositions of STERIOLD and HOPKINS. The sixth book, which was published in 1590, contains the music of the compositions of ALLIS and BIRD. The first book of the New Version of the Psalms was published in 1562, and the last in 1575. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1578, contains the harmonies of the psalms, and the fifth, which was published in 1585, contains the music of the compositions of STERIOLD and HOPKINS. The sixth book, which was published in 1590, contains the music of the compositions of ALLIS and BIRD. The first book of the New Version of the Psalms was published in 1562, and the last in 1575. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1578, contains the harmonies of the psalms, and the fifth, which was published in 1585, contains the music of the compositions of STERIOLD and HOPKINS. The sixth book, which was published in 1590, contains the music of the compositions of ALLIS and BIRD. The first book of the New Version of the Psalms was published in 1562, and the last in 1575. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1578, contains the harmonies of the psalms, and the fifth, which was published in 1585, contains the music of the compositions of STERIOLD and HOPKINS. The sixth book, which was published in 1590, contains the music of the compositions of ALLIS and BIRD. The first book of the New Version of the Psalms was published in 1562, and the last in 1575. The first book contains the rules of the notation, and the second and third books contain the music. The fourth book, which was published in 1578, contains the harmonies of the psalms, and the fifth, which was published in 1585, contains the music of the compositions of STERIOLD and HOPKINS. The six Icumen, a celebrated madrigal for fix voices, the manuscript of which is now in the British Museum, was composed about 1460. SKEL- TON, in the reign of HENRY the seventh, wrote songs, which were composed in parts by CORNISI1, and many others might be mentioned. FRANCHINUS, who wrote a work which wa~ printed at MILAN, gives some of the first examples for the improvement of musical notation, but these characters were cut out in blocks; the Gernans, however, improved upon this practise, and that art seems to have arrived to something like perfection about the year 1500, so that this improvement seemed ready for the use it was pur to afterwards in ENGLAND; but it came to no perfection till about 1560, when a very indultrious man, of the name of JOHN DAY, published the Church Service in four and three parts. IIis labours were a good deal ac- celerated by"""

In [33]:
b = """ certainty as to compoſers in the fifteenth century; for not only the madrigals that were invented after the new notation were at that time printed, but many of the old ones were made to aſſume this more per-fect form, and, therefore, are preſerved even to this day. "Sumer is Icumen, a celebrated madrigal for ſix voices, the manuſcript of which is now in the Britiſh Muſeum, was compoſed about 1460. SKEL-TON, in the reign of HENRY the ſeventh, wrote ſongs, which were compoſed in parts by CORNISH, and many others might be mentioned. FRANCHINUS, who wrote a work which was printed at MILAN, gives ſome of the firſt examples for the improvement of muſical notation, but theſe characters were cut out in blocks; the Germans, however, improved upon this practiſe, and that art ſeems to have arrived to ſomething like perfection about the year 1500, ſo that this improvement ſeemed ready for the uſe it was put to afterwards in ENGLAND; but it came to no perfection till about 1560, when a very induſtrious man, of the name of JOHN DAY, publiſhed the Church Service in four and three parts. His labours were a good deal ac-celerated by STERHOLD and HOPKINS; who, in addition to the novelty of introducing their New Verſion of the Pſalms, brought forth the Cantiones of TALLIS and BIRD, two names of ſufficient con-ſequence"""

In [52]:
al = align(normalize(a),normalize(b))

start, end = smoothing_window(a, al)

#print(al)
print("\n")
print("result:")
print(a[start:end])
print("\n")



#print(start, end)
al = align(normalize(b),normalize(a[start:end]))

start2, end2 = smoothing_window(b, al)

start2+=2 #if minimal score hasn't been touched
end2-=2



#start2+=len("but so it goes. I also Sung")
#end2+=len("but so it goes. I also Sung")

#print(al[0], craft_alignment_string(al), al[1])
print("\n")
print("result:")
print(b[start2:end2])



result:
as to composers in the fifteenth century; for not only the madrigals that were invented after the new notation were at that time printed, but many of the old ones were made to assume this more per- sect form, and, therefore, are preserved even to this day. "; Su7mer is Icumen, a celebrated madrigal for fix voices, the manuscript of which is now in the British Museum, was composed about 1460. SKEL- TON, in the reign of HENRY the seventh, wrote songs, which were composed in parts by CORNISI1, and many others might be mentioned. FRANCHINUS, who wrote a work which wa~ printed at MILAN, gives some of the first examples for the improvement of musical notation, but these characters were cut out in blocks; the Gernans, however, improved upon this practise, and that art seems to have arrived to something like perfection about the year 1500, so that this improvement seemed ready for the use it was pur to afterwards in ENGLAND; but it came to no perfection till about 1560, when a very i

In [8]:
test = "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||----------------------------"

In [36]:
count=0
for char in test:
    if char=="|":
        break
    count+=1
print(count)

print(b[183:])

183
by GUIDERUS and ARVIRAGUS over  F I D E L E, supposed to be dead.  By the Same.  I. T O fair Fidele's grassy tomb  Soft maids, and village hinds lhall bring  Each op'ning fieet, of earliert bloom, And rifle all the breathing Spring.  II.  No wailing ghost shall dare appear  To vex with shrieks this quiet grove:  But Ihepherd lads assemble here, And melting harps their notes shall move.


In [17]:
#al=align(input_text=outputs[7]["output"].replace("\n", " "), output_text=examples[1]["output"].replace("\n", " "))


a = al[0]
b = al[1]

c=""

for idx, char in enumerate(a):
    if char==b[idx]:
        c+="|"
    elif char=="-" or b[idx]=="-":
        c+="-"
    else:
        c+="."

print(c==str(al).split("\n")[1])
print(str(al))

True
yself to one poem per poet—which means that the impetus for this list actually gets bumped for the widely quoted (and misunderstood) “The Road Not Taken,” but so it goes. I also Sung by GUIDERUS and ARVIRAGUS over F I D E L E, supposed to be dead. By the Same. I. T O fair Fidele's grassy tomb Soft maids, and village hinds lhall bring Each op'ning fieet, of earliert bloom, And rifle all the breathing Spring. II. No wailing ghost shall dare appear To vex with shrieks this quiet grove: But Ihepherd lads assemble here, And melting harps their notes shall move.
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

In [5]:
test1 = " A SONG FROM SHAKESPEAR's CYMBELINE. Sung by GUIDERUS and ARVIRAGUS over FIDELE, ſuppoſed to be dead. By the Same. I. TO fair Fidele's graſſy tomb Soft maids, and village hinds ſhall bring Each op'ning ſweet, of earlieſt bloom, And rifle all the breathing Spring. II. No wailing ghoſt ſhall dare appear To vex with ſhrieks this quiet grove: But ſhepherd lads aſſemble here, And melting virgins own their love. "

In [7]:
test2="Sung by GUIDERUS and ARVIRAGUS over F I D E L E, supposed to be dead. By the Same. I. T O fair Fidele's grassy tomb Soft maids, and village hinds lhall bring Each op'ning fieet, of earliert bloom, And rifle all the breathing Spring. II. No wailing ghost shall dare appear To vex with shrieks this quiet grove: But Ihepherd lads assemble here, And melting virgins, in their notes, III. With voices tun'd to sweetest air, Shall sing, while Fidele's dust is seen, A requiem, that shall make the flowers Burst from their earth, and dance in green. IV. The winds, that now their wings unfold, Shall hear, and from their mountain caves Unfold their harps, and join the sounds Of liquid grief, and liquid joys. V. While Fidele's voice, in silent air, Shall hover round the weeping bower, And bid the flowers, and all things there, Weep, till their tears become a shower. VI. And when the last sad note is done, And all things seem to sink in woe, The sun shall rise, and scatter light O'er Fidele's grave, and all below. VII. And Fidele's spirit, from its sleep, Shall rise, and on the flowers sit, And smile upon the weeping grief Of those who loved him, and will miss. VIII. Thus shall the living mourn the dead, And all things pay their tribute due; While Fidele's name, like music, spread O'er earth, and all the world below. Note: GUIDERUS and ARVIRAGUS are characters in the poem, possibly representing the speaker or a group of people mourning the loss of Fidele. The poem is a tribute to Fidele, who is supposed to be dead, and it describes a series of events where nature, spirits, and even the sun come together to honor his memory."

In [10]:
print(test1)
print(" ".join(examples[1]["reference"].replace("\n", " ").split()))

 A SONG FROM SHAKESPEAR's CYMBELINE. Sung by GUIDERUS and ARVIRAGUS over FIDELE, ſuppoſed to be dead. By the Same. I. TO fair Fidele's graſſy tomb Soft maids, and village hinds ſhall bring Each op'ning ſweet, of earlieſt bloom, And rifle all the breathing Spring. II. No wailing ghoſt ſhall dare appear To vex with ſhrieks this quiet grove: But ſhepherd lads aſſemble here, And melting virgins own their love. 
A SONG FROM SHAKESPEAR's CYMBELINE. Sung by GUIDERUS and ARVIRAGUS over FIDELE, ſuppoſed to be dead. By the Same. I. TO fair Fidele's graſſy tomb Soft maids, and village hinds ſhall bring Each op'ning ſweet, of earlieſt bloom, And rifle all the breathing Spring. II. No wailing ghoſt ſhall dare appear To vex with ſhrieks this quiet grove: But ſhepherd lads aſſemble here, And melting virgins own their love.


In [7]:
input_test, output_test = examples[0]["input"], examples[0]["output"]

a = str(align(input_test.replace("\n", ""), output_test.replace("\n", "")))
a=a.split("\n")
print(a)

['certainty as to compos-ers in the fifteenth century;-for not only the madrigals that were invented after-the new notation were at that time printed, but many-of the old ones were made to ass--ume this more per-s-ect form, and, therefore, are pres-erved even to this-day. "; Su7mer is Icumen, ------------------a celebrated madrigal forf--ix voices, the manus-cript of which is now in the-Britis-h Mus-eum, was compos-ed about 1460. SKEL-TON, in the reign of HENRY the s-eventh, wrotes--ongs, which were compos-ed in parts by CORNISI1-,-and many others might be mentioned.---------------FRANCHINUS, who wrote a work which wa~--printed at MILAN, gives s-ome of the firs-t examples-for the improvement of mus-ical notation, but thes-e-characters were cut out in blocks; the Gern-ans,-however, improved upon this practis-e, and that arts--eems to have arrived to s-omething like perfection-about the year 1500, s-o that this improvements--eemed ready for the us-e it was pur- to afterwards in-ENGLAND; 

In [28]:
def extract_aligned_text(text_for_extraction, alignment, with_indices=False, i_start=0, i_end=0):       #still in progress
    #print(str(alignment).split('\n'))
    base_text, alignment_string, query_text, _ = str(alignment).split('\n')

    if not with_indices:
        matches = list(re.finditer(r'\|+', alignment_string))
        
        last_match = max(matches, key=lambda m: m.end())
        first_match = min(matches, key=lambda m: m.start())
        start, end = first_match.start(), last_match.end()
    elif with_indices:
        start, end = i_start, i_end

    character_start = base_text[start]
    character_end = base_text[end]
    start_total = 0
    end_total = 0
                                                    #the rest is not so clean and just a bunch of tricky stuff 

    spaces_count_in_aligned = 0                    #taking into account the leading spaces added by the alignment
    for char in base_text.replace("-", " "):
        if char == ' ':
            spaces_count_in_aligned += 1
        else:
            break

    spaces_count_in_extraction_text = 0
    for char in text_for_extraction:
        if char == ' ':
            spaces_count_in_extraction_text += 1
        else:
            break

    diff = spaces_count_in_aligned - spaces_count_in_extraction_text
    
    start_total = base_text[diff:start].count(character_start)  #trying to get the rank of a character
    end_total = base_text[diff:end].count(character_end)    

    start_count, end_count = 0, 0
    index_start, index_end = 0, 0

    print("character_start:",character_start, "start_total:", start_total,
          "character_end:", character_end, "end_total:", end_total)
    for i in range(len(text_for_extraction)):          #basically just counting how many times the characters appear to try and find the position it was in the original text
        if start_count==start_total&end_count==end_total:
            print("wow")
            break
        if end_count<end_total:
            if text_for_extraction[i]==character_end:
                end_count+=1
                index_end=i
        if start_count<start_total:
            if text_for_extraction[i]==character_start:
                print("wowie")
                start_count+=1
                index_start=i

    if end_count!=end_total:     #happens when the ending character is "-". we just then return the whole text 
        index_end = -1

    print(start_count, end_count)
    #aligned_text = base_text[index_start:index_end]
    return index_start, index_end 

def find_longest_sequence_of_ones(smoothed_scores_list):  #gpt inspired function
    max_length = 0
    current_length = 0
    start_index = 0
    best_start_index = -1
    best_end_index = -1
    
    for i, value in enumerate(smoothed_scores_list):
        if value == 1:
            if current_length == 0:
                start_index = i 
            current_length += 1
        else:
            if current_length > max_length:
                max_length = current_length
                best_start_index = start_index
                best_end_index = i - 1
            current_length = 0

    #check if the longest sequence ends at the last element
    if current_length > max_length:
        max_length = current_length
        best_start_index = start_index
        best_end_index = len(data) - 1
    
    print(f"The longest sequence of 1s starts at index {best_start_index} and ends at index {best_end_index}.")
    
    return best_start_index, best_end_index

def smoothing_window(text_for_extraction, alignment, smoothing_range=10, minimal_score=0.8):
    base_text, alignment_string, query_text, _ = str(alignment).split("\n")

    n = len(alignment_string)
    left_side_smoothing_scores = [0 for i in range(n)]
    right_side_smoothing_scores = [0 for i in range(n)]

    #going from left to right
    for i in range(n):
        window = alignment_string[i-smoothing_range:i]
        score = window.count("|")/smoothing_range
        left_side_smoothing_scores[i] = score

    #going from right to left 
    for i in reversed(range(n)):
        window = alignment_string[i:i+smoothing_range]
        score = window.count("|")/smoothing_range
        right_side_smoothing_scores[i] = score

    smoothed_scores = [min(left_side_smoothing_scores[i], right_side_smoothing_scores[i]) for i in range(n)]
    is_good_enough = []
    for elt in smoothed_scores:
        if elt>=minimal_score:
            is_good_enough.append(1)
        else:
            is_good_enough.append(0)

    start_smooth_alignment, end_smooth_alignment = find_longest_sequence_of_ones(is_good_enough)

    print(start_smooth_alignment, end_smooth_alignment)
    print(base_text[571:1352])
    #extracting the text for evaluation
    start, end = extract_aligned_text(text_for_extraction, alignment, with_indices=True, i_start=start_smooth_alignment, i_end=end_smooth_alignment)

    return start, end
    
# Example usage
#base_text = a[0]
#alignment_string = a[1]
#query_text = a[2]
alignment = base_text+"\n"+alignment_string+"\n"+query_text+"\n"
s, e =smoothing_window(text_for_extraction, alignment)
print('start:', s, "end:", e)
aligned_text = text_for_extraction[s:e]
print("Aligned Text:", aligned_text)

The longest sequence of 1s starts at index 583 and ends at index 1177.
583 1177
----  FRANCHINUS, who wrote a work which wa~  printed at MILAN, gives some of the first examples for the improvement of musical notation, but these characters were cut out in blocks; the Gernans, however, improved upon this practise, and that art seems to have arrived to something like perfection about the year 1500, so that this improvement seemed ready for the use it was pur to afterwards in ENGLAND; but it came to no perfection till about 1560, when a very indultrious man, of the name of JOHN DAY, published the Church Service in four and three parts. IIis labours were a good deal ac- celerated by STERIIOLD) and HOPKINS; who, in addition to the novelty of introducing their New Version of the Psalms, brought forth the Cantiones of "'ALLIS and BIRD, two names of fulIicic
character_start: I start_total: 3 character_end:   end_total: 203
wowie
wowie
wowie
3 203
start: 502 end: 1306
Aligned Text: I1,
and many 

In [21]:
alignment = """                A SONG FROM SHAKESPEAR's CYMBELINE.  ----   ----        ------------ ---------  Sung by GUIDERUS and ARVIRAGUS over- F-I-D-E-L-E, ſuppoſed to be dead.  ------------  --------------------------------  -----------------------------------------  --------------------------------------------------------------------------  ---  ----------------------------------  -------------------------------------  B-y the----- Sa----------------------me--------------------------------------. --------------------------------------------------------------------  -------  ----  -----  ----  ---  -------  ---  ----  ----  ---  ---  ----  -----  ----  ---  -------  I-. ------------------------------------------------------------------------------------  ----  -----  -----  ----  ---  -----  -----  --  T-------------------------------------O f---------------a---i-----------------------------------------------------------------------r -------Fide----------------------------------------------------------------------------------------------------------------------------------------------l------------e-'s --------------------------------------------gr-----a------------ſſy tomb --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------  ----  -----  -------  ---  -----  ---  -----  ----  S-------------------------------------------------------o---------------------------------f------------------------------t-------------------------- m-------------------------------------------a---------------i--------------------d------------------------s, ---------------------------------------------------------------------a------------------------nd -------------v-----------------------i-------------------------------------------------------ll-----------------a-------------g--------------e ----------------------------------------------------------h----------------i--------------------------------------------------nd-----------------s- ſhall---------- bri---------------------------------------------------ng  ----------------------------------------  ------  ---  -----  -----  -----  ---  -----  --- Ea-----------------ch--------- o------------------------------------------------------p'n---------------------------------------i-------------------------------------------------n-------------------------------------g---------------- ſw----------------------------------------------------------------------------------------e------------------------e------t,------------------------------------ of -------------------------------------------------------------------------e------a-----------------------------------rl----------------------------------------------------------------------ie-----------------------------------------------------------ſt blo-------------------------------------------o------------m---------------, --------------------------------------  ------  ------  ---  -----  -----  -----  ---  -----  And ------------------------------------ri-f----------------------------l----------e-------------- all the-------------------------- br---------------e-----------a----------th-----------in-------------------g S---------------pri--------------------------------------------------n-------------------------------------g-----------. ------------------------------  -----  ---  ---  ---  ------  -----  ---  --  ------  ------  ---  -----  -----  -----  ---  -----  ---  ---  ---  ------  -----  ---  --  ------  ----II.  ------------------------  ---  -----  ---  ---  ---  ------  -----  --- No----------------------- w---------------------------------------------------------ail-------------i---------------------------------------------------n-------------------------------------g------------------------------------------------------------------------------------------- g-----------------h----o--ſt ſhall ----------------------d----------------------a--------------r------------------e -----------------------------------------------------------a-----------------------------ppe-----------a-------------r -----------  ---  ---  ------  -----  ---  --  ------  ---  T-------------------------------o------------------- --v-------------------------------------ex w-------------------------------------------------------------------------i-----------------------------------------------------------th----------- ſh---------------ri---------------------------------------------------------------------eks- th----------------i----------------------------------------------------------------------s ---------------------------------------------------qu-----------------------------------------------ie----t gr-------------------------------------------------------------------ove-:  -----  ---  --  ------  ------  ---  -----  -----   Bu-t ſhe-------------------pherd l------------------------------------------------------a-------------------------d-----------------s ---a----------------------------ſſe----------------m------------------------ble her-----------------------------------------------------------------e-------, --------------------------------------  ------  ---  ---  -----  -----  -----  ---  -----  And-------------------------------------------------------------------- m-------e---l-----ting -------------vir-------------------g-i---------------------------------------------------n------------------s------------- o----------------------w-----------------n the---------------------ir- love------------------------------. -----------  -----  -----  ---  -----  ---  ---  ---  ------  -----  ---  --  ------  ----  ---  -----  -----  -----  ---  -----  ---  ---  ---  ------  -----  ---\n--------------.||||||||||||||||||||||||||||||||||||||----|.|----||....||------------|---------.||||||||||||||||||||||||||||||||||||-||-|-|-|-|-|||.||||.||||||||||||||||------------||--------------------------------||-----------------------------------------||--------------------------------------------------------------------------||---||----------------------------------||-------------------------------------|||-.|.||-----|.|----------------------||--------------------------------------||--------------------------------------------------------------------||-------||----||-----||----||---||-------||---||----||----||---||---||----||-----||----||---||-------|||-||------------------------------------------------------------------------------------||----||-----||-----||----||---||-----||-----||--|||-------------------------------------.||---------------|---|-----------------------------------------------------------------------||-------.|||----------------------------------------------------------------------------------------------------------------------------------------------|------------|-.||--------------------------------------------.|-----|------------...||||||--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------||----||-----||-------||---||-----||---||-----||----|||-------------------------------------------------------|---------------------------------|------------------------------|--------------------------||-------------------------------------------|---------------|--------------------|------------------------|||---------------------------------------------------------------------|------------------------|||-------------|-----------------------|-------------------------------------------------------||-----------------|-------------|--------------||----------------------------------------------------------|----------------|--------------------------------------------------||-----------------|-|.||||----------|.||---------------------------------------------------|.||----------------------------------------||------||---||-----||-----||-----||---||-----||---|.|-----------------.|---------||------------------------------------------------------..|---------------------------------------|-------------------------------------------------|-------------------------------------|----------------|.|----------------------------------------------------------------------------------------|------------------------|------.|------------------------------------||||-------------------------------------------------------------------------|------|-----------------------------------||----------------------------------------------------------------------||-----------------------------------------------------------..|.||-------------------------------------------|------------|---------------||--------------------------------------||------||------||---||-----||-----||-----||---||-----||||||------------------------------------||-|----------------------------|----------|--------------|||||.||--------------------------|.|---------------|-----------|----------.|-----------.|-------------------.||---------------.||--------------------------------------------------|-------------------------------------|-----------||------------------------------||-----||---||---||---||------||-----||---||--||------||------||---||-----||-----||-----||---||-----||---||---||---||------||-----||---||--||------||----|||||------------------------||---||-----||---||---||---||------||-----||---|.|-----------------------||---------------------------------------------------------|.|-------------|---------------------------------------------------|-------------------------------------|-------------------------------------------------------------------------------------------||-----------------|----|--..|.|||||----------------------|----------------------|--------------|------------------||-----------------------------------------------------------|-----------------------------..|-----------|-------------||-----------||---||---||------||-----||---||--||------||---|||-------------------------------|-------------------|--|-------------------------------------|.||-------------------------------------------------------------------------|-----------------------------------------------------------.|-----------|.|---------------||---------------------------------------------------------------------|.|-|.|----------------|----------------------------------------------------------------------||---------------------------------------------------.|-----------------------------------------------||----.|||-------------------------------------------------------------------|||-.||-----||---||--||------||------||---||-----||-----||..|-.|.||-------------------.|||.||------------------------------------------------------|-------------------------|-----------------||---|----------------------------..|----------------|------------------------.|.||||-----------------------------------------------------------------|-------||--------------------------------------||------||---||---||-----||-----||-----||---||-----|||||--------------------------------------------------------------------||-------|---|-----..|.|-------------|.|-------------------|-|---------------------------------------------------|------------------|-------------||----------------------|-----------------||.||---------------------.|-|||||------------------------------||-----------||-----||-----||---||-----||---||---||---||------||-----||---||--||------||----||---||-----||-----||-----||---||-----||---||---||---||------||-----||---\n--------------# A SONG FROM SHAKESPEAR's CYMBELINE.  [71] A SONG  FROM  SHAKESPEAR's CYMBELINE. Sung by GUIDERUS and ARVIRAGUS over  F I D E L E, supposed to be dead.  By the Same.  I. T O fair Fidele's grassy tomb  Soft maids, and village hinds lhall bring  Each op'ning fieet, of earliert bloom, And rifle all the breathing Spring.  II.  No wailing ghost shall dare appear  To vex with shrieks this quiet grove:  But Ihepherd lads assemble here, And melting virgins own their love.  E 4 III. No  tum'l'd  bones  shall  stir  To  wail  the  departed  soul:  But  lilting  lips  shall  sing  her  praise,  And  lips  that  are  not  lips  shall  join  the  chorus.  IV.  The  lark  and  linnet  shall  affirm  Her  memory  in  the  air:  And  every  bird  that  flies  Shall  echo  her  sweet  song.  V.  The  blossom  that  her  hand  plucked  first  Shall  wear  its  garland  to  the  end:  And  every  flower  That  grows  Shall  wear  its  pride  upon  its  head.  VI.  The  stream  that  flow'd  from  off  Her  eye  Shall  be  her  tears  for ever:  And  every  drop  That  drops  Shall  be  her  tears  upon  the  earth.  VII.  The  moon  and  stars  Shall  light  her  tomb  with  beams:  And  all  the  heavens  Shall  shower  her  ashes  with  their  tears.  VIII.  The  nightingale  shall  sing  her  lullaby,  And  every  bird  That  sleeps  Shall  join  her  sighs.  IX.  The  wind  shall  whisper  her  name,  And  every  tree  Shall  answer  with  a  groan.  X.  The  earth  shall  cover  her  grave,  And  all  her  friends  Shall  lie  beneath  it.  XI.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XIII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XIV.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XV.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XVI.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XVII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XVIII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XIX.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XX.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXI.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXIII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXIV.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXV.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXVI.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXVII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXVIII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXIX.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXX.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXXI.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXXII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXXIII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXXIV.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXXV.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXXVI.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXXVII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXXVIII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XXXIX.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XL.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XLI.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XLII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XLIII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XLIV.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XLV.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XLVI.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XLVII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XLVIII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  XLIX.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  L.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  LI.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  LII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  LIII.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  LIV.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  LV.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die  of  grief.  LVI.  The  world  shall  mourn  her  loss,  And  all  her  lovers  Shall  die\n"""

In [13]:
query_text = examples[4]["alignment"]["base"]

In [4]:
alignment_string = examples[4]["alignment"]["al_str"]

In [14]:
base_text= examples[4]["alignment"]["query"]

In [7]:
text_for_extraction = examples[4]["output"]

In [26]:
print(len(a))

4
