# Adjustment of the segmentation of Eynollah to merge whole articles and adjust the reading order 

In this notebook, we try to adapt the segmentation output of Eynollah to the layout of the "Freedom Struggle" from 1936 in order to recognize whole articles on a page. The goal is to recognize the whole article, i.e. the headline and the associated text regions below it. This turns out to be a non-trivial task, especially with historical newspapers, as the reading order is not always intuitive. 
Afterwards, full-text indexing of the articles could be performed with an OCR pipeline.

In [1]:
from bs4 import BeautifulSoup
import cv2
import os
import numpy as np
import random
from copy import copy, deepcopy

# Input directory: the main loop reads every *.xml file from here and loads the
# page image with the same base name and a ".jpg" extension.
path= "croped/"
# Output directory: the main loop writes "result_<name>.jpg" files here.
output= "croped_colored/"

In [3]:
def delete_small(dic, min_area=1500):
    """Drop text regions whose bounding box is too small to be real text.

    Such tiny regions are most likely artefacts of the segmentation step.

    Parameters
    ----------
    dic: dictionary
        key:    name of the text region (in the format "region_00xx")
        values: a list. List[0] is the coordinate string of the region,
                List[1] is the reading order (anything accepted by ``int``).
    min_area: int, optional
        Regions whose bounding-box area is less than or equal to this value
        (in square pixels) are removed. Defaults to 1500.

    Returns
    -------
    new_dic: dictionary
        Same format as ``dic``, iterated in reading order, without the small
        regions. The reading order of every kept region is stored as ``int``
        and reduced by the number of regions that were dropped before it.
    """
    ordered = sorted(dic.items(), key=lambda item: int(item[1][1]))
    new_dic = {}

    dropped = 0
    for name, entry in ordered:
        start, end = get_start_end(entry[0])
        area = (end[0] - start[0]) * (end[1] - start[1])

        if area <= min_area:
            dropped += 1
            continue

        # shift the reading order down so removed regions leave no gaps
        new_dic[name] = [entry[0], int(entry[1]) - dropped]

    return new_dic
            

In [2]:


def get_start_end(coord_list):
    """Return the axis-aligned bounding box of a PAGE-XML points string.

    The coordinate system starts in the top left corner of the picture (0,0).

    Parameters
    ----------
    coord_list : str
        Whitespace-separated "x,y" pairs describing the polygon of a region,
        e.g. ``"10,20 30,5 15,40"``. Repeated, leading or trailing whitespace
        is tolerated.

    Returns
    -------
    start:
        a tuple of the smallest x and y value of the region (top left corner)
    end:
        a tuple of the largest x and y value of the region (bottom right corner)

    Raises
    ------
    ValueError
        If ``coord_list`` contains no points or a coordinate is not an integer.
    """
    # split() without argument collapses runs of whitespace, so stray blanks
    # or line breaks in the attribute value do not produce empty tokens
    points = [tuple(map(int, pair.split(","))) for pair in coord_list.split()]
    if not points:
        raise ValueError("coord_list contains no points")

    xs = [point[0] for point in points]
    ys = [point[1] for point in points]

    start = (min(xs), min(ys))
    end = (max(xs), max(ys))
    return start, end


## The next functions are used to combine text regions within a column

In [5]:
def combine_regions_in_column(dic,textlines):
    """Group vertically adjacent text regions of one column.

    Regions are walked in reading order. A region is put into the same group
    as its neighbour when ``check_if_near`` accepts the pair, their reading
    orders are consecutive and the region is not a one-liner (a possible
    headline). Afterwards ``possible_headline_in_column`` gets the chance to
    attach single-column headlines to the group below them.

    Parameters
    ----------
    dic: dictionary
        key:    name of the text region (in the format "region_00xx")
        values: a list. List[0] is the coordinate string of the region,
                List[1] is the reading order.
    textlines: dictionary
        key:    name of the text region
        values: list of ``[line_id, coordinate_string]`` entries, one per
                text line of the region

    Returns
    -------
    new_dic: dictionary
        key:    new consecutive ids (0, 1, 2, ...) in reading order
        values: list of region names that form one combined region
    """
    # sort once and index into the list; rebuilding list(dic.items()) for
    # every region made the loop quadratic
    entries = sorted(dic.items(), key=lambda item: int(item[1][1]))
    ordered = dict(entries)

    # dummy neighbour used before the first and after the last region
    placeholder = (0, ["0,0", 0])
    last_index = len(entries) - 1

    new_dic = {}
    count = 0

    for i, (k, v) in enumerate(entries):
        last_entry = entries[i - 1] if i > 0 else placeholder
        next_entry = entries[i + 1] if i < last_index else placeholder

        # bounding boxes of the previous, the current and the next region
        start, end = get_start_end(v[0])
        n_start, n_end = get_start_end(next_entry[1][0])
        l_start, l_end = get_start_end(last_entry[1][0])

        # cast both sides: the reading order may still be a string here
        order = int(v[1])
        next_order = int(next_entry[1][1])
        last_order = int(last_entry[1][1])

        if (check_if_near(start[0], n_start[0], end[0], n_end[0], abs(end[1] - n_start[1]))
                and order + 1 == next_order
                and not is_one_liner(k, textlines)):
            # the group continues with the next region: keep the id open
            new_dic.setdefault(count, []).append(k)

        elif (check_if_near(l_start[0], start[0], l_end[0], end[0], abs(l_end[1] - start[1]))
                and order - 1 == last_order
                and not is_one_liner(k, textlines)):
            # last region of a group: add it and close the id
            new_dic.setdefault(count, []).append(k)
            count += 1

        else:
            # stand-alone region gets an id of its own
            count += 1
            new_dic[count] = [k]
            count += 1

    # rename the keys of the dictionary for a new clean order
    new_dic = dict(enumerate(new_dic.values()))

    new_dic = possible_headline_in_column(ordered, new_dic)

    return dict(enumerate(new_dic.values()))
    
        
    

In [4]:

def check_if_near(top_left, next_top_left, bottom_right, next_bottom_right,height_dist,
                  *, max_x_diff=250, max_height_dist=20):
    """Check if two bounding boxes (one above the other) belong to the same column block.

    Parameters
    ----------
    top_left: int
        x-coordinate of the top left of the upper bounding box
    next_top_left: int
        x-coordinate of the top left of the lower bounding box
    bottom_right: int
        x-coordinate of the bottom right of the upper bounding box
    next_bottom_right: int
        x-coordinate of the bottom right of the lower bounding box
    height_dist: int
        vertical distance between the bottom of the upper bounding box and
        the top of the lower bounding box
    max_x_diff: int, optional
        The summed horizontal offset of the left and the right edges must be
        strictly below this value. Defaults to 250.
    max_height_dist: int, optional
        ``height_dist`` must be strictly below this value. Defaults to 20.

    Returns
    -------
    bool
        True if the left and right edges nearly coincide and the vertical gap
        is small, False otherwise.
    """
    x_offset = abs(top_left - next_top_left) + abs(bottom_right - next_bottom_right)
    return x_offset < max_x_diff and height_dist < max_height_dist

In [6]:
def is_one_liner(region,textlines):
    """Check if a text region consists of exactly one text line.

    A single-line region is possibly a headline.

    Parameters
    ----------
    region: string
        Name of the text region in the PAGE-XML
    textlines: dictionary
        key:    name of the text region
        values: list of ``[line_id, coordinate_string]`` entries, one per
                text line of the region

    Returns
    -------
    bool
        True if the region has exactly one text line. False if it has any
        other number of lines or is not in ``textlines`` at all (which means
        it could be a picture or something else).
    """
    return len(textlines.get(region, ())) == 1

In [7]:
def possible_headline_in_column(dic, combined):
    """Attach single-column headlines to the text group directly below them.

    For every group the group right before it (key ``i - 1``) is treated as a
    headline candidate if it consists of one, two or three regions (title,
    subtitle, ...). When the candidate lies close enough above the current
    group, its regions are moved to the front of the current group. Because
    the groups are processed in order, a title that was merged into a
    subtitle can be merged again into the following text group.

    Only single-column headlines are handled: groups whose first region is
    wider than 420 pixels are skipped.

    Parameters
    ----------
    dic: dictionary
        key:    name of the text region in the PAGE-XML ("region_00xx")
        values: a list. List[0] is the coordinate string of the region,
                List[1] is the reading order.
    combined: dictionary
        key:    reading-order id of the group; the keys are expected to be
                the consecutive integers 0, 1, 2, ...
        values: list of region names already combined into one group.
                The lists are copied and not modified.

    Returns
    -------
    new: dictionary
        Like ``combined``, but headline groups are merged into the group
        below them; emptied groups are removed (keys are not renumbered).
    """
    # work on copies so the caller's lists are left untouched
    new = {k: list(v) for k, v in combined.items()}

    for i, k in enumerate(list(new)):
        if i == 0:
            continue

        body = new[k]
        headline = new[i - 1]

        # skip regions that are wide enough to be multi-column headlines
        start, end = get_start_end(dic[body[0]][0])
        if end[0] - start[0] > 420:
            continue

        # tolerances depend on how many regions the headline candidate has;
        # change the numbers here to tune the headline bounding boxes
        size = len(headline)
        if size == 1:
            # title only (or title followed directly by text)
            left_margin, right_limit, max_y_diff = 220, start[0] + 400, 70
        elif size == 2:
            # title and subtitle
            left_margin, right_limit, max_y_diff = 280, end[0] + 40, 100
        elif size == 3:
            # headline made of three regions
            left_margin, right_limit, max_y_diff = 200, end[0] + 40, 100
        else:
            continue

        head_start, head_end = get_start_end(dic[headline[0]][0])

        if (start[0] - left_margin <= head_start[0] <= end[0]
                and start[0] <= head_end[0] <= right_limit
                and abs(head_start[1] - start[1]) < max_y_diff):
            # move all headline regions, in order, to the front of the body
            body[:0] = headline
            headline.clear()

    return {k: v for k, v in new.items() if len(v) > 0}
    


# The next functions first look for multi-column headings and then combine them with the first text section that starts in the upper left corner under the heading

In [8]:
def possible_headline_multicolumn(dic, combined,textlines):
    """Find regions that look like multi-column headlines.

    The average text-line height of all groups with more than one region is
    used as reference for body text. A group with exactly one region counts
    as headline when the region has fewer than four text lines, is wider than
    350 pixels and every one of its lines is more than 20% higher than the
    reference. Two consecutive headlines that lie directly above each other
    are returned as one entry (title and subtitle).

    Parameters
    ----------
    dic: dictionary
        key:    name of the text region of the PAGE-XML ("region_00xx")
        values: a list. List[0] is the coordinate string of the region,
                List[1] is the reading order.
    combined: dictionary
        key:    reading-order id of the combined group
        values: list of region names that form the group
    textlines: dictionary
        key:    text region id of the PAGE-XML
        values: list of all lines associated with the text region

    Returns
    -------
    titles: dictionary
        key:    consecutive ids (0, 1, 2, ...)
        values: list of one or two region names forming a headline.
        Empty when there is no multi-region group to take the reference
        line height from.
    """
    body_heights = [
        height
        for regions in combined.values() if len(regions) > 1
        for reg in regions
        for height in textline_heights(reg, textlines)
    ]
    if not body_heights:
        # without a body-text reference no headline can be told apart
        return {}
    avg_height = sum(body_heights) / len(body_heights)

    headlines = []
    for regions in combined.values():
        if len(regions) != 1:
            continue
        region = regions[0]

        # four or more text lines: very unlikely to be a headline
        if region not in textlines or len(textlines[region]) >= 4:
            continue

        start, end = get_start_end(dic[region][0])
        width = end[0] - start[0]
        heights = textline_heights(region, textlines)

        # every line has to be at least 20% higher than the average line
        if heights and width > 350 and all(h > avg_height * 1.2 for h in heights):
            headlines.append(region)

    titles = {}
    for i, title in enumerate(headlines):
        if i == 0:
            # the first headline has no predecessor to pair with
            titles[i] = [title]
            continue

        last_region = headlines[i - 1]
        last_start, last_end = get_start_end(dic[last_region][0])
        start, end = get_start_end(dic[title][0])

        x_diff = abs(start[0] - last_start[0]) + abs(end[0] - last_end[0])
        y_diff = abs(last_end[1] - start[1])

        if x_diff < 500 and y_diff < 20:
            # title and subtitle: replace the previous entry by the pair
            # NOTE(review): only pairs are built; in a run of three stacked
            # headlines the first one is dropped from the result
            titles[i] = [last_region, title]
            del titles[i - 1]
        else:
            titles[i] = [title]

    return dict(enumerate(titles.values()))

In [9]:
def textline_heights(region, textlines):
    """Collect the bounding-box height of every text line of one region.

    Parameters
    ----------
    region:    string
        A region of the PAGE-XML
    textlines: dictionary
        key:    text region id of the PAGE-XML
        values: list of all lines associated with the text region; the
                coordinate string of a line is read from index 1

    Returns
    -------
    heights:   list
        The height (in pixels) of each text line of the region, in the stored
        order. Empty if the region has no entry in ``textlines``.
    """
    boxes = (get_start_end(line[1]) for line in textlines.get(region, []))
    return [bottom_right[1] - top_left[1] for top_left, bottom_right in boxes]

In [10]:
def combine_multicolumn_text_and_headlines(dic, combined,headlines):
    """Combine a multi-column headline with the text group directly beneath it.

    Parameters
    ----------
    dic: dictionary
        key:    name of the text region of the PAGE-XML ("region_00xx")
        values: a list. List[0] is the coordinate string of the region,
                List[1] is the reading order.
    combined: dictionary
        key:    reading-order id of the combined group
        values: list of region names that form the group (not modified)
    headlines: dictionary
        key:    placeholder ids without function
        values: list of region names that belong to one headline

    Returns
    -------
    new: dictionary
        Renumbered copy of ``combined`` in which each headline is prepended
        to the group that starts below its left edge. Groups that consist of
        headline regions only are removed.
    """
    new = deepcopy(combined)

    for k, v in combined.items():
        for value in headlines.values():
            # the headline group itself is not a text candidate
            if v[0] == value[0]:
                continue

            start_title, end_title = get_start_end(dic[value[0]][0])
            start_text, end_text = get_start_end(dic[v[0]][0])

            x_diff = start_title[0] - start_text[0]
            y_diff = start_text[1] - end_title[1]

            # text has to start near the left edge and just below the title
            if -180 < x_diff < 200 and 0 <= y_diff <= 100:
                if len(value) == 2:
                    new[k].insert(0, value[1])
                    new[k].insert(0, value[0])
                else:
                    new[k].insert(0, value[0])

    # use the parameter here; the original read a global named "headers"
    all_headers = {region for regions in headlines.values() for region in regions}
    for k, v in new.items():
        # here all headlines that are not related to an article get erased.
        # You may want to change that one
        if all(elem in all_headers for elem in v):
            new[k] = []

    return dict(enumerate(v for v in new.values() if len(v) > 0))

# These functions now combine all text regions below a multi-column heading across multiple columns

In [11]:
def combine_columns(dic, comb, headers, textlines, separators):
    """Combine all columns beneath a multi-column headline into one article.

    Parameters
    ----------
    dic: dictionary
        key:    name of the text region of the PAGE-XML ("region_00xx")
        values: a list. List[0] is the coordinate string of the region,
                List[1] is the reading order.
    comb: dictionary
        key:    reading-order id of the combined group; indexed with the
                integers 0 .. len(comb)-1
        values: list of region names that form the group
    headers: dictionary
        key:    placeholder ids without function
        values: list of region names that belong to one headline
    textlines: dictionary
        key:    text region id of the PAGE-XML
        values: list of all lines associated with the text region
    separators: list
        Each element is the coordinate string of one separator of the
        PAGE-XML file

    Returns
    -------
    new_articles: dictionary
        key:    consecutive ids (0, 1, 2, ...)
        values: list of region names. Groups that are part of a detected
                multi-column article are replaced by the whole article, all
                other groups are kept as they are.
    """
    # we create an extra dictionary for the multicolumn articles
    articles = {}
    count = 0
    for v in headers.values():

        same_article = False
        i = 0

        while i < len(comb):
            # work on a copy so extending the article does not touch this group
            value = deepcopy(comb[i])
            start_text, end_text = get_start_end(dic[value[0]][0])

            # a region right under the last region of the open article is
            # added to it; without an open article there is nothing to extend
            if count in articles:
                start_very_last, end_very_last = get_start_end(dic[articles[count][-1]][0])
                if (-10 <= start_text[1] - end_very_last[1] <= 60) and (-20 <= start_text[0] - start_very_last[0] <= 100):
                    articles[count].extend(value)

            if v[0] == value[0] or same_article:

                # check if headline or separator (working as article border) is beneath
                if check_border(value[-1], headers, separators, dic):

                    # same_article: several regions of this article were already found
                    if same_article:
                        # region with a border under it that is not directly
                        # under the headline: try to continue in the next column
                        tmp = i
                        i = continue_in_next_column(articles[count], v, dic, comb, textlines)
                        # NOTE(review): a returned key of 0 is treated like "not found"
                        if i:
                            articles[count].extend(comb[i])
                            i += 1
                        else:
                            articles[count].extend(comb[tmp])
                            count += 1
                            break

                    else:
                        # paragraph directly under the headline that also has a border under it
                        # NOTE(review): this stores the list object of comb[i]
                        # itself, so later extend() calls also grow comb[i]
                        articles[count] = comb[i]
                        tmp = i
                        i = continue_in_next_column(value, v, dic, comb, textlines)
                        if i:
                            articles[count].extend(comb[i])
                            same_article = True
                        else:
                            i = tmp
                            count += 1
                            break

                else:
                    # paragraph without a border under it: same article, go on
                    i += 1
                    same_article = True
                    if count in articles:
                        articles[count].extend(value)
                    else:
                        articles[count] = value

            else:
                i += 1

    # renumber the articles and drop duplicate regions while keeping the order
    articles = {index: list(dict.fromkeys(value)) for index, value in enumerate(articles.values())}

    new_articles = {}
    placed = set()  # every region that is already part of the result
    ind = 0

    # iterate the parameter; the original read a global named "combined"
    for comb_value in comb.values():
        if placed.issuperset(comb_value):
            continue

        # take the first article that contains the whole group, otherwise
        # keep the group itself (also when there are no articles at all)
        chosen = next(
            (art_value for art_value in articles.values() if set(comb_value).issubset(art_value)),
            comb_value,
        )
        new_articles[ind] = chosen
        placed.update(chosen)
        ind += 1

    return new_articles
        
    
    

In [12]:
def continue_in_next_column(textregions, header,dic,combined,textlines):
    """Search the group that continues an article at the top of the next column.

    Parameters
    ----------
    textregions: list
        Region names of the article collected so far; the headline regions
        come first, followed by the text regions.
    header: list
        Region names that form the headline of the article.
    dic: dictionary
        key:    name of the text region of the PAGE-XML ("region_00xx")
        values: a list. List[0] is the coordinate string of the region,
                List[1] is the reading order.
    combined: dictionary
        key:    reading-order id of the combined group
        values: list of region names that form the group
    textlines: dictionary
        Not used; kept so existing callers keep working.

    Returns
    -------
    k: int or None
        Key in ``combined`` of the first group that starts at about the same
        height as the first text region of the article, one column further
        right, below the headline and that is not yet part of the article.
        None if there is no such group or no text region follows the
        headline in ``textregions``.
    """
    # no text segment after the headline: nothing to continue from
    if len(textregions) <= len(header):
        return None

    # fixed column width in pixels used as horizontal step
    avg_width = 390
    third_column = False

    start_header, end_header = get_start_end(dic[header[0]][0])

    first_segment = textregions[len(header)]
    start_first, end_first = get_start_end(dic[first_segment][0])

    for k, v in combined.items():
        start_next, end_next = get_start_end(dic[v[0]][0])

        # same height as the first segment and one column step to the right
        y_diff = abs(start_next[1] - start_first[1])
        x_diff = start_next[0] - (start_first[0] + avg_width)

        # the candidate also has to start within the x range of the headline
        if not (y_diff <= 100 and -70 <= x_diff <= 70
                and start_header[0] <= start_next[0] <= end_header[0]):
            continue

        if v[0] not in textregions:
            return k

        # this column already belongs to the article: widen the step so the
        # following groups are compared with the column after it
        if third_column:
            avg_width = avg_width * 1.5
        else:
            avg_width = avg_width * 2
            third_column = True

    return None
    
   
        

In [13]:
def check_border(region,headers,separators,dic, page_height=2700, page_margin=300):
    """Check if a border lies under a region.

    A border is a separator, a headline region or the end of the page.

    Parameters
    ----------
    region: string
        name of the region under which a border is searched
    headers: dictionary
        key:    placeholder id of the headline
        values: list of region names that form the headline
    separators: list
        Each element is the coordinate string of one separator of the
        PAGE-XML file
    dic: dictionary
        key:    name of the text region of the PAGE-XML ("region_00xx")
        values: a list. List[0] is the coordinate string of the region,
                List[1] is the reading order.
    page_height: int, optional
        y-coordinate taken as the bottom of the page. Defaults to 2700.
    page_margin: int, optional
        A region whose bottom edge is at most this far above
        ``page_height`` counts as being at the end of the page.
        Defaults to 300.

    Returns
    -------
    boolean
        True if a border is under the region, False otherwise.
    """
    start_text, end_text = get_start_end(dic[region][0])

    def lies_below(box, max_gap):
        # border starts between 10 px above and max_gap px below the bottom
        # of the region, and the left or right edge of the region falls
        # into the horizontal extent of the border
        start_border, end_border = box
        y_diff = start_border[1] - end_text[1]
        overlaps = (
            start_border[0] <= start_text[0] <= end_border[0]
            or start_border[0] <= end_text[0] <= end_border[0]
        )
        return -10 <= y_diff <= max_gap and overlaps

    if any(lies_below(get_start_end(sep), 100) for sep in separators):
        return True

    # headlines are accepted a little further below than separators
    if any(
        lies_below(get_start_end(dic[head][0]), 150)
        for regions in headers.values()
        for head in regions
    ):
        return True

    return page_height - end_text[1] <= page_margin
    
        

# Main Part

Reading the PAGE-XML of Eynollah, drawing bounding boxes around the recognized text regions and applying the functions above to combine those regions into articles.


In [14]:
# running number of the next page (1 + number of result images handled so far)
count= 1

# cv2.imwrite does not create missing directories, so make sure it exists
os.makedirs(output, exist_ok=True)

for filename in os.listdir(path):
    if not filename.endswith('.xml'): continue
    fullname = os.path.join(path, filename)

    # read as bytes so BeautifulSoup can honour the encoding declared in the XML
    with open(fullname, 'rb') as f:
        data = BeautifulSoup(f.read(), "xml")

    #extract Text regions from the PAGE-XML
    regions = data.find_all("TextRegion")
    #extract Separatorregions from PAGE-XML
    separator_list = data.find_all("SeparatorRegion")
    #extract the Reading order of each Textregion (region id -> index)
    order = {x["regionRef"]:x["index"] for x in data.find_all("RegionRefIndexed")}

    # base name shared by the XML file and the page image
    number = filename.split(".")[0]

    # the page image is needed for drawing; skip pages without one
    img = cv2.imread(path + number + ".jpg")
    if img is None:
        print("skipping " + fullname + ": image " + path + number + ".jpg could not be read")
        continue
    overlay = img.copy()

    #dictionary to get access to the coordinate of all lines of all textregions
    textlines = {}
    for reg in regions:
        for line in reg.find_all("TextLine"):
            textlines.setdefault(reg.get("id"), []).append(
                [line.get("id"), line.find("Coords")["points"]]
            )

    #dictionary for all textregions: id -> [coordinates, reading order]
    regs = {
        reg.get("id"): [reg.find("Coords")["points"], order[reg.get("id")]]
        for reg in regions
    }

    #list of all coordinate of the separators
    separators = [separ.find("Coords")["points"] for separ in separator_list]

    #delete all too small wrong annotated regions
    regs = delete_small(regs)

    #combined regions in one column
    combined = combine_regions_in_column(regs,textlines)

    #get headlines
    headers = possible_headline_multicolumn(regs, combined, textlines)

    #combine headlines and first column
    combined = combine_multicolumn_text_and_headlines(regs, combined, headers)

    #combine headlines and multiple columns
    combined = combine_columns(regs, combined, headers, textlines, separators)

    #1. draw the separators as filled red rectangles
    for sep in separators:
        start,end = get_start_end(sep)
        img = cv2.rectangle(img, start, end, (0,0,255), -1)

    #2. draw the Textregion rectangles, one random colour per article
    for k,v in combined.items():

        col = tuple(random.randrange(256) for _ in range(3))
        # the reading-order label is placed at the first region of the article
        textposition = get_start_end(regs[v[0]][0])

        for reg in v:
            start,end = get_start_end(regs[reg][0])
            img = cv2.rectangle(img, start, end, col, -1)

            #puts small red rectangles around the textlines
            for line in textlines.get(reg, []):
                s,e = get_start_end(line[1])
                img = cv2.rectangle(img, s, e, (0,0,255), 1)

        #3. put the reading order to the rectangles
        img = cv2.putText(img, str(k), (textposition[0][0]+5, textposition[0][1]+25),  cv2.FONT_HERSHEY_SIMPLEX, 1, (255,0,0), 4)

    #blend the drawing with the untouched image so the colours are translucent
    alpha =0.4
    img = cv2.addWeighted(overlay, alpha, img, 1 - alpha, 0)

    result = output + "result_" +number +".jpg"
    print(result)
    # imwrite reports failure only through its return value
    if not cv2.imwrite(result, img):
        print("could not write " + result)
    count +=1
    

croped_colored/result_croped_00000009.jpg
croped_colored/result_croped_00000010.jpg
croped_colored/result_croped_00000015.jpg
croped_colored/result_croped_00000017.jpg
croped_colored/result_croped_00000018.jpg
croped_colored/result_croped_00000005.jpg
croped_colored/result_croped_00000014.jpg
croped_colored/result_croped_00000016.jpg
croped_colored/result_croped_00000001.jpg
croped_colored/result_croped_00000002.jpg
croped_colored/result_croped_00000012.jpg
croped_colored/result_croped_00000004.jpg
croped_colored/result_croped_00000003.jpg
croped_colored/result_croped_00000008.jpg
croped_colored/result_croped_00000006.jpg


## Further possibilities for improvement

- improve article recognition across multiple columns (especially when an article continues in the next column and is no longer under the multi-column heading)
- check in column if separator is in between paragraphs.
- recognize and include images and free spaces on the page.