# Adjustment of the segmentation of Eynollah to merge whole articles and adjust the reading order 

In this notebook, we try to adapt the output of Eynollah to the layout of the "Freedom Struggle" from 1936 in order to recognize whole articles on a page. The goal is to recognize each article as a unit, i.e. the headline and the associated text regions below it. This turns out to be a non-trivial task, especially with historical newspapers, as the reading order is not always intuitive. 
Afterwards, a full text indexing of the articles could be performed with an OCR pipeline.

In [94]:
from bs4 import BeautifulSoup
import cv2
import os
import numpy as np
import random
from copy import copy, deepcopy

# Input folder: the main loop reads the PAGE-XML files and the matching .jpg page images from here.
path= "croped/"
# Output folder: the main loop writes the annotated result images here.
output= "croped_colored/"

In [2]:
def delete_small(dic, min_area=1500):
    """Drop text regions whose bounding box is too small to be real text.

    Such tiny regions are most likely errors of the segmentation step.

    Parameters
    ----------
    dic : dict
        key:    name of the text region (in the format "region_00xx")
        values: a list; element 0 is the coordinate string of the region,
                element 1 is its reading order (int or numeric string)
    min_area : int, optional
        Regions whose bounding-box area is smaller than or equal to this
        value are removed. The default (1500) is the threshold that used to
        be hard-coded.

    Returns
    -------
    new_dic : dict
        Same format as ``dic`` and sorted by reading order, but without the
        small regions. The reading order of every kept region is an int and
        is reduced by the number of regions dropped before it.
    """
    by_reading_order = sorted(dic.items(), key=lambda item: int(item[1][1]))

    new_dic = {}
    dropped = 0
    for name, value in by_reading_order:
        start, end = get_start_end(value[0])
        area = (end[0] - start[0]) * (end[1] - start[1])

        if area <= min_area:
            dropped += 1
            continue

        # Shift the reading order down so that removed regions leave no gap.
        new_dic[name] = [value[0], int(value[1]) - dropped]

    return new_dic
            

In [3]:
## TODO: rework this as well (especially the case where a picture sits inside the column)

def check_if_near(top_left, next_top_left, bottom_right, next_bottom_right, height_dist,
                  max_x_diff=40, max_height_dist=20):
    """Check whether two vertically stacked bounding boxes belong together.

    Parameters
    ----------
    top_left : int
        x-coordinate of the top left corner of the upper bounding box
    next_top_left : int
        x-coordinate of the top left corner of the lower bounding box
    bottom_right : int
        x-coordinate of the bottom right corner of the upper bounding box
    next_bottom_right : int
        x-coordinate of the bottom right corner of the lower bounding box
    height_dist : int
        vertical distance between the bottom of the upper bounding box and
        the top of the lower bounding box
    max_x_diff : int, optional
        The summed deviation of the left and right x-coordinates must be
        strictly smaller than this value (default 40).
    max_height_dist : int, optional
        ``height_dist`` must be strictly smaller than this value (default 20).

    Returns
    -------
    bool
        True if both boxes have nearly the same left and right x values and
        the vertical gap between them is small, otherwise False.
    """
    x_diff = abs(top_left - next_top_left) + abs(bottom_right - next_bottom_right)
    return bool(x_diff < max_x_diff and height_dist < max_height_dist)

In [4]:
def combine_regions_in_column(dic):
    """Group text regions that directly follow each other inside one column.

    Two regions are put into the same group when they are consecutive in the
    reading order, have nearly the same left and right x values and only a
    small vertical gap between them (see ``check_if_near``). Afterwards
    ``possible_headline_in_column`` is applied to the groups.

    Parameters
    ----------
    dic : dict
        key:    name of the text region (in the format "region_00xx")
        values: a list; element 0 is the coordinate string of the region,
                element 1 is its reading order (int or numeric string)

    Returns
    -------
    new_dic : dict
        key:    new consecutive ids (0, 1, 2, ...) for the new reading order
        values: list of region names that are now combined into one bigger
                region
    """
    # Placeholder neighbour for the very first and the very last region.
    dummy_entry = [0, ["0,0", 0]]

    # Sort by reading order; build the item list once instead of per iteration.
    entries = sorted(dic.items(), key=lambda item: int(item[1][1]))
    dic = dict(entries)
    last_index = len(entries) - 1

    new_dic = {}
    count = 0

    for i, (k, v) in enumerate(entries):
        last_entry = dummy_entry if i == 0 else entries[i - 1]
        next_entry = dummy_entry if i == last_index else entries[i + 1]

        # top left / bottom right corners of the previous, current and next box
        start, end = get_start_end(v[0])
        n_start, n_end = get_start_end(next_entry[1][0])
        l_start, l_end = get_start_end(last_entry[1][0])

        # Cast both sides to int so string reading orders compare correctly.
        order = int(v[1])

        continues_below = (
            check_if_near(start[0], n_start[0], end[0], n_end[0], abs(end[1] - n_start[1]))
            and order + 1 == int(next_entry[1][1])
        )
        if continues_below:
            # The group stays open: the next region will be added to it.
            new_dic.setdefault(count, []).append(k)
            continue

        continues_above = (
            check_if_near(l_start[0], start[0], l_end[0], end[0], abs(l_end[1] - start[1]))
            and order - 1 == int(last_entry[1][1])
        )
        if continues_above:
            # Last member of the current group: add it and close the group.
            new_dic.setdefault(count, []).append(k)
            count += 1
        else:
            # Stand-alone region: it gets a group id of its own.
            count += 1
            new_dic[count] = [k]
            count += 1

    # rename the keys of the dictionary for a new clean order
    new_dic = dict(enumerate(new_dic.values()))

    new_dic = possible_headline_in_column(dic, new_dic)

    # merging headlines may have removed groups, so renumber once more
    return dict(enumerate(new_dic.values()))
    
        
    

In [5]:
# NOTE: `regs` is only created inside the main loop further down, so this call
# raises NameError when the cell is run before that loop.
combine_regions_in_column(regs)

NameError: name 'regs' is not defined

In [6]:
# TODO: possibly extend this so that three headline levels are supported
# (main headline, sub-headline and place/date line)

def possible_headline_in_column(dic, combined):
    """Merge single-column headlines with the text group below them.

    For every group it is checked whether the preceding group consists of one
    region (a title) or two regions (title and subtitle) that sit directly
    above it. If so, those regions are moved to the front of the group. Only
    single-column headlines are handled here.

    Parameters
    ----------
    dic : dict
        key:    name of the text region in the PAGE-XML ("region_00xx")
        values: a list; element 0 is the coordinate string of the region,
                element 1 is its reading order
    combined : dict
        key:    reading order id of the group; the keys are expected to be
                the consecutive ints 0, 1, 2, ... in iteration order
        values: list of region names already combined by earlier heuristics

    Returns
    -------
    new : dict
        Like ``combined``, but possible headlines and their matching text
        groups are merged into one entry. Groups that became empty are
        dropped, so the keys may contain gaps. ``combined`` is not modified.
    """
    # Copy the member lists so the caller's dictionary is left untouched.
    new = {k: list(v) for k, v in combined.items()}

    for i, k in enumerate(list(new)):
        if i == 0:
            continue

        v = new[k]
        # The preceding group may already hold a title merged one step earlier.
        last_entry = new[i - 1]

        # skip regions that are wide enough to be multi-column headlines
        start_sec, end_sec = get_start_end(dic[v[0]][0])
        if end_sec[0] - start_sec[0] > 420:
            continue

        if len(last_entry) == 1:
            # Combine title and subtitle, or title and text if there is no
            # subtitle. The parameters of the headline box can be tuned here.
            title = last_entry[0]
            start_head, end_head = get_start_end(dic[title][0])

            if ((start_sec[0] - 220 <= start_head[0] <= end_sec[0])
                    and (start_sec[0] <= end_head[0] <= start_sec[0] + 400)
                    and abs(start_head[1] - start_sec[1]) < 70):
                v.insert(0, title)
                new[k - 1].remove(title)

        elif len(last_entry) == 2:
            # Combine an already recognized title + subtitle pair with the
            # matching text group.
            title, subtitle = last_entry
            start_head, end_head = get_start_end(dic[title][0])

            if ((start_sec[0] - 40 <= start_head[0] <= end_sec[0])
                    and (start_sec[0] <= end_head[0] <= end_sec[0] + 40)
                    and abs(start_head[1] - start_sec[1]) < 100):
                v.insert(0, title)
                v.insert(1, subtitle)
                new[k - 1].remove(title)
                new[k - 1].remove(subtitle)

    return {k: v for k, v in new.items() if len(v) > 0}
    


In [7]:


def get_start_end(coord_list):
    """Get the top left and the bottom right corner of a bounding box.

    Parameters
    ----------
    coord_list : str
        Polygon of a region as found in a PAGE-XML file: "x,y" pairs
        separated by whitespace, e.g. "10,20 30,5 15,40".

    Returns
    -------
    start : tuple of int
        smallest x and smallest y value of the region (top left corner)
    end : tuple of int
        largest x and largest y value of the region (bottom right corner)

    Raises
    ------
    ValueError
        If the string contains no points or a value is not an integer.
    """
    # split() without argument tolerates repeated, leading and trailing
    # whitespace (including line breaks) between the points.
    points = [tuple(map(int, pair.split(","))) for pair in coord_list.split()]

    xs = [point[0] for point in points]
    ys = [point[1] for point in points]

    # The coordinate system starts in the top left corner of the picture!
    start = (min(xs), min(ys))
    end = (max(xs), max(ys))

    return start, end


In [17]:
def possible_headline_multicolumn(dic, combined, textlines):
    """Find regions that look like (multi-column) headlines.

    A group that consists of a single region is treated as a headline when
    the region has fewer than 4 text lines and every line is more than 20%
    higher than the average line of the multi-region groups. Two consecutive
    headline candidates that lie directly above each other are returned as
    one entry (title and subtitle).

    Parameters
    ----------
    dic : dict
        key:    name of the text region of the PAGE-XML ("region_00xx")
        values: a list; element 0 is the coordinate string of the region,
                element 1 is its reading order
    combined : dict
        key:    reading order id of the already combined groups
        values: list of region names belonging to the group
    textlines : dict
        key:    text region id of the PAGE-XML
        values: list of all lines associated with the text region

    Returns
    -------
    titles : dict
        key:    consecutive ids (0, 1, 2, ...)
        values: list with one or two region names that are seen as a headline
        Empty when no group supplies line heights to compare against.
    """
    # Line heights of all groups with more than one region serve as baseline.
    body_heights = [
        height
        for members in combined.values() if len(members) > 1
        for reg in members
        for height in textline_heights(reg, textlines)
    ]

    headlines = []
    # Without a baseline nothing can be classified (avoids a division by zero).
    if body_heights:
        avg_height = sum(body_heights) / len(body_heights)

        for members in combined.values():
            if len(members) != 1 or members[0] not in textlines:
                continue
            candidate = members[0]

            # With 4 or more text lines the region is unlikely to be a headline.
            if len(textlines[candidate]) >= 4:
                continue

            # every text line has to be at least 20% higher than the average
            heights = textline_heights(candidate, textlines)
            if heights and all(line > (avg_height * 1.2) for line in heights):
                headlines.append(candidate)

    titles = {}
    for i, title in enumerate(headlines):
        if i == 0:
            # The first candidate has no predecessor to be paired with.
            titles[i] = [title]
            continue

        last_region = headlines[i - 1]
        last_start, last_end = get_start_end(dic[last_region][0])
        start, end = get_start_end(dic[title][0])

        x_diff = abs(start[0] - last_start[0]) + abs(end[0] - last_end[0])
        y_diff = abs(last_end[1] - start[1])

        if x_diff < 500 and y_diff < 20:
            # Directly below the previous candidate: store both as one entry.
            # NOTE: only pairs are kept; in a chain of three the first region
            # is dropped together with the replaced entry.
            titles[i] = [last_region, title]
            del titles[i - 1]
        else:
            titles[i] = [title]

    titles = dict(enumerate(titles.values()))
    print(combined)
    print(titles)
    return titles

In [9]:
def textline_heights(region, textlines):
    """Collect the bounding-box heights of all text lines of one region.

    Parameters
    ----------
    region : str
        Id of a region of the PAGE-XML
    textlines : dict
        key:    text region id of the PAGE-XML
        values: list of all lines associated with the text region; the
                coordinate string of a line is read from index 1

    Returns
    -------
    heights : list
        The heights of all text lines of the region (bottom y minus top y),
        or an empty list if the region has no entry in ``textlines``.
    """
    if region not in textlines:
        return []

    result = []
    for entry in textlines[region]:
        top_left, bottom_right = get_start_end(entry[1])
        result.append(bottom_right[1] - top_left[1])
    return result

In [128]:
def combine_multicolumn_text_and_headlines(dic, combined, headlines):
    """Combine a multi-column headline with the text group beneath it.

    Parameters
    ----------
    dic : dict
        key:    name of the text region of the PAGE-XML ("region_00xx")
        values: a list; element 0 is the coordinate string of the region,
                element 1 is its reading order
    combined : dict
        key:    reading order id of the already combined groups
        values: list of region names belonging to the group
    headlines : dict
        key:    placeholder ids without any function
        values: list of region names that belong to one headline

    Returns
    -------
    new : dict
        Like ``combined``, but the headline regions are put in front of the
        text group that starts below the headline near its left edge. Groups
        that consist only of headline regions are removed and the keys are
        renumbered (0, 1, 2, ...). ``combined`` itself is not modified.
    """
    new = deepcopy(combined)

    for k, v in combined.items():
        for value in headlines.values():
            # a headline is never attached to its own group
            if v[0] == value[0]:
                continue

            start_title, end_title = get_start_end(dic[value[0]][0])
            start_text, end_text = get_start_end(dic[v[0]][0])

            x_diff = abs(start_title[0] - start_text[0])
            y_diff = start_text[1] - end_title[1]

            # The text has to start below the headline and near its left edge.
            if x_diff < 200 and 0 <= y_diff <= 100:
                if len(value) == 2:
                    new[k].insert(0, value[1])
                    new[k].insert(0, value[0])
                else:
                    new[k].insert(0, value[0])

    # Use the `headlines` argument here, not a global variable.
    all_headers = {region for value in headlines.values() for region in value}

    # Drop groups that hold nothing but headline regions and renumber the rest.
    remaining = [
        v for v in new.values()
        if len(v) > 0 and not all(elem in all_headers for elem in v)
    ]
    return dict(enumerate(remaining))

# Main Part

Reading the PAGE-XML output of Eynollah, drawing bounding boxes around the recognized text regions and applying some functions to combine those regions into articles.


In [130]:
# Main loop: for every PAGE-XML file in `path` the text regions are read,
# combined into article groups and drawn onto the matching page image, which
# is then written to `output`.
count= 1
for filename in os.listdir(path):
    if not filename.endswith('.xml'):
        continue
    fullname = os.path.join(path, filename)

    with open(fullname, 'r') as f:
        data = BeautifulSoup(f.read(), "xml")

    #extract Text regions from the PAGE-XML
    regions = data.find_all("TextRegion")
    #extract Separatorregions from PAGE-XML
    separator_list = data.find_all("SeparatorRegion")
    #extract the reading order index of each Textregion
    order = {x["regionRef"]: x["index"] for x in data.find_all("RegionRefIndexed")}

    # file name without folder and extension; also the name of the image
    number = fullname.split("/")[-1].split(".")[0]
    print(fullname)

    #dictionary to get access to the coordinates of all lines of all textregions
    textlines = {}
    for reg in regions:
        for line in reg.find_all("TextLine"):
            textlines.setdefault(reg.get("id"), []).append(
                [line.get("id"), line.find("Coords")["points"]]
            )

    #dictionary for all textregions: [coordinates, reading order]
    regs = {
        reg.get("id"): [reg.find("Coords")["points"], order[reg.get("id")]]
        for reg in regions
    }

    #list of all coordinates of the separators
    separators = [separ.find("Coords")["points"] for separ in separator_list]

    #delete all too small wrong annotated regions
    regs = delete_small(regs)

    #combined regions in one column
    combined = combine_regions_in_column(regs)

    #get headlines
    headers = possible_headline_multicolumn(regs, combined, textlines)

    #combine multicolumn
    combined = combine_multicolumn_text_and_headlines(regs, combined, headers)

    image_name = path + number + ".jpg"
    img = cv2.imread(image_name)
    if img is None:
        # cv2.imread does not raise for a missing or unreadable file.
        print("skipped, image could not be read: " + image_name)
        continue
    overlay = img.copy()

    #1. draw the separators
    for sep in separators:
        start, end = get_start_end(sep)

        #draws a filled red rectangle over the separator
        img = cv2.rectangle(img, start, end, (0,0,255), -1)

    # (For debugging, the regions in `regs` can be drawn one by one here
    # instead of the combined groups below.)

    #2. draw the Textregion rectangles
    for k, v in combined.items():

        #one random colour per combined group
        col = tuple(random.randrange(256) for _ in range(3))
        #the label is placed at the first region of the group
        textposition = get_start_end(regs[v[0]][0])

        for reg in v:
            start, end = get_start_end(regs[reg][0])

            #fills the textregion with the colour of its group
            img = cv2.rectangle(img, start, end, col, -1)

            #puts small red rectangles around the textlines (if there are any)
            for line in textlines.get(reg, []):
                s, e = get_start_end(line[1])
                img = cv2.rectangle(img, s, e, (0,0,255), 1)

        #3. put the reading order to the rectangles
        img = cv2.putText(img, str(k), (textposition[0][0]+5, textposition[0][1]+25),  cv2.FONT_HERSHEY_SIMPLEX, 1, (255,0,0), 4)

    alpha =0.4
    #blend the untouched page (weight alpha) with the drawing so that the
    #coloured rectangles become semi-transparent
    img = cv2.addWeighted(overlay, alpha, img, 1 - alpha, 0)

    result = output + "result_" +number +".jpg"
    print(result)
    cv2.imwrite(result, img)
    count +=1
    

croped/croped_00000009.xml
{0: ['region_0017'], 1: ['region_0053'], 2: ['region_0052'], 3: ['region_0025', 'region_0020', 'region_0048', 'region_0043', 'region_0038', 'region_0042', 'region_0032', 'region_0047', 'region_0037'], 4: ['region_0039'], 5: ['region_0034', 'region_0036', 'region_0027', 'region_0029', 'region_0040'], 6: ['region_0021', 'region_0033'], 7: ['region_0019', 'region_0045'], 8: ['region_0028'], 9: ['region_0015'], 10: ['region_0023'], 11: ['region_0014'], 12: ['region_0044'], 13: ['region_0016', 'region_0051'], 14: ['region_0026', 'region_0031', 'region_0049'], 15: ['region_0041', 'region_0035', 'region_0046', 'region_0030'], 16: ['region_0050'], 17: ['region_0022'], 18: ['region_0024'], 19: ['region_0018']}
{0: ['region_0053'], 1: ['region_0039'], 2: ['region_0028', 'region_0015'], 3: ['region_0022']}
{0: ['region_0017'], 1: ['region_0053'], 2: ['region_0052'], 3: ['region_0025', 'region_0020', 'region_0048', 'region_0043', 'region_0038', 'region_0042', 'region_003

croped_colored/result_croped_00000017.jpg
croped/croped_00000018.xml
{0: ['region_0012'], 1: ['region_0017'], 2: ['region_0015'], 3: ['region_0016', 'region_0035', 'region_0046', 'region_0037', 'region_0047', 'region_0041', 'region_0055'], 4: ['region_0010'], 5: ['region_0023', 'region_0033', 'region_0029', 'region_0027'], 6: ['region_0057'], 7: ['region_0050', 'region_0043', 'region_0038'], 8: ['region_0034', 'region_0028'], 9: ['region_0013', 'region_0031', 'region_0051'], 10: ['region_0058'], 11: ['region_0049'], 12: ['region_0011', 'region_0021', 'region_0026'], 13: ['region_0053'], 14: ['region_0056'], 15: ['region_0025'], 16: ['region_0045'], 17: ['region_0019', 'region_0042'], 18: ['region_0036', 'region_0022', 'region_0039'], 19: ['region_0014', 'region_0030'], 20: ['region_0054', 'region_0032'], 21: ['region_0024', 'region_0044'], 22: ['region_0020', 'region_0052'], 23: ['region_0018', 'region_0048'], 24: ['region_0040']}
{0: ['region_0017'], 1: ['region_0015'], 2: ['region_00

croped_colored/result_croped_00000016.jpg
croped/croped_00000001.xml
{0: ['region_0048'], 1: ['region_0043'], 2: ['region_0041'], 3: ['region_0036', 'region_0052'], 4: ['region_0046'], 5: ['region_0063'], 6: ['region_0054'], 7: ['region_0061'], 8: ['region_0042'], 9: ['region_0039'], 10: ['region_0049'], 11: ['region_0059'], 12: ['region_0086'], 13: ['region_0056'], 14: ['region_0057'], 15: ['region_0079', 'region_0078', 'region_0058', 'region_0077', 'region_0075'], 16: ['region_0047', 'region_0038', 'region_0066', 'region_0072'], 17: ['region_0053', 'region_0064', 'region_0083'], 18: ['region_0068'], 19: ['region_0070', 'region_0076', 'region_0080', 'region_0055'], 20: ['region_0071', 'region_0050', 'region_0073', 'region_0051', 'region_0067'], 21: ['region_0081'], 22: ['region_0045', 'region_0037', 'region_0062', 'region_0074'], 23: ['region_0060'], 24: ['region_0044', 'region_0040', 'region_0084', 'region_0082', 'region_0065', 'region_0085', 'region_0069']}
{0: ['region_0046'], 1: [

croped_colored/result_croped_00000012.jpg
croped/croped_00000004.xml
{0: ['region_0042'], 1: ['region_0118'], 2: ['region_0053'], 3: ['region_0058', 'region_0036', 'region_0108'], 4: ['region_0062', 'region_0099'], 5: ['region_0064', 'region_0041', 'region_0105', 'region_0103'], 6: ['region_0089'], 7: ['region_0061', 'region_0097', 'region_0083', 'region_0087'], 8: ['region_0078'], 9: ['region_0037', 'region_0092', 'region_0106', 'region_0096'], 10: ['region_0104', 'region_0088'], 11: ['region_0086'], 12: ['region_0107', 'region_0090', 'region_0093'], 13: ['region_0050', 'region_0101', 'region_0095', 'region_0100', 'region_0094', 'region_0098', 'region_0102'], 14: ['region_0085'], 15: ['region_0046'], 16: ['region_0116'], 17: ['region_0117'], 18: ['region_0119'], 19: ['region_0034'], 20: ['region_0091'], 21: ['region_0045'], 22: ['region_0055'], 23: ['region_0051'], 24: ['region_0040'], 25: ['region_0077'], 26: ['region_0076'], 27: ['region_0067', 'region_0048'], 28: ['region_0038'], 2

croped_colored/result_croped_00000008.jpg
croped/croped_00000006.xml
{0: ['region_0066'], 1: ['region_0042'], 2: ['region_0026'], 3: ['region_0038', 'region_0037'], 4: ['region_0008', 'region_0051', 'region_0046', 'region_0023'], 5: ['region_0067'], 6: ['region_0044'], 7: ['region_0009', 'region_0049', 'region_0053'], 8: ['region_0012', 'region_0028'], 9: ['region_0048', 'region_0050'], 10: ['region_0011', 'region_0054', 'region_0027'], 11: ['region_0018', 'region_0056', 'region_0052', 'region_0040', 'region_0061'], 12: ['region_0014', 'region_0063'], 13: ['region_0029'], 14: ['region_0020'], 15: ['region_0041', 'region_0034', 'region_0060', 'region_0024'], 16: ['region_0022'], 17: ['region_0019', 'region_0013', 'region_0055', 'region_0043', 'region_0021'], 18: ['region_0058', 'region_0030', 'region_0059'], 19: ['region_0064', 'region_0025'], 20: ['region_0010', 'region_0015', 'region_0045', 'region_0035', 'region_0057', 'region_0033'], 21: ['region_0017', 'region_0036', 'region_0039',

## Next steps:

- At the end of a column: does the article continue in the next column?
