In [11]:
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image

In [12]:
# Load the tab-separated annotations and keep only the rows that have a
# handwritten transcription
df = (
    pd.read_csv('NeuralNetworkWIP/annotations.csv', sep='\t')
    .dropna(subset=['handwritten_text'])
)
df

Unnamed: 0,text_number,file_name,file_path,area_transitions,bar_1_start,bar_1_end,bar_2_start,bar_2_end,bar_3_start,bar_3_end,print_text,handwritten_text
0,a06-119,a06-119.png,/084/a06-119.png,"[355, 363, 605, 613, 2795, 2803]",355,363,605,613,2795,2803,Note circulation soared for the sixth successi...,Note circulation soared for the\nsixth success...
1,a06-128,a06-128.png,/084/a06-128.png,"[355, 363, 656, 664, 2795, 2803]",355,363,656,664,2795,2803,Banks have paid in a first instalment of almos...,Banks have paid in a first\ninstalment of almo...
6,a01-043,a01-043x.png,/011/a01-043x.png,"[376, 384, 676, 683, 2804, 2812]",376,384,676,683,2804,2812,Informal talks at Lancaster House will resume ...,Informal talks at Lancaster House will resume\...
7,a01-038,a01-038x.png,/011/a01-038x.png,"[377, 384, 725, 732, 2806, 2813]",377,384,725,732,2806,2813,Mr. Macleod thought the two Rhodesian parties ...,Mr. Macleod thought the two Rhodesian\nparties...
8,a02-078,a02-078.png,/034/a02-078.png,"[354, 361, 704, 712, 2795, 2802]",354,361,704,712,2795,2802,SIR Roy Welensky said today that he no longer ...,SIR Roy Welensky said today that he no\nlonger...
...,...,...,...,...,...,...,...,...,...,...,...,...
1528,c04-170,c04-170.png,/172/c04-170.png,"[359, 366, 559, 567, 2797, 2805]",359,366,559,567,2797,2805,Shelagh Delaney and Alan Sillitoe attacked edu...,Shelagh Delaney and Alan Sillitoe attacked\ned...
1530,a01-020,a01-020.png,/009/a01-020.png,"[362, 370, 663, 670, 2802, 2810]",362,370,663,670,2802,2810,Mr. Macleod went on with the conference at Lan...,Mr. Macleod went on with the conference at\nLa...
1531,a01-026,a01-026.png,/009/a01-026.png,"[361, 369, 711, 719, 2801, 2809]",361,369,711,719,2801,2809,"MR. IAIN MACLEOD, the Colonial Secretary, deni...","MR. IAIN MACLEOD, the Colonial Secretary, deni..."
1535,a01-026,a01-026x.png,/008/a01-026x.png,"[375, 383, 723, 730, 2805, 2812]",375,383,723,730,2805,2812,"MR. IAIN MACLEOD, the Colonial Secretary, deni...","MR. IAIN MACLEOD, the Colonial Secretary, deni..."


### Image Segmentation

In [13]:
# Turn image into black and white
def black_and_white(image: Image, threshold=200):
    """Binarize an image.

    The image is converted to mode 'L' and every pixel value is then mapped
    to 255 when it is strictly greater than `threshold` and to 0 otherwise.
    The mapped image is returned in mode '1'.

    Args:
        image: Image to binarize (must provide `convert` and `point`).
        threshold: Values above this become white, everything else black.

    Returns:
        The binarized image.
    """
    def to_binary(level):
        # Strictly brighter than the threshold -> white, otherwise black
        if level > threshold:
            return 255
        return 0

    grayscale = image.convert('L')
    return grayscale.point(to_binary, mode='1')

# Horizontal projection
def horizontal_projection(image: "Image.Image"):
    """Sum the "darkness" (255 - pixel value) of every pixel row.

    Args:
        image: A PIL image or any array-like of pixel values. Boolean data
            (what NumPy produces for a mode '1' image) is treated as
            True = white, False = black.

    Returns:
        For a 2-D input, a 1-D array with one value per row: 0 for an
        all-white row, growing by 255 for every black pixel in the row.
        NOTE(review): inputs with a channel axis are summed along axis 1 only
        and therefore keep the channel axis; confirm only single-channel
        images are passed.
    """
    pixels = np.array(image)
    if pixels.dtype == np.bool_:
        # Without rescaling, `255 - pixels` would give 254 for white and 255
        # for black, burying the signal under a 254 * width offset per row.
        pixels = pixels.astype(np.uint8) * 255
    return np.sum(255 - pixels, axis=1)

# Normalize values so that they are all between 0 and 1
def normalize_projection(projection):
    """Min-max scale a projection to the range [0, 1].

    Args:
        projection: Sequence (list or 1-D array) of numeric values.

    Returns:
        A list of floats of the same length, where the smallest input maps to
        0.0 and the largest to 1.0. An empty input gives an empty list, and a
        constant input (no contrast, e.g. a blank image) gives all zeros
        instead of dividing by zero.
    """
    values = np.asarray(projection, dtype=float)
    if values.size == 0:
        return []

    minimum = values.min()
    span = values.max() - minimum
    if span == 0:
        # Every value is identical: there is nothing to scale
        return [0.0] * values.size

    return ((values - minimum) / span).tolist()

# Remove noise by setting every value below a certain threshold to 0
def cut_noise_from_projection(projection, threshold=0.05):
    """Zero out the weak entries of a projection.

    Args:
        projection: Iterable of numeric values.
        threshold: Entries must be strictly greater than this to be kept.

    Returns:
        A new list in which every entry that is not greater than `threshold`
        has been replaced by 0; the remaining entries are passed through
        unchanged.
    """
    denoised = []
    for sample in projection:
        if sample > threshold:
            denoised.append(sample)
        else:
            denoised.append(0)
    return denoised

# Smooth out the projection
def smoothen_projection(data, window_len=70):
    """Smooth a 1-D signal with a moving average.

    The signal is convolved with a uniform kernel of `window_len` samples
    whose weights sum to 1. Values near both ends are averaged against
    implicit zeros outside the signal.

    Args:
        data: 1-D sequence of numeric values.
        window_len: Number of samples in the averaging window. It is clamped
            to the length of `data`.

    Returns:
        A float array with exactly as many samples as `data` (an empty array
        for empty input).

    Raises:
        ValueError: If `window_len` is smaller than 1.
    """
    if window_len < 1:
        raise ValueError("window_len must be at least 1")

    data = np.asarray(data)
    if data.size == 0:
        return np.array([], dtype=float)

    # np.convolve(..., mode='same') returns max(len(kernel), len(data))
    # samples, so an oversized kernel would lengthen short signals.
    window = min(window_len, data.size)
    kernel = np.ones(window, 'd')
    return np.convolve(kernel / kernel.sum(), data, mode='same')


In [14]:
# Calculate Local Minima
def calculate_local_minima(image: Image):
    """Find the rows at which the smoothed row profile of an image turns upwards.

    The image is binarized, reduced to one value per row, normalized, smoothed,
    noise-cut and smoothed again. A row index is reported wherever the sign of
    the slope increases from one step to the next. That covers plain valleys
    (falling -> rising) as well as both edges of a flat stretch
    (falling -> flat and flat -> rising).

    Args:
        image: Image to analyse.

    Returns:
        List of row indices in ascending order.
    """
    binary = black_and_white(image)

    profile = horizontal_projection(binary)
    profile = normalize_projection(profile)
    profile = smoothen_projection(profile)
    profile = cut_noise_from_projection(profile)
    profile = smoothen_projection(profile)

    # Sign of the first-order difference: -1 falling, 0 flat, +1 rising
    slope_sign = np.sign(np.diff(profile))

    # Positions where the slope sign increases; +1 turns the position within
    # the sign differences into the index of the row itself
    turning_rows = np.flatnonzero(np.diff(slope_sign) > 0) + 1
    return turning_rows.tolist()


In [15]:
# Create the list of segments
def calculate_segments(image: Image, threshold=100):
    """Pair up neighbouring local minima into row segments.

    Args:
        image: Image to segment.
        threshold: Two neighbouring minima must be more than this many rows
            apart to be kept as a segment.

    Returns:
        List of (start_row, end_row) tuples, in top-to-bottom order.
    """
    minima = calculate_local_minima(image)
    neighbours = zip(minima, minima[1:])
    return [(upper, lower) for upper, lower in neighbours if lower - upper > threshold]

In [16]:
class Segment():
    """One text line located inside a full form image.

    Attributes are stored exactly as passed in:
        text_number, file_name, file_path: identify the source image.
        segment_start, segment_end: upper and lower pixel row of the line
            within the full image.
        segment_text: transcription of the line, or None.
    """

    def __init__(self, text_number: str, file_name: str, file_path: str, segment_start: int, segment_end:int, segment_text: str=None):
        self.text_number = text_number
        self.file_name = file_name
        self.file_path = file_path
        self.segment_start = segment_start
        self.segment_end = segment_end
        self.segment_text = segment_text

    def segment_image(self, root_path: str = ""):
        """Load the source image and return this segment cropped at full width.

        Args:
            root_path: Optional prefix prepended to `file_path`, for segments
                whose `file_path` is relative to a dataset root.

        Returns:
            The image region between `segment_start` and `segment_end`,
            spanning the whole image width.
        """
        # The context manager closes the file once the crop has been taken
        with Image.open(root_path + self.file_path) as full_image:
            # PIL reports size as (width, height)
            width, _ = full_image.size
            return full_image.crop((0, self.segment_start, width, self.segment_end))

### Precomputing Segmentation and Storing it in a CSV

In [17]:
# Dataset root inside the current user's kagglehub cache (~/.cache/kagglehub)
root_path = str(Path.home() / ".cache/kagglehub/datasets/naderabdelghany/iam-handwritten-forms-dataset/versions/1/data")

segments = []
for _, row in df.iterrows():
    # Pixel rows bounding the handwritten area of the form
    handwritten_start = row['bar_2_end']
    handwritten_end = row['bar_3_start']

    # Load the image and crop it to the handwritten area; the context manager
    # closes the file as soon as the crop has been taken
    full_path = root_path + row['file_path']
    with Image.open(full_path) as full_image:
        # PIL reports size as (width, height)
        width, _ = full_image.size
        image = full_image.crop((0, handwritten_start, width, handwritten_end))

    # Get lists of line and text segments
    line_segments = calculate_segments(image)
    # NOTE(review): the last '\n'-separated piece is discarded, presumably
    # because the text ends with a trailing newline; confirm against the CSV.
    text_lines = row['handwritten_text'].split('\n')[:-1]

    # Fill list of segments. zip stops at the shorter of the two lists, so
    # surplus detected lines or surplus text lines are silently dropped.
    for (line_start, line_end), line_text in zip(line_segments, text_lines):
        segments.append(Segment(text_number=row['text_number'],
                                file_name=row['file_name'],
                                file_path=row['file_path'],
                                # Segment rows are relative to the crop; shift
                                # them back into full-image coordinates
                                segment_start=handwritten_start + line_start,
                                segment_end=handwritten_start + line_end,
                                segment_text=line_text))

In [18]:
# Split the Segment objects into one list per attribute (one entry per segment)
text_numbers = [segment.text_number for segment in segments]
file_names = [segment.file_name for segment in segments]
file_paths = [segment.file_path for segment in segments]
segment_starts = [segment.segment_start for segment in segments]
segment_ends = [segment.segment_end for segment in segments]
segment_texts = [segment.segment_text for segment in segments]

In [19]:
# Assemble the per-attribute lists into one table with one row per segment
segment_df = pd.DataFrame(
    dict(
        text_number=text_numbers,
        file_name=file_names,
        file_path=file_paths,
        segment_start=segment_starts,
        segment_end=segment_ends,
        segment_text=segment_texts,
    )
)
segment_df

Unnamed: 0,text_number,file_name,file_path,segment_start,segment_end,segment_text
0,a06-119,a06-119.png,/084/a06-119.png,718,893,Note circulation soared for the
1,a06-119,a06-119.png,/084/a06-119.png,901,1070,sixth successive week - this
2,a06-119,a06-119.png,/084/a06-119.png,1077,1250,"time by more than 15,000,000 last"
3,a06-119,a06-119.png,/084/a06-119.png,1259,1432,week. And that brought the figure
4,a06-119,a06-119.png,/084/a06-119.png,1438,1614,"to a record 2,415,000,000. This was"
...,...,...,...,...,...,...
4352,a01-014,a01-014x.png,/008/a01-014x.png,1745,1926,and the talks fall through. There are bound
4353,a01-014,a01-014x.png,/008/a01-014x.png,1926,2108,to be demonstrations.” Yesterday Sir Roy’s
4354,a01-014,a01-014x.png,/008/a01-014x.png,2108,2289,"chief aide, Mr. Julius Greenfield, telephoned"
4355,a01-014,a01-014x.png,/008/a01-014x.png,2289,2463,his chief a report on his talks with Mr.


In [20]:
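# Export is disabled so that re-running the notebook does not overwrite an existing file;
# uncomment to write the precomputed segments as a tab-separated CSV.
#segment_df.to_csv('NeuralNetworkWIP/segments.csv', sep='\t', index=False)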