## Imports

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import sys
import numpy as np
from torch import nn
from torchvision.transforms.functional import pil_to_tensor
from torchvision import datasets, transforms #currently unused
import random
import matplotlib.pyplot as plt
from PIL import Image, ImageOps, ImageFilter
from scipy.signal import argrelextrema
import pandas as pd
print("ok")


KeyboardInterrupt: 

In [None]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "xpu"
    if torch.xpu.is_available()
    else "cpu"
)
print(f"Using {device} device")


Using cpu device


## Parameters and Hyperparameters

In [None]:
num_epochs = 10
img_width = 200
img_height = 50
num_characters = 26 #TODO: change for function

## create segment

In [None]:
from utils import SplitImage

si = SplitImage("src/pmml_project/img/a01-043.png")
handwritten_area = si.handwritten_area()
handwritten_area.save('handwritten-a01-043.png')



### Create the horizontal projection of gray values

In [None]:
img = Image.open("handwritten-a01-043.png")
pixels = np.array(img)
horizontal_projection = np.sum(255 - pixels, axis=1)
#plt.plot(horizontal_projection)
plt.imshow(img, cmap='gray')
plt.show()

### Find local minima

In [None]:
# code from https://www.kaggle.com/code/irinaabdullaeva/text-segmentation

def smooth(x, window_len=70, window='hanning'):
#     if x.ndim != 1:
#         raise ValueError("smooth only accepts 1 dimension arrays.") 
    if x.size < window_len:
        raise ValueError("Input vector needs to be bigger than window size.") 
    if window_len<3:
        return x
    if not window in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
        raise ValueError("Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'")
    s = np.r_[x[window_len-1:0:-1],x,x[-2:-window_len-1:-1]]
    #print(len(s))
    if window == 'flat': #moving average
        w = np.ones(window_len,'d')
    else:
        w = eval('np.'+window+'(window_len)')

    y = np.convolve(w/w.sum(),s,mode='valid')
    return y

smoothed = smooth(horizontal_projection, 45, window='flat')
plt.plot(smoothed)

local_minima = argrelextrema(smoothed, np.less)
local_minima = np.array(local_minima).flatten()

### Cropping lines

In [None]:
def crop_lines(local_minima, threshold=0):
    x1 = 0
    cropped = []
    diff = []
    for i, min in enumerate(local_minima):
        x2 = min
        #print(f"x1 = {x1}, x2 = {x2}, diff = {x2-x1}")
        if x2-x1 >= threshold:
            cropped.append((x1, x2))
        x1 = min
    return cropped

def show_cropped_lines(img, cropped):
    plots = len(cropped)
    for i, l in enumerate(cropped):
        line = img[l[0]:l[1]]
        plt.subplot(plots, 1, i+1)
        plt.axis('off')
        _ = plt.imshow(line, cmap='gray')
        plt.xticks([]), plt.yticks([])  # to hide tick values on X and Y axis

cropped = crop_lines(local_minima, 100)
cropped = [(int(x1), int(x2)) for x1, x2 in cropped]
print(cropped)
show_cropped_lines(pixels, cropped)

### Text encoding/decoding

In [3]:
chars = ['\n', ' ', '!', '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¿', '’', '“', '”', '„', '<PAD>']

def encode(text: str):
    encoded_text = []
    for char in text:
        encoded_text.append(chars.index(char))
    return encoded_text

def decode(char_ids: list):
    decoded_text = ''
    for id in char_ids:
        if id != 81: # char_id 81 corresponds to the padding token
            decoded_text += chars[id]
    return decoded_text

def pad_ids(char_ids: list, length: int):
    list_len = len(char_ids)
    if list_len < length:
        char_ids += [81] * (length - list_len)
    print(len(char_ids))
    return char_ids

### Preparing the Dataset

In [None]:
df = pd.read_csv("segments.csv", delimiter="\t")
df

In [None]:
#define nn datastructure
class OCR_dataset(Dataset):
    def __init__(self, df: pd.DataFrame, root_path: str):
        self.df = df
        self.root_path = root_path

    def __len__(self):
        return len(df)

    def __getitem__(self, idx):
        path = self.root_path + self.df.iloc[idx]['file_path']
        start = self.df.iloc[idx]['segment_start']
        end = self.df.iloc[idx]['segment_end']
        text = self.df.iloc[idx]['segment_text']

        target = encode(text)
        target = pad_ids(text)

        image = Image.open(path)
        width, _ = image.size
        image = image.crop((0, start, width, end))
        image = pil_to_tensor(image)
        return image, target

In [None]:
root_path = "/home/jan/.cache/kagglehub/datasets/naderabdelghany/iam-handwritten-forms-dataset/versions/1/data"
data = OCR_dataset(df, root_path)
generator = torch.Generator().manual_seed(299792458) # Generator for reproducability
train_data, eval_data, test_data = random_split(data, [.8, .1, .1], generator)

train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=4,
    shuffle=True,
    num_workers=4
)
test_dataloader = DataLoader(
    dataset=eval_data,
    batch_size=4,
    shuffle=True,
    num_workers=4
)
eval_dataloader = DataLoader(
    dataset=test_data,
    batch_size=4,
    shuffle=True,
    num_workers=4
)

### The Model

In [None]:
#define nn layers
class OCR_neural_network(nn.Module):
    def __init__(self, img_width, img_height, num_characters):
        super().__init__()
        self.rnn_height = img_height//4
        self.rnn_width = img_width//4
        self.rnn_feature_number = self.rnn_height * 64
        
        self.conv_pooling_stack = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding = 1),
            nn.ReLu(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding = 1),
            nn.ReLu(),
            nn.MaxPool2d(2)
        )

        self.after_resize_stack = nn.Sequential(
            nn.Linear(self.rnn_feature_number, 64),
            nn.ReLu(),
            nn.Dropout(0.2)
        )
            
        self.rnn1 = nn.LSTM(64, 128, batch_first = True, bidirectional = True, dropout = 0.25)
        self.rnn2 = nn.LSTM(256, 64, batch_first = True, bidirectional = True, dropout = 0.25)

        self.output_layer = nn.Linear(128, num_characters)
        
        def forward(self, x):
            batch_size, seq_len, channels, height = x.size()
            
            x = self.conv_pooling_stack(x)
    
            #reshape for rnn
            x = x.permute(0, 3, 1, 2)
            x = torch.reshape(x, (batch_size, seq_len, channels * height))    
            
            x = self.after_resize_stack(x)
    
            x, y = self.rnn1(x) #y is not used
            x, y = self.rnn2(x)
    
            x = self.output_layer(x)
            return x



In [None]:
#create instance of model
model = OCR_neural_network(img_width, img_height, num_characters)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

In [None]:
#train NN
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for image_data, label in training_dataloader:
        optimizer.zero_grad()
        outputs = model(image_data)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

        running_loss += loss

    

## LLM 


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

llm_model_name = "Qwen/Qwen3-1.7B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
model = AutoModelForCausalLM.from_pretrained(
    llm_model_name,
    torch_dtype="auto",
    device_map=None
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
input_text = "Note circalation soared for the sixth successive week - "\
"thir time by more than 15,000,000 last weet. "\
"And that brought the fiyure to a record 2,415,000,000." \
"This was 100,000,000 more than the corresponding week last year and 37,000,000 up on"\
"the 1960 record set last Christmus."\
"Now look at the other side of all"\
"these coins."


# prepare the model input
prompt = "You are a text corrector. Only correct spelling and punctuation. Do not edit content. Do not rephrase. Only output the corrected text." \
            f"Input text: {input_text}"
messages = [
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False # Switches between thinking and non-thinking modes. Default is True.
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# conduct text completion
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=32768
)
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 



output = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")

print("input:\n", input_text)
print("output:\n", output)




input:
 Note circalation soared for the sixth successive week - thir time by more than 15,000,000 last weet. And that brought the fiyure to a record 2,415,000,000.This was 100,000,000 more than the corresponding week last year and 37,000,000 up onthe 1960 record set last Christmus.Now look at the other side of allthese coins.
output:
 Note circulation soared for the sixth successive week - third time by more than 15,000,000 last week. And that brought the figure to a record 2,415,000,000. This was 100,000,000 more than the corresponding week last year and 37,000,000 up on the 1960 record set last Christmas. Now look at the other side of all these coins.
