## Imports

In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
import torchvision.transforms.functional as TF
import pandas as pd
from PIL import Image
import Levenshtein
from transformers import AutoModelForCausalLM, AutoTokenizer

import kagglehub

print("ok")

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'kagglehub'

In [41]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "xpu"
    if torch.xpu.is_available()
    else "cpu"
)

print(f"Using {device} device")

Using cpu device


## Parameters and Hyperparameters

In [42]:
chars_set = ['\n', ' ', '!', '(', ')', '*', ',', '-', '.', '/',
             '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
             ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
             'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
             'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a',
             'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
             'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
             'v', 'w', 'x', 'y', 'z', '¿', '’', '“', '”', '„',
             '<PAD>']

OCR_CONFIG = {'img_width': 1536,
              'img_height': 128,
              'char_set_size': len(chars_set), # 80 + 1 for padding
              'drop_rate': .2,
              'batch_size': 8}

OCR_CONFIG["max_out_len"] = OCR_CONFIG["img_width"] // 4

### Text encoding/decoding

In [54]:
def encode(text: str):
    encoded_text = []
    for char in text:
        encoded_text.append(chars_set.index(char))
    return encoded_text

def decode(char_ids: list):
    decoded_text = ''
    for id in char_ids:
        if id != 81: # char_id 81 corresponds to the padding token
            decoded_text += chars_set[id]
    return decoded_text

def pad_ids(char_ids: list, length: int):
    list_len = len(char_ids)
    if list_len < length:
        char_ids += [80] * (length - list_len)
    return char_ids

### Preparing the Dataset

In [55]:
#define nn datastructure
class OCR_dataset(Dataset):
    def __init__(self, df: pd.DataFrame, root_path: str):
        self.df = df
        self.root_path = root_path

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        path = self.root_path + self.df.iloc[idx]['file_path']
        start = self.df.iloc[idx]['segment_start']
        end = self.df.iloc[idx]['segment_end']
        text = self.df.iloc[idx]['segment_text']

        target = encode(text)
        target = pad_ids(target, OCR_CONFIG["max_out_len"]) # should actually by size of charset
        target = torch.LongTensor(target)

        image = Image.open(path)
        width, _ = image.size
        image = image.crop((0, start, width, end))
        image = image.resize((OCR_CONFIG['img_width'], OCR_CONFIG['img_height']))
        image = TF.to_tensor(image)
        return image, target

In [56]:
root_path = "iam-handwritten-forms-dataset/versions/1/data"
#root_path = "/home/jan/.cache/kagglehub/datasets/naderabdelghany/iam-handwritten-forms-dataset/versions/1/data"
num_workers = 0


df = pd.read_csv("segments.csv", delimiter="\t")
data = OCR_dataset(df, root_path)
generator = torch.Generator().manual_seed(299792458) # Generator for reproducability
train_data, eval_data, test_data = random_split(data, [.8, .1, .1], generator)

train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=OCR_CONFIG['batch_size'],
    shuffle=True,
    num_workers=num_workers,
    drop_last = True
)
test_dataloader = DataLoader(
    dataset=eval_data,
    batch_size=OCR_CONFIG['batch_size'],
    shuffle=True,
    num_workers=num_workers,
    drop_last = True
)
eval_dataloader = DataLoader(
    dataset=test_data,
    batch_size=OCR_CONFIG['batch_size'],
    shuffle=True,
    num_workers=num_workers,
    drop_last = True
)

### The Model

In [57]:
#define nn layers
class OCR_neural_network(nn.Module):
    def __init__(self, CONFIG: dict):
        super().__init__()
        self.CONFIG = CONFIG
        self.rnn_height = CONFIG['img_height']//4
        self.rnn_width = CONFIG['img_width']//4
        self.rnn_feature_number = self.rnn_height * 64
        
        self.conv_pooling_stack = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, padding = 1),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        self.after_resize_stack = nn.Sequential(
            nn.Linear(self.rnn_feature_number, 64),
            nn.ReLU(),
            nn.Dropout(CONFIG['drop_rate'])
        )
            
        self.rnn1 = nn.LSTM(64, 128, batch_first=True, bidirectional=True, dropout=CONFIG['drop_rate'])
        self.rnn2 = nn.LSTM(256, 64, batch_first=True, bidirectional=True, dropout=CONFIG['drop_rate'])

        self.output_layer = nn.Linear(128, CONFIG['max_out_len'])
        
    def forward(self, x):
        x = self.conv_pooling_stack(x)
    
        #reshape for rnn
        x = x.permute(0, 3, 1, 2)
        x = torch.reshape(x, (self.CONFIG['batch_size'], self.rnn_width, self.rnn_feature_number))    
            
        x = self.after_resize_stack(x)
    
        x, y = self.rnn1(x) #y is not used
        x, y = self.rnn2(x)
    
        x = self.output_layer(x)
        return x



In [58]:
#create instance of model
model = OCR_neural_network(OCR_CONFIG)

total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

def create_weight(dataloader):
    weight = torch.Tensor(OCR_CONFIG["char_set_size"])
    for _, label in dataloader:
        counts = [(label == c).sum().item() for c in range(0,81)]   
        print(counts)
    
        
    
create_weight(train_dataloader)
#criterion = nn.CTCLoss(reduction="mean", zero_infinity=true)
criterion = nn.CrossEntropyLoss(reduction="mean", ignore_index=80)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

Total number of parameters: 563,008
[0, 44, 0, 0, 0, 0, 3, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 20, 2, 5, 10, 34, 5, 3, 16, 11, 0, 3, 12, 4, 13, 13, 7, 0, 16, 18, 19, 5, 4, 5, 1, 8, 0, 0, 0, 0, 0, 0, 2778]
[0, 56, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 21, 3, 2, 7, 37, 6, 8, 17, 21, 2, 2, 12, 8, 20, 17, 2, 0, 18, 17, 28, 11, 4, 6, 0, 10, 0, 0, 0, 0, 0, 0, 2729]
[0, 55, 0, 0, 0, 0, 6, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 2, 0, 1, 2, 2, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 27, 3, 3, 9, 40, 2, 4, 14, 17, 0, 3, 27, 11, 21, 15, 6, 0, 19, 21, 24, 7, 3, 4, 1, 5, 0, 0, 2, 0, 2, 0, 2690]
[0, 40, 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 17, 3, 10, 6, 31, 6, 3, 11, 15, 0, 0

KeyboardInterrupt: 

In [51]:
num_epochs = 3
writer = SummaryWriter()

#train NN
global_step = 0
for epoch in range(num_epochs):
    print("epoch" + str(epoch))
    model.train()
    print("model in training mode")
    loss_list = []

    for image_data, label in train_dataloader:
        #print(label.shape)
        optimizer.zero_grad()
        outputs = model(image_data)
        #print(outputs.shape)
        outputs = outputs.permute(0, 2, 1)
        #print(outputs.shape)
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()

        #loss monitoring
        global_step += 1
        writer.add_scalar(tag='Loss/train', scalar_value=loss.item(), global_step=global_step)
        if global_step % 10 == 1:
            print(str(global_step) + ": " + str(loss.item()))

epoch0
model in training mode
1: 3.22160005569458
11: 3.1799278259277344
21: 3.1777195930480957
31: 3.2858378887176514
41: 3.1685969829559326
51: 3.288931369781494
61: 3.102687358856201
71: 3.1711647510528564
81: 3.1874282360076904
91: 3.2494819164276123
101: 3.1617541313171387
111: 3.207866907119751
121: 3.197951555252075
131: 3.233905792236328
141: 3.402174949645996
151: 3.216827392578125
161: 3.123387336730957
171: 3.220952033996582
181: 3.085259199142456
191: 3.2746951580047607
201: 3.186462879180908
211: 3.1120667457580566


KeyboardInterrupt: 

In [50]:
#model testing

def pick_char(output_tensor):
    output_strings = []
    max_val_indices = torch.argmax(output_tensor, dim = 2)
    for batch in max_val_indices:
        output_strings.append(decode(batch))
    
    return output_strings
    
model.eval()
with torch.no_grad():
    for image_data, label in test_dataloader:
        outputs = model(image_data)
        #outputs = outputs.permute(0, 2, 1)

        print(pick_char(outputs))
        print(label)
        


['teeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee', 'teeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee', 'teeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee

KeyboardInterrupt: 

## LLM 


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

llm_model_name = "Qwen/Qwen3-1.7B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
model = AutoModelForCausalLM.from_pretrained(
    llm_model_name,
    torch_dtype="auto",
    device_map=None
)



In [None]:
def combine_into_a_line(lines: object) -> str:
    line = ""
    for l in lines:
        line += f"{l} "
    if line[-1] == " ":         #remove the space added at the end of the line
        line = line[:-1]
    return line

In [None]:
# the input texts are "a06-119" and "a06-128" respectively
input_text = ["Note circalation soared for the sixth successive week - "\
"thir time by more than 15,000,000 last weet. "\
"And that brought the fiyure to a record 2,415,000,000." \
"This was 100,000,000 more than the corresponding week last year and 37,000,000 up on"\
"the 1960 record set last Christmus."\
"Now look at the other side of all"\
"these coins.", 
"Banks have paid in a first"\
"instalment of almost 8,000,000"\
"in respoonse to the Budgette appeal."\
"About another 70,000,000 is due"\
"by Setember 20. For nearly a year"\
"about 150.000,000 has been frozen."\
"MR. KRUSCHEV raises the bogy of"\
"German militarism in his replies to"\
"the West on Berrlin. And he repeats"\
"that the pro`blam ”must be solved"\
"this year.”"]

def llm_process(input_text: list[str]):
# prepare the model input
    llm_output = []
    for i in input_text:        
        prompt = "You are a text corrector. Only correct spelling and punctuation. Do not edit content. Do not rephrase. Only output the corrected text." \
                    f"Input text: {i}"
        messages = [
            {"role": "user", "content": prompt}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False # Switches between thinking and non-thinking modes. Default is True.
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        # conduct text completion
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=32768
        )
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 
        output = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
        llm_output.append(output)

        print("input:\n", i)
        print("output:\n", output)
    return llm_output

llm_output = llm_process(input_text)

In [None]:
import Levenshtein

def accuracy(labels: list[str], llm_output: list[str]):
    if len(labels) != len(llm_output):
        raise ValueError(f"labels and llm_output must be of same size, received {len(labels)} labels and {len(llm_output)} output")
    
    for i in range(len(labels)):
        accuracy = Levenshtein.ratio(labels[i], llm_output[i]) * 100
        print("Accuracy =", accuracy, "%")


In [None]:
def extract_labels(text_number: list[str]):
    labels = []
    for i in text_number:
        text = df[df["text_number"] == i]
        lines = text["segment_text"]
        label = combine_into_a_line(lines)
        labels.append(label)
    return labels

labels = extract_labels(["a06-119", "a06-128"])
accuracy(labels, llm_output)

print(labels)
print(llm_output)

In [3]:
class LLM():
    def __init__(self, llm_model_name: str, device: str, ocr_output: list[str], dataframe: pd.DataFrame, torch_dtype: str ="auto"):
        self.llm_model_name = "Qwen/Qwen3-1.7B"

        # load the tokenizer and the model
        self.tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            llm_model_name,
            torch_dtype=torch_dtype,
            device_map="auto" if device == "cuda" else None
        )
        self.ocr_output = ocr_output
        self.dataframe = dataframe

        
    def combine_into_a_line(lines: object) -> str:
        line = ""
        for l in lines:
            line += f"{l} "
        if line[-1] == " ":         #remove the space added at the end of the line
            line = line[:-1]
        return line
    
    
    def accuracy(labels: list[str], llm_output: list[str]):
        if len(labels) != len(llm_output):
            raise ValueError(f"labels and llm_output must be of same size, received {len(labels)} labels and {len(llm_output)} output")
        
        for i in range(len(labels)):
            accuracy = Levenshtein.ratio(labels[i], llm_output[i]) * 100
            print("Accuracy =", accuracy, "%")

    def process(self):
        # prepare the model input
        llm_output = []
        for i in self.ocr_output:        
            prompt = "You are a text corrector. Only correct spelling and punctuation. Do not edit content. Do not rephrase. Only output the corrected text." \
                        f"Input text: {i}"
            messages = [
                {"role": "user", "content": prompt}
            ]
            text = self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
                enable_thinking=False # Switches between thinking and non-thinking modes. Default is True.
            )
            model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)

            # conduct text completion
            generated_ids = self.model.generate(
                **model_inputs,
                max_new_tokens=32768
            )
            output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 
            output = self.tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
            llm_output.append(output)
            
        return llm_output

    def extract_labels(text_number: list[str]):
        labels = []
        for i in text_number:
            text = df[df["text_number"] == i]
            lines = text["segment_text"]
            label = combine_into_a_line(lines)
            labels.append(label)
        return labels

In [4]:
llm_model_name = "Qwen/Qwen3-1.7B"
input_text = ["Note circalation soared for the sixth successive week - "\
"thir time by more than 15,000,000 last weet. "\
"And that brought the fiyure to a record 2,415,000,000." \
"This was 100,000,000 more than the corresponding week last year and 37,000,000 up on"\
"the 1960 record set last Christmus."\
"Now look at the other side of all"\
"these coins.", 
"Banks have paid in a first"\
"instalment of almost 8,000,000"\
"in respoonse to the Budgette appeal."\
"About another 70,000,000 is due"\
"by Setember 20. For nearly a year"\
"about 150.000,000 has been frozen."\
"MR. KRUSCHEV raises the bogy of"\
"German militarism in his replies to"\
"the West on Berrlin. And he repeats"\
"that the pro`blam ”must be solved"\
"this year.”"]


llm = LLM(llm_model_name, device, input_text, df)

llm_output = llm.process()

llm_output
