## Imports

In [105]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
import torchvision.transforms.functional as TF
import pandas as pd
from PIL import Image
import Levenshtein
from transformers import AutoModelForCausalLM, AutoTokenizer

import kagglehub

print("ok")

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'kagglehub'

In [106]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "xpu"
    if torch.xpu.is_available()
    else "cpu"
)

print(f"Using {device} device")

Using xpu device


## Parameters and Hyperparameters

In [107]:
char_set = ['\n', ' ', '!', '(', ')', '*', ',', '-', '.', '/',
             '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
             ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
             'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
             'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a',
             'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
             'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
             'v', 'w', 'x', 'y', 'z', '¿', '’', '“', '”', '„']

OCR_CONFIG = {'img_width': 1536,
              'img_height': 128,
              'max_out_len': 96,
              'char_set_size': 80,#len(char_set), # 80 + 1 for padding
              'drop_rate': .1,
              'batch_size': 1}

### Text encoding/decoding

In [108]:
def encode(text: str):
    encoded_text = []
    for char in text:
        encoded_text.append(char_set.index(char))
    return encoded_text

def decode(char_ids: list):
    decoded_text = ''
    for id in char_ids:
        if id != OCR_CONFIG['char_set_size']: # char_id 80 corresponds to the blank token
            decoded_text += char_set[id]
    return decoded_text

def pad_ids(char_ids: list, length: int):
    return char_ids
    list_len = len(char_ids)
    if list_len < length:
        char_ids += [OCR_CONFIG['char_set_size']-1] * (length - list_len)
    return char_ids

### Preparing the Dataset

In [109]:
#define nn datastructure
class OCR_dataset(Dataset):
    def __init__(self, df: pd.DataFrame, root_path: str):
        self.df = df
        self.root_path = root_path

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        path = self.root_path + self.df.iloc[idx]['file_path']
        start = self.df.iloc[idx]['segment_start']
        end = self.df.iloc[idx]['segment_end']
        text = self.df.iloc[idx]['segment_text']

        target = encode(text)
        target = pad_ids(target, OCR_CONFIG["max_out_len"])
        #target = torch.LongTensor(target)
        target = torch.tensor(target)

        image = Image.open(path)
        width, _ = image.size
        image = image.crop((0, start, width, end))
        image = image.resize((OCR_CONFIG['img_width'], OCR_CONFIG['img_height']))
        image = TF.to_tensor(image)
        return image, target

In [110]:
#root_path = "iam-handwritten-forms-dataset/versions/1/data"
root_path = "/home/jan/.cache/kagglehub/datasets/naderabdelghany/iam-handwritten-forms-dataset/versions/1/data"
num_workers = 0


df = pd.read_csv("segments.csv", delimiter="\t")
data = OCR_dataset(df, root_path)
generator = torch.Generator().manual_seed(299792458) # Generator for reproducability
train_data, eval_data, test_data = random_split(data, [.8, .1, .1], generator)

train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=OCR_CONFIG['batch_size'],
    shuffle=True,
    num_workers=num_workers,
    drop_last = True
)
test_dataloader = DataLoader(
    dataset=eval_data,
    batch_size=OCR_CONFIG['batch_size'],
    shuffle=True,
    num_workers=num_workers,
    drop_last = True
)
eval_dataloader = DataLoader(
    dataset=test_data,
    batch_size=OCR_CONFIG['batch_size'],
    shuffle=True,
    num_workers=num_workers,
    drop_last = True
)

### The Model

In [111]:
#define nn layers
class OCR_neural_network(nn.Module):
    def __init__(self, CONFIG: dict):
        super().__init__()
        self.CONFIG = CONFIG
        self.rnn_height = CONFIG['img_height']//4
        self.rnn_width = CONFIG['img_width']//4
        self.rnn_feature_number = self.rnn_height * 64
        
        self.conv_pooling_stack = nn.Sequential(#  in:   1 * 128 * 1536
            nn.Conv2d(1, 32, 3, padding=1),     # out:  32 * 128 * 1536
            nn.ReLU(),
            nn.MaxPool2d(2),                    # out:  32 *  64 *  768
            nn.Conv2d(32, 64, 3, padding=1),    # out:  64 *  64 *  768
            nn.ReLU(),
            nn.MaxPool2d(2),                    # out:  64 *  32 *  384
            nn.Conv2d(64, 128, 3, padding=1),   # out: 128 *  32 *  384
            nn.ReLU(),
            nn.MaxPool2d(2),                    # out: 128 *  16 *  192
            nn.Conv2d(128, 256, 3, padding=1),  # out: 256 *  16 *  192
            nn.ReLU(),
            nn.MaxPool2d(2),                    # out: 256 *   8 *   96
        )

        self.linear_stack = nn.Sequential(
            nn.Linear(CONFIG['img_height']*16, 256),
            nn.ReLU(),
            nn.Dropout(CONFIG['drop_rate']),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(CONFIG['drop_rate'])
        )
            
        self.rnn1 = nn.LSTM(128, 128, batch_first=True, bidirectional=True, dropout=CONFIG['drop_rate'])
        self.rnn2 = nn.LSTM(256, 64, batch_first=True, bidirectional=True, dropout=CONFIG['drop_rate'])

        #self.output_layer = nn.Linear(128, CONFIG['char_set_size'])

        #trial
        self.output_layer1 = nn.Linear(96, 96)
        self.output_layer2 = nn.Linear(128, 81)

        
    #print(f'IN:\t\t{x.size()}')
    def forward(self, x):
        #  in:   1 * 128 * 1536

        x = self.conv_pooling_stack(x)
        # out: 256 *   8 *  96
    
        #reshape for rnn
        x = x.permute(0, 3, 1, 2)
        # out: 96 * 256 * 8
        x = torch.reshape(x, (self.CONFIG['batch_size'], 96, 2048))    
        # out: 96 * 2048
            
        x = self.linear_stack(x)
        # out: 96 * 128
    
        x, y = self.rnn1(x) #y is not used
        # out: 96 * 256
        x, y = self.rnn2(x)
        # out: 96 * 128
    
        x = x.permute(0, 2, 1)
        x = self.output_layer1(x)
        x = x.permute(0, 2, 1)
        x = self.output_layer2(x)
        # out: 96 * 81

        x = x.permute(0, 2, 1)

        return x



In [112]:
def pick_char(output_tensor):
    output_strings = []
    max_val_indices = torch.argmax(output_tensor, dim=1)
    for batch in max_val_indices:
        output_strings.append(decode(batch))
    
    return output_strings

In [113]:
#create instance of model
model = OCR_neural_network(OCR_CONFIG)
criterion = nn.CrossEntropyLoss(reduction="mean", ignore_index=80)
criterion = nn.CTCLoss(blank=80, reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
writer = SummaryWriter()
num_epochs = 5

total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

#train NN
global_step = 0
for epoch in range(num_epochs):
    print("epoch" + str(epoch))
    model.train()
    print("model in training mode")
    loss_list = []

    for image_data, label in train_dataloader:
        #print(label.shape)
        optimizer.zero_grad()
        outputs = model(image_data)
        #print(outputs.shape)
        #outputs = outputs.permute(0, 2, 1)

        print(pick_char(outputs))
        print([decode(lab) for lab in label])

        input_lengths = [len(out) for out in outputs]
        output_lengths = [len(lab) for lab in label]
        outputs = outputs.permute(2, 0, 1)
        loss = criterion(outputs, label, input_lengths, output_lengths)
        loss.backward()
        optimizer.step()

        #loss monitoring
        global_step += 1
        writer.add_scalar(tag='Loss/train', scalar_value=loss.item(), global_step=global_step)
        #if global_step % 10 == 1:
        print(str(global_step) + ": " + str(loss.item()))

Total number of parameters: 1,394,097
epoch0
model in training mode




['j333333Ymm33Ym33m333Ys33YY3333mYY3Ysm3mmmm3mEYmY3mm333333333Y3Y33333m3m3mY333m33Y3mYmm3m3Y3:mYs3']
['Villiers Graaff, have been returned unopposed.']
1: -1.3978908061981201
['s3333333mm33Ym33m333Ym33EY3333mEY3Esm3mmmm3mEYmY3mm333333333Y3Y33333m3m3mY333m33Y3mYmm3m3Y33mYs3']
['eyes there has sprung up a regular army led by former Nazi']
2: -1.0156714916229248
['E333333:mm33:m33m333Ys33EY3333mEY3EEm3mmmm3mEYYY3mm333333333Y3Y33333m3m3mE333m33Y3mYmm3m3Y33mYE3']
['the-Bomb crusade, has a devoted following.']
3: -1.7194952964782715
['E333333:Ym33:(33Y33?YE33:Y33:3m:Y3:E(3mmmm3mEYYY3m(3:3:33333:3:33333Y3m3YY333m33:3(Y(m3Y3Y3(m:E3']
['of life, and all facts are ”one”. We have all']
4: -1.5102437734603882
[':3(:(33:Y(3Y:(33:3(:::33::33:3Y::3::(3mYYY3(:::::((::3:33::3:::33(3::3m:Y:::3m:3:3(Y(m3Y3Y3((::(']
['brought a bark of „Start another war!“']
5: -1.8680968284606934
[':3(:(33::(3::(::::(::::::::3:3:::3::(3m:Y:3(:::::((::(:3:::3:::(3(3::3m::::::(:3:((Y(m(Y((3((::(']
['their balances, will ga

KeyboardInterrupt: 

In [None]:
#model testing

def pick_char(output_tensor):
    output_strings = []
    max_val_indices = torch.argmax(output_tensor, dim = 2)
    for batch in max_val_indices:
        output_strings.append(decode(batch))
    
    return output_strings
    
model.eval()
with torch.no_grad():
    for image_data, label in test_dataloader:
        outputs = model(image_data)
        #outputs = outputs.permute(0, 2, 1)

        print(pick_char(outputs))
        print(label)
        


## LLM 


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

llm_model_name = "Qwen/Qwen3-1.7B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
model = AutoModelForCausalLM.from_pretrained(
    llm_model_name,
    torch_dtype="auto",
    device_map=None
)



In [None]:
def combine_into_a_line(lines: object) -> str:
    line = ""
    for l in lines:
        line += f"{l} "
    if line[-1] == " ":         #remove the space added at the end of the line
        line = line[:-1]
    return line

In [None]:
# the input texts are "a06-119" and "a06-128" respectively
input_text = ["Note circalation soared for the sixth successive week - "\
"thir time by more than 15,000,000 last weet. "\
"And that brought the fiyure to a record 2,415,000,000." \
"This was 100,000,000 more than the corresponding week last year and 37,000,000 up on"\
"the 1960 record set last Christmus."\
"Now look at the other side of all"\
"these coins.", 
"Banks have paid in a first"\
"instalment of almost 8,000,000"\
"in respoonse to the Budgette appeal."\
"About another 70,000,000 is due"\
"by Setember 20. For nearly a year"\
"about 150.000,000 has been frozen."\
"MR. KRUSCHEV raises the bogy of"\
"German militarism in his replies to"\
"the West on Berrlin. And he repeats"\
"that the pro`blam ”must be solved"\
"this year.”"]

def llm_process(input_text: list[str]):
# prepare the model input
    llm_output = []
    for i in input_text:        
        prompt = "You are a text corrector. Only correct spelling and punctuation. Do not edit content. Do not rephrase. Only output the corrected text." \
                    f"Input text: {i}"
        messages = [
            {"role": "user", "content": prompt}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False # Switches between thinking and non-thinking modes. Default is True.
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

        # conduct text completion
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=32768
        )
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 
        output = tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
        llm_output.append(output)

        print("input:\n", i)
        print("output:\n", output)
    return llm_output

llm_output = llm_process(input_text)

In [None]:
import Levenshtein

def accuracy(labels: list[str], llm_output: list[str]):
    if len(labels) != len(llm_output):
        raise ValueError(f"labels and llm_output must be of same size, received {len(labels)} labels and {len(llm_output)} output")
    
    for i in range(len(labels)):
        accuracy = Levenshtein.ratio(labels[i], llm_output[i]) * 100
        print("Accuracy =", accuracy, "%")


In [None]:
def extract_labels(text_number: list[str]):
    labels = []
    for i in text_number:
        text = df[df["text_number"] == i]
        lines = text["segment_text"]
        label = combine_into_a_line(lines)
        labels.append(label)
    return labels

labels = extract_labels(["a06-119", "a06-128"])
accuracy(labels, llm_output)

print(labels)
print(llm_output)

In [3]:
class LLM():
    def __init__(self, llm_model_name: str, device: str, ocr_output: list[str], dataframe: pd.DataFrame, torch_dtype: str ="auto"):
        self.llm_model_name = "Qwen/Qwen3-1.7B"

        # load the tokenizer and the model
        self.tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            llm_model_name,
            torch_dtype=torch_dtype,
            device_map="auto" if device == "cuda" else None
        )
        self.ocr_output = ocr_output
        self.dataframe = dataframe

        
    def combine_into_a_line(lines: object) -> str:
        line = ""
        for l in lines:
            line += f"{l} "
        if line[-1] == " ":         #remove the space added at the end of the line
            line = line[:-1]
        return line
    
    
    def accuracy(labels: list[str], llm_output: list[str]):
        if len(labels) != len(llm_output):
            raise ValueError(f"labels and llm_output must be of same size, received {len(labels)} labels and {len(llm_output)} output")
        
        for i in range(len(labels)):
            accuracy = Levenshtein.ratio(labels[i], llm_output[i]) * 100
            print("Accuracy =", accuracy, "%")

    def process(self):
        # prepare the model input
        llm_output = []
        for i in self.ocr_output:        
            prompt = "You are a text corrector. Only correct spelling and punctuation. Do not edit content. Do not rephrase. Only output the corrected text." \
                        f"Input text: {i}"
            messages = [
                {"role": "user", "content": prompt}
            ]
            text = self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
                enable_thinking=False # Switches between thinking and non-thinking modes. Default is True.
            )
            model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)

            # conduct text completion
            generated_ids = self.model.generate(
                **model_inputs,
                max_new_tokens=32768
            )
            output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 
            output = self.tokenizer.decode(output_ids, skip_special_tokens=True).strip("\n")
            llm_output.append(output)
            
        return llm_output

    def extract_labels(text_number: list[str]):
        labels = []
        for i in text_number:
            text = df[df["text_number"] == i]
            lines = text["segment_text"]
            label = combine_into_a_line(lines)
            labels.append(label)
        return labels

In [4]:
llm_model_name = "Qwen/Qwen3-1.7B"
input_text = ["Note circalation soared for the sixth successive week - "\
"thir time by more than 15,000,000 last weet. "\
"And that brought the fiyure to a record 2,415,000,000." \
"This was 100,000,000 more than the corresponding week last year and 37,000,000 up on"\
"the 1960 record set last Christmus."\
"Now look at the other side of all"\
"these coins.", 
"Banks have paid in a first"\
"instalment of almost 8,000,000"\
"in respoonse to the Budgette appeal."\
"About another 70,000,000 is due"\
"by Setember 20. For nearly a year"\
"about 150.000,000 has been frozen."\
"MR. KRUSCHEV raises the bogy of"\
"German militarism in his replies to"\
"the West on Berrlin. And he repeats"\
"that the pro`blam ”must be solved"\
"this year.”"]


llm = LLM(llm_model_name, device, input_text, df)

llm_output = llm.process()

llm_output
