In [None]:
# Install the extra packages this notebook needs, at most once per kernel session.
# NOTE(review): the guard presumably relies on IPython substituting the Python
# variable `pip_done` into `$pip_done` once it exists (it is set at the end of this
# cell); on a fresh kernel the shell then sees an empty value and the installs run.
# Confirm this is the intended mechanism. Versions are not pinned.
! if [ ! $pip_done ]; then pip install -q transformers ;fi 
! if [ ! $pip_done ]; then pip install -q datasets jiwer ;fi 
! if [ ! $pip_done ]; then pip install -q sentencepiece ;fi 

# Mark the installs as done so that re-running this cell skips them.
pip_done = 1

In [None]:
# Fetch the helper repository; the import cell adds its checkout directory to
# sys.path and imports the `models`, `tools` and `data` modules from it.
!git clone https://github.com/OmarMoMorgan/Arabic_scene_text.git

In [None]:
import torch
import pandas as pd
import numpy as np
from PIL import Image
from tqdm.notebook import tqdm
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, AdamW
from sklearn.model_selection import train_test_split
from torch import nn , optim


import sys
sys.path.append('/kaggle/working/Arabic_scene_text') #change this to the name of the repo
from models import build_model , KAN
from tools import EarlyStopping, tune_model , replace_specific_layers , generate_text_with_caption
from data import train_test_split_ , perPixel_mean_std, perChannel_mean_std, build_transforms

In [None]:
# Folder holding the training images; gt.txt inside it is read as a two-column CSV
# (image path, text). Column names are supplied here rather than read from the file.
root_dir = "/kaggle/input/str-arabic-dataset/Arabic_words_train"
column_names = ['image_path', 'text']
df = pd.read_csv(
    "/kaggle/input/str-arabic-dataset/Arabic_words_train/gt.txt",
    names=column_names,
    nrows=4186,  # only the first 4186 rows are used
)

# 80/10/10 split: hold out 20% of the rows, then halve the hold-out into
# test and validation. Both splits use the same fixed seed.
test_size = 0.2
train_df, holdout_df = train_test_split(df, random_state=42, test_size=test_size)
test_df, val_df = train_test_split(holdout_df, random_state=42, test_size=0.5)



In [None]:
# Give every split a fresh 0..n-1 index, discarding the row labels inherited from df.
train_df, val_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, val_df, test_df)
)

In [None]:
# Show the (rows, columns) shape of each split. The validation entry previously
# lacked `.shape`, which dumped the whole validation frame into the output.
train_df.shape, val_df.shape, test_df.shape

Define the dataset class that feeds the data loaders.

In [None]:
class ArabicSTRDataset(Dataset):
    """Dataset pairing images listed in a dataframe with tokenized label ids.

    Args:
        root_dir: Directory that the ``image_path`` values are relative to.
        df: DataFrame with an ``image_path`` and a ``text`` column; rows are
            accessed by position.
        processor: Callable invoked as ``processor(image, return_tensors="pt")``
            whose result exposes ``pixel_values``.
        tokenizer: Callable used to encode the text; its result exposes
            ``input_ids`` and the tokenizer provides ``pad_token_id``.
        max_target_length: Length every label sequence is padded (and now
            truncated) to.
    """

    def __init__(self, root_dir, df, processor, tokenizer, max_target_length):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.tokenizer = tokenizer
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"pixel_values": ..., "labels": ...}`` for the row at position ``idx``."""
        row = self.df.iloc[idx]
        file_name = row['image_path']
        text = row['text']

        # Load the image inside a context manager so the underlying file handle is
        # closed right away instead of lingering until garbage collection.
        image_path = f"{self.root_dir}/{file_name}"
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        # Pad AND truncate to a fixed length: without truncation a text longer than
        # max_target_length yields a longer tensor and batches can no longer be stacked.
        labels = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_target_length,
            return_tensors="pt",
        ).input_ids
        # Drop only the leading batch dimension (a bare squeeze() would also collapse
        # any other size-1 dimension).
        labels = labels.squeeze(0)
        # Replace padding ids with -100, the default ignore_index of PyTorch's
        # cross-entropy loss, so padded positions presumably do not affect the loss.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {"pixel_values": pixel_values.squeeze(0), "labels": labels}

In [None]:
# Run on the GPU when CUDA reports one as available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

Load the pretrained model, using a GPT-2-style tokenizer and processor; the backbone of the network is a vision transformer with the DeiT architecture and pretrained weights.

In [None]:
# Load the pretrained encoder-decoder checkpoint and the processor that prepares
# images and provides the tokenizer, then move the model to the selected device.
# NOTE(review): the model comes from the "trocr-small-stage1" checkpoint while the
# processor comes from "trocr-base-stage1" — confirm this pairing is intentional,
# since the two checkpoints may not share the same tokenizer vocabulary.
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-stage1")
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-stage1')
tokenizer_ = processor.tokenizer
model.to(device)

Here we modify the model by replacing the decoder's `fc1` and `fc2` layers with KANs.

In [None]:
# Sizes passed to the replacement layers.
# NOTE(review): presumably these are the input/output feature sizes of the decoder's
# fc1 (256 -> 1024) and fc2 (1024 -> 256) layers — confirm against the loaded model.
in1 = 256
in2 = 1024
out1 = 1024
out2 = 256
# Swap the layers named 'fc1' and 'fc2' inside the decoder for KAN layers; the exact
# matching and construction rules live in tools.replace_specific_layers.
replace_specific_layers(model.decoder, 'fc1',in1, out1,KAN)
replace_specific_layers(model.decoder, 'fc2',in2, out2,KAN)


Parameters set on the model config, using the special tokens from the processor's tokenizer.

In [None]:
model_cfg = model.config
special_tokens = processor.tokenizer

# Special token ids are taken from the processor's tokenizer:
# CLS starts decoding, SEP ends it, PAD fills the remainder.
model_cfg.decoder_start_token_id = special_tokens.cls_token_id
model_cfg.eos_token_id = special_tokens.sep_token_id
model_cfg.pad_token_id = special_tokens.pad_token_id

# Generation settings stored on the model config.
model_cfg.max_length = 512
model_cfg.num_beams = 4
model_cfg.early_stopping = True
model_cfg.no_repeat_ngram_size = 3
model_cfg.length_penalty = 2.0

# Batch size used when building the data loaders.
batch_size = 8

In [None]:
def _make_split_dataset(split_df):
    """Wrap one dataframe split in an ArabicSTRDataset using the shared settings."""
    return ArabicSTRDataset(
        root_dir=root_dir,
        df=split_df,
        processor=processor,
        tokenizer=processor.tokenizer,
        max_target_length=100,
    )


# One dataset per split; all three share the same processor, tokenizer and label length.
train_dataset = _make_split_dataset(train_df)
eval_dataset = _make_split_dataset(val_df)
test_dataset = _make_split_dataset(test_df)

In [None]:
# Only the training loader shuffles; evaluation and test keep the dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
eval_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size) for split in (eval_dataset, test_dataset)
)

In [None]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated and
# absent from recent releases. eps and weight_decay are set to the defaults the
# transformers implementation used, so the update rule stays close to the original.
optimizer = optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# Cut the learning rate by 10x when the monitored value (mode 'min') has not
# improved for 5 scheduler steps.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,'min',patience = 5,factor = 0.1,verbose=True)
# Early-stopping helper from the project's tools module, with its default settings.
earlystopping = EarlyStopping()

In [None]:
# Run the fine-tuning loop from the project's tools module and keep its return value.
# NOTE(review): the leading 100 is presumably the number of epochs — confirm against
# tools.tune_model.
hist_ = tune_model(
    100,
    model,
    train_dataloader,
    eval_dataloader,
    optimizer,
    device,
    tokenizer_,
    scheduler,
    earlystopping=earlystopping,
)

In [None]:
#!rm -rf /kaggle/working/*