In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import json

In [3]:
with open('/kaggle/input/my-own-dataset/data.json', 'r') as file:
    data = json.load(file)

In [45]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset

model = AutoModelForSeq2SeqLM.from_pretrained("/kaggle/working/fine_tuned_model")
tokenizer = AutoTokenizer.from_pretrained("MIIB-NLP/Arabic-question-generation")

In [30]:
dataset = Dataset.from_list(data)

In [31]:
dataset = dataset.shuffle(seed=42).select(range(10000))

In [32]:
def tokenize(example):
    input_text = "context: " + example['context'][0] + " answer: " + example['answer'][0] + " </s>"
    target_text = example['question']
    
    # Tokenize input text
    input_encoding = tokenizer(input_text, padding='max_length', truncation=True, max_length=512)
    input_ids = input_encoding['input_ids']
    attention_mask = input_encoding['attention_mask']
    
    # Tokenize target text
    target_encoding = tokenizer(target_text, padding='max_length', truncation=True, max_length=512)
    target_ids = target_encoding['input_ids']
    
    # Prepare decoder input IDs
    # For auto-regressive models, like GPT, the decoder input is the target sequence shifted by one
    decoder_input_ids = target_ids[:-1]  # Remove last token, as it should not be fed into the model
    
    # Return the tokenized inputs as a dictionary
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'decoder_input_ids': decoder_input_ids,  # Specify decoder input IDs
        'labels': target_ids[1:],  # Shifted target IDs for computing loss
    }

# Tokenize the dataset
dataset = dataset.map(tokenize, batch_size=len(dataset))

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    save_strategy="no" 
)



# Create the Trainer and train
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset,               # training dataset
)

trainer.train()

# Save the model
# model.save_pretrained('./fine_tuned_model')

In [34]:
model.save_pretrained('./fine_tuned_model')

In [46]:
def get_question(context,answer):
  text="context: " +context + " " + "answer: " + answer + " </s>"
  text_encoding = tokenizer.encode_plus(
      text,return_tensors="pt"
  )
  model.eval()
  generated_ids =  model.generate(
    input_ids=text_encoding['input_ids'],
    attention_mask=text_encoding['attention_mask'],
    max_length=64,
    num_beams=5,
    num_return_sequences=1
  )
  return tokenizer.decode(generated_ids[0],skip_special_tokens=True,clean_up_tokenization_spaces=True).replace('question: ',' ')

In [48]:
context="الثورة الجزائرية أو ثورة المليون شهيد، اندلعت في 1 نوفمبر 1954 ضد المستعمر الفرنسي ودامت 7 سنوات ونصف. استشهد فيها أكثر من مليون ونصف مليون جزائري"
answer =" 7 سنوات"

get_question(context,answer)

''

In [37]:
model.to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(110080, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(110080, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo