In [None]:
!pip install transformers datasets torch

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

# Pick the compute device: CUDA when torch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Read the JSONL file into one flat Dataset (asking for split="train" returns a
# Dataset rather than a DatasetDict).
dataset = load_dataset("json", data_files="datasets/dataset.jsonl", split="train")


def _rows_tagged(tag):
    """Return the rows of `dataset` whose own 'split' column equals `tag`."""
    return dataset.filter(lambda row: row['split'] == tag)


# The train/test partition is stored per row in the data itself.
train_dataset = _rows_tagged('train')
test_dataset = _rows_tagged('test')

In [None]:
# Tokenizer for the 'gpt2' checkpoint; the end-of-sequence token is reused
# as the padding token.
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Causal language model for the same checkpoint, moved onto the selected device.
model = AutoModelForCausalLM.from_pretrained('gpt2')
model = model.to(device)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize example text and attach causal-LM labels.

    Args:
        examples: Mapping with a 'text' entry. With `Dataset.map(..., batched=True)`
            this is a list of strings; a single string is also accepted.

    Returns:
        The tokenizer output (truncated/padded to 128 tokens) with an added
        'labels' entry: a copy of 'input_ids' in which every position whose
        attention mask is 0 (i.e. padding) is replaced by -100.
    """
    inputs = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

    def _mask_padding(ids, mask):
        # -100 is the label value the loss skips; real tokens keep their id.
        return [token_id if keep else -100 for token_id, keep in zip(ids, mask)]

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Padding reuses the EOS token id, so padded positions have to be identified
    # through the attention mask, not by token id. Copying input_ids verbatim
    # would make the loss cover the padding as well.
    if input_ids and isinstance(input_ids[0], int):
        # Single, non-batched example: one flat list of token ids.
        inputs['labels'] = _mask_padding(input_ids, attention_mask)
    else:
        inputs['labels'] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    return inputs

# Apply the tokenizer to both splits in batched mode (train first, then test).
tokenized_train, tokenized_test = (
    split_rows.map(tokenize_function, batched=True)
    for split_rows in (train_dataset, test_dataset)
)

In [None]:
# Hyperparameters and output locations for the run, gathered in one mapping
# and then expanded into TrainingArguments.
training_config = {
    'output_dir': 'results/',
    'num_train_epochs': 50,
    'per_device_train_batch_size': 4,
    'per_device_eval_batch_size': 4,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': 'Logs/',
}
training_args = TrainingArguments(**training_config)

# Build the Trainer from the model, the training arguments and the two
# tokenized splits (train for fitting, test as the evaluation set).
trainer = Trainer(
    model,
    training_args,
    eval_dataset=tokenized_test,
    train_dataset=tokenized_train,
)

In [None]:
# Train the model: start the training run on the `trainer` built from `model`,
# `training_args` and the tokenized splits. The call is left as the cell's last
# expression so its return value is shown as the cell output.
trainer.train()

In [None]:
# Write the model and the tokenizer into the same directory so they can be
# reloaded together later.
model_output_dir = 'models/testGpt2'

model.save_pretrained(save_directory=model_output_dir)
tokenizer.save_pretrained(save_directory=model_output_dir)

In [None]:
from transformers import pipeline

# Directory that the training cells saved the model and tokenizer into.
saved_model_dir = "models/testGpt2"

# Load the weights AND the tokenizer from the saved directory, so inference uses
# the tokenizer files stored with the model instead of fetching the stock "gpt2"
# tokenizer from the hub.
qa_model = pipeline("text-generation", model=saved_model_dir, tokenizer=saved_model_dir)

# Generate up to 50 new tokens continuing the prompt and print the full text
# of the first returned sequence.
prompt = "Question: Who built the Vidhana Souda?\nAnswer:"
result = qa_model(prompt, max_new_tokens=50)
print(result[0]["generated_text"])