# Greek GPT-2 Fine-tuning

In [None]:
!pip install datasets==3.6.0
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git@main
!pip install evaluate jiwer

## Load Speech Dataset

In [None]:
from datasets import load_dataset, IterableDatasetDict
import os
from datasets import Audio
from datasets import concatenate_datasets


os.environ["CUDA_VISIBLE_DEVICES"] = "0"
language = "Greek"
language_abbr = "el"
language_abbr2 = "el_gr"
task = "transcribe"


a = IterableDatasetDict()
b = IterableDatasetDict()
c = IterableDatasetDict()


a_full = load_dataset("Vardis/Greek_Mosel", split="train")
a_temp = a_full.train_test_split(test_size=0.2, seed=42)  # 80% train 
a_val_test = a_temp["test"].train_test_split(test_size=0.5, seed=42)  # 10% val + 10% test
a["train"] = a_temp["train"]
a["validation"] = a_val_test["train"]
a["test"] = a_val_test["test"]

b_full = load_dataset("mozilla-foundation/common_voice_11_0", language_abbr, split="train+validation+test")
b_temp = b_full.train_test_split(test_size=0.2, seed=42)
b_val_test = b_temp["test"].train_test_split(test_size=0.5, seed=42)
b["train"] = b_temp["train"]
b["validation"] = b_val_test["train"]
b["test"] = b_val_test["test"]

c_full = load_dataset("google/fleurs", language_abbr2, split="train+validation+test")
c_temp = c_full.train_test_split(test_size=0.2, seed=42)
c_val_test = c_temp["test"].train_test_split(test_size=0.5, seed=42)
c["train"] = c_temp["train"]
c["validation"] = c_val_test["train"]
c["test"] = c_val_test["test"]

In [19]:
b = b.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])
c = c.remove_columns(["id", "num_samples", "path", "raw_transcription", "gender", "lang_id", "language", "lang_group_id"])

a = a.rename_column("text", "sentence")
c = c.rename_column("transcription", "sentence")

print(a)
print(b)
print(c)

a = a.cast_column("audio", Audio(sampling_rate=16000))
b = b.cast_column("audio", Audio(sampling_rate=16000))
c = c.cast_column("audio", Audio(sampling_rate=16000))

combined_train = concatenate_datasets([a['train'], b['train'], c['train']])
combined_test = concatenate_datasets([a['test'], b['test'], c['test']])
combined_valid = concatenate_datasets([a['validation'], b['validation'], c['validation']])

combined_dataset = IterableDatasetDict({
    'train': combined_train,
    "validation": combined_valid,
    'test': combined_test
})

dataset = combined_dataset
print(dataset)

IterableDatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 3100
    })
    validation: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 388
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 388
    })
})
IterableDatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 4248
    })
    validation: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 531
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 532
    })
})
IterableDatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 3308
    })
    validation: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 414
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 414
    })
})
IterableDatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
       

## Medical Dataset

In [15]:
from datasets import load_dataset, DatasetDict

dataset = load_dataset("Vardis/Greek_Medical_Text")

# split into train+validation and test
split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

# split the train set into train and validation
train_valid_split = split_dataset["train"].train_test_split(test_size=0.1, seed=42)

medical_dataset = DatasetDict({
    "train": train_valid_split["train"],
    "validation": train_valid_split["test"],
    "test": split_dataset["test"]
})

print(medical_dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 16548
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1839
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2043
    })
})


## Merge and Shuffle Text Datasets

This step combines all sentences/texts from ds1 and ds2, shuffles the combined dataset, and splits it into final training, validation, and test sets.


In [58]:
from datasets import Dataset, DatasetDict, concatenate_datasets

all_sentences = []
for split in ["train", "validation", "test"]:
    part = dataset[split].to_dataset() if hasattr(dataset[split], "to_dataset") else dataset[split]
    all_sentences.extend(part["sentence"])

all_texts = []
for split in ["train", "validation", "test"]:
    all_texts.extend(medical_dataset[split]["text"])

dataset_all = Dataset.from_dict({"sentence": all_sentences})
medical_dataset_all = Dataset.from_dict({"text": all_texts})

final_all = concatenate_datasets([dataset_all, medical_dataset_all]).shuffle(seed=42)

train_test = final_all.train_test_split(test_size=0.06, seed=42)
val_test = train_test["test"].train_test_split(test_size=0.7, seed=42)

final_ds = DatasetDict({
    "train": train_test["train"],
    "validation": val_test["train"],
    "test": val_test["test"]
})

print(final_ds)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 31727
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 607
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1419
    })
})

## Tokenize Dataset 

This step loads the Greek GPT-2 tokenizer, sets the padding token, and tokenizes all texts in final_ds with truncation and padding. Labels are created as a copy of the input IDs for language modeling.


In [59]:
from transformers import AutoTokenizer, EvalPrediction

tokenizer = AutoTokenizer.from_pretrained("lighteternal/gpt2-finetuned-greek")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    tokens = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_dataset = final_ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/31727 [00:00<?, ? examples/s]

Map:   0%|          | 0/607 [00:00<?, ? examples/s]

Map:   0%|          | 0/1419 [00:00<?, ? examples/s]

## Apply LoRA to GPT-2 

The Greek GPT-2 model is loaded and moved to the GPU if available. LoRA is applied to the attention and projection layers to enable parameter-efficient fine-tuning.


In [87]:
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel

# Load the base GPT-2 model
model = AutoModelForCausalLM.from_pretrained("lighteternal/gpt2-finetuned-greek")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Define LoRA Config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the model with LoRA configuration
peft_model = get_peft_model(model, lora_config)

peft_model.print_trainable_parameters()

trainable params: 1,622,016 || all params: 126,061,824 || trainable%: 1.2867




## Compute Perplexity

This function calculates the average loss and perplexity on evaluation data.

In [88]:
import math
from transformers import AutoTokenizer, AutoModelForCausalLM, EvalPrediction


def compute_metrics(eval_pred: EvalPrediction):
    label_ids = eval_pred.label_ids.astype(int)

    label_ids[label_ids == -100] = tokenizer.pad_token_id or tokenizer.eos_token_id

    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    total_loss = 0.0
    total_tokens = 0

    for ref_text in decoded_labels:
        inputs = tokenizer(ref_text, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.to('cuda') for k, v in inputs.items()}
            model.to('cuda')
        with torch.no_grad():
            outputs = peft_model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss.item()
        total_loss += loss * inputs["input_ids"].size(1)
        total_tokens += inputs["input_ids"].size(1)

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)

    return {"perplexity": perplexity}


## Training Setup

In [92]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./gpt2-Greek-Medical",
    num_train_epochs=30,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    include_inputs_for_metrics=True,
    fp16=True,
    learning_rate=5e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="perplexity",
    greater_is_better=False,
    report_to="none"
)

# The data collator will handle batching and padding
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'].select(range(60)), # we used a smaller subset to fit GPU memory
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Using `include_inputs_for_metrics` is deprecated and will be removed in version 5 of 🤗 Transformers. Please use `include_for_metrics` list argument instead.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Training

In [93]:
trainer.train()

Epoch,Training Loss,Validation Loss,Perplexity
1,3.9546,4.202718,44.991269
2,3.953,4.167151,43.984107
3,3.9277,4.142826,43.114071
4,3.9255,4.134766,42.543837
5,3.898,4.117559,42.029292
6,3.8854,4.104939,41.571901
7,3.8607,4.090889,41.216248
8,3.8591,4.071882,40.839166
9,3.8208,4.070119,40.514361
10,3.8164,4.058827,40.221701


TrainOutput(global_step=29760, training_loss=3.8237653009353147, metrics={'train_runtime': 24263.8716, 'train_samples_per_second': 39.227, 'train_steps_per_second': 1.227, 'total_flos': 1.2672153970016256e+17, 'train_loss': 3.8237653009353147, 'epoch': 30.0})

## Push Model and Tokenizer to Hugging Face Hub


In [94]:
peft_model.push_to_hub("Vardis/Medical_Speech_Greek_GPT2", token="################")
tokenizer.push_to_hub("Vardis/Medical_Speech_Greek_GPT2", token="################")

Uploading...:   0%|          | 0.00/6.50M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Vardis/Medical_Speech_Greek_GPT2/commit/b419f7da5adbed1b244d05962881c05b003ca251', commit_message='Upload tokenizer', commit_description='', oid='b419f7da5adbed1b244d05962881c05b003ca251', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Vardis/Medical_Speech_Greek_GPT2', endpoint='https://huggingface.co', repo_type='model', repo_id='Vardis/Medical_Speech_Greek_GPT2'), pr_revision=None, pr_num=None)