In [2]:
from datasets import load_dataset
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
dataset = load_dataset("heliosbrahma/mental_health_chatbot_dataset")

In [3]:
from datasets import Dataset

def format_for_gpt2(example):
    return {"text": example["text"] + "\n\n"}

dataset = dataset.map(format_for_gpt2)

In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

model_name = "openai-community/gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name, token=os.getenv("HUGGING_FACE_TOKEN"))
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name,token=os.getenv("HUGGING_FACE_TOKEN"))

In [5]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [14]:
from transformers import Trainer, TrainingArguments
import transformers
print(transformers.__version__)
def add_labels(example):
    example["labels"] = example["input_ids"].copy()
    return example

tokenized_datasets = tokenized_datasets.map(add_labels)

training_args = TrainingArguments(
    output_dir="../models/gpt2_mental_health_ft",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=20,
    learning_rate=3e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
)

trainer.train()

4.57.1


Map: 100%|██████████| 172/172 [00:00<00:00, 4081.47 examples/s]


Step,Training Loss
20,1.6145
40,1.5463
60,1.4709
80,1.4432
100,1.3487




TrainOutput(global_step=110, training_loss=1.4826373447071421, metrics={'train_runtime': 548.0287, 'train_samples_per_second': 1.569, 'train_steps_per_second': 0.201, 'total_flos': 112355573760000.0, 'train_loss': 1.4826373447071421, 'epoch': 5.0})

In [15]:
trainer.save_model("./models/gpt2_mental_health_ft")
tokenizer.save_pretrained("./models/gpt2_mental_health_ft")

('./models/gpt2_mental_health_ft\\tokenizer_config.json',
 './models/gpt2_mental_health_ft\\special_tokens_map.json',
 './models/gpt2_mental_health_ft\\vocab.json',
 './models/gpt2_mental_health_ft\\merges.txt',
 './models/gpt2_mental_health_ft\\added_tokens.json')