In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, TextDataset,DataCollatorForLanguageModeling, AutoConfig
from sklearn.model_selection import train_test_split
import pandas as pd
import datasets as ds
import torch
import re

data = pd.read_csv("../data/processed/context_reply_pairs.csv", sep=";")

train = data[data['split'] == 'train']
test = data[data['split'] == 'test']
val = data[data['split'] == 'val']

train.to_csv("../data/gpt/train_2.csv", index=False)
test.to_csv("../data/gpt/test_2.csv", index=False)
val.to_csv("../data/gpt/val_2.csv", index=False)

tokenizer = AutoTokenizer.from_pretrained("cjvt/gpt-sl-base", truncation=True, truncation_side='left')

model = AutoModelForCausalLM.from_pretrained("cjvt/gpt-sl-base")

In [12]:
def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset, eval_dataset, data_collator = load_dataset("../data/gpt/train_2.csv","../data/gpt/eval_2.csv", tokenizer)

In [15]:
training_args = TrainingArguments(
    output_dir="../models/gpt-ft-2", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=1, # number of training epochs
    per_device_train_batch_size=8, # batch size for training
    per_device_eval_batch_size=16,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps = 800, # after # steps model is saved 
    warmup_steps = 500,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    load_best_model_at_end=True,
    save_strategy = "no"
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [16]:
trainer.train()



  0%|          | 0/16980 [00:00<?, ?it/s]

{'loss': 2.7504, 'learning_rate': 5e-05, 'epoch': 0.03}
{'loss': 2.5868, 'learning_rate': 4.8483009708737866e-05, 'epoch': 0.06}
{'loss': 2.53, 'learning_rate': 4.696601941747573e-05, 'epoch': 0.09}
{'loss': 2.4703, 'learning_rate': 4.544902912621359e-05, 'epoch': 0.12}
{'loss': 2.4347, 'learning_rate': 4.393203883495146e-05, 'epoch': 0.15}
{'loss': 2.3789, 'learning_rate': 4.2415048543689325e-05, 'epoch': 0.18}
{'loss': 2.3514, 'learning_rate': 4.089805825242719e-05, 'epoch': 0.21}
{'loss': 2.3267, 'learning_rate': 3.938106796116505e-05, 'epoch': 0.24}
{'loss': 2.2705, 'learning_rate': 3.7864077669902914e-05, 'epoch': 0.27}
{'loss': 2.2519, 'learning_rate': 3.634708737864078e-05, 'epoch': 0.29}
{'loss': 2.2323, 'learning_rate': 3.483009708737864e-05, 'epoch': 0.32}
{'loss': 2.2026, 'learning_rate': 3.3313106796116504e-05, 'epoch': 0.35}
{'loss': 2.1608, 'learning_rate': 3.1796116504854373e-05, 'epoch': 0.38}
{'loss': 2.1502, 'learning_rate': 3.0279126213592237e-05, 'epoch': 0.41}
{'lo

TrainOutput(global_step=16980, training_loss=2.1425422551634736, metrics={'train_runtime': 2798.3903, 'train_samples_per_second': 48.54, 'train_steps_per_second': 6.068, 'train_loss': 2.1425422551634736, 'epoch': 1.0})

In [17]:
trainer.save_model()

In [18]:
del tokenizer
del model
torch.cuda.empty_cache()

In [21]:
tokenizer = AutoTokenizer.from_pretrained("cjvt/gpt-sl-base")

model = AutoModelForCausalLM.from_pretrained("..\models\gpt-ft-2")

In [22]:
test_data = pd.read_csv("../data/gpt/test.csv").head(n = 50)

In [23]:
sens = test_data["prompt"].tolist()

In [24]:
prompts = tokenizer(sens, return_tensors='pt', padding=True, truncation=True)

In [36]:
with torch.no_grad():
  gen_tokens = model.generate(
      **prompts,
      do_sample=False,
      max_new_tokens=50,
  )
gen_text = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
gen_text = [re.sub('[^a-zA-Z0-9čšž\ \.!?,]+', '', _) for _ in gen_text]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [37]:
test_data["generated"] = gen_text
test_data.to_csv("../data/gpt_results/test_with_generated_2.csv", index=False)

In [44]:
test_data["prompt"][10]

'Povej mi več o neumnem, neumnem Elonu Musku in njegovih neumnih dogodivščinah nakupa Twitterja.'

In [45]:
test_data["generated"][10]

' Povej mi več o neumnem, neumnem Elonu Musku in njegovih neumnih dogodivščinah nakupa Twitterja.'