In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments,AutoModelWithLMHead, TextDataset,DataCollatorForLanguageModeling
from sklearn.model_selection import train_test_split
import pandas as pd
import datasets as ds
import torch

data = pd.read_csv("../data/all_conversational_pairs.csv")
train, test = train_test_split(data, test_size=0.2, random_state=1)
train, eval = train_test_split(train, test_size=0.2, random_state=1)

train.to_csv("../data/gpt/train.csv", index=False)
eval.to_csv("../data/gpt/eval.csv", index=False)
test.to_csv("../data/gpt/test.csv", index=False)

tokenizer = AutoTokenizer.from_pretrained("cjvt/gpt-sl-base")

model = AutoModelForCausalLM.from_pretrained("cjvt/gpt-sl-base")

In [2]:
from transformers import pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
prompt = "Ali je Ljubljana je glavno mesto Slovenije."
generator(prompt, max_length=100, num_return_sequences=1)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Ali je Ljubljana je glavno mesto Slovenije. - v skladu z zakonom o javnih naročilih (ZJN-2, Uradni list RS, št. 39/00, 102/00,  2/04, v nadaljevanju: ZJN-2), - v skladu z zakonom o gospodarskih družbah (ZGD-1), - v skladu z zakonom o javnih'}]

In [3]:
def load_dataset(train_path,test_path,tokenizer):
    train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=train_path,
          block_size=128)
     
    test_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path=test_path,
          block_size=128)   
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset,eval_dataset,data_collator = load_dataset("../data/gpt/train.csv","../data/gpt/eval.csv",tokenizer)



In [4]:
training_args = TrainingArguments(
    output_dir="../models/gpt-ft", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=5, # number of training epochs
    per_device_train_batch_size=8, # batch size for training
    per_device_eval_batch_size=16,  # batch size for evaluation
    eval_steps = 400, # Number of update steps between two evaluations.
    save_steps = 800, # after # steps model is saved 
    warmup_steps = 500,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    save_best_model=True
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [5]:
trainer.train()



  0%|          | 0/36885 [00:00<?, ?it/s]

{'loss': 2.9647, 'learning_rate': 5e-05, 'epoch': 0.07}
{'loss': 2.842, 'learning_rate': 4.931290366909441e-05, 'epoch': 0.14}
{'loss': 2.7902, 'learning_rate': 4.862580733818882e-05, 'epoch': 0.2}
{'loss': 2.739, 'learning_rate': 4.793871100728322e-05, 'epoch': 0.27}
{'loss': 2.7061, 'learning_rate': 4.725161467637763e-05, 'epoch': 0.34}
{'loss': 2.6652, 'learning_rate': 4.656451834547204e-05, 'epoch': 0.41}
{'loss': 2.655, 'learning_rate': 4.587742201456645e-05, 'epoch': 0.47}
{'loss': 2.6362, 'learning_rate': 4.519032568366085e-05, 'epoch': 0.54}
{'loss': 2.5923, 'learning_rate': 4.450322935275526e-05, 'epoch': 0.61}
{'loss': 2.6014, 'learning_rate': 4.381613302184967e-05, 'epoch': 0.68}
{'loss': 2.5698, 'learning_rate': 4.312903669094407e-05, 'epoch': 0.75}
{'loss': 2.5492, 'learning_rate': 4.2441940360038477e-05, 'epoch': 0.81}
{'loss': 2.5491, 'learning_rate': 4.175484402913289e-05, 'epoch': 0.88}
{'loss': 2.5182, 'learning_rate': 4.106774769822729e-05, 'epoch': 0.95}
{'loss': 2.

TrainOutput(global_step=36885, training_loss=1.9642753593834723, metrics={'train_runtime': 6622.7923, 'train_samples_per_second': 44.554, 'train_steps_per_second': 5.569, 'train_loss': 1.9642753593834723, 'epoch': 5.0})

In [6]:
trainer.save_model()

In [None]:
del tokenizer
del model
del generator
torch.cuda.empty_cache()

In [25]:
tokenizer = AutoTokenizer.from_pretrained("cjvt/gpt-sl-base")

model = AutoModelForCausalLM.from_pretrained("..\models\gpt-ft")

generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
prompt = "Ali je Ljubljana glavno mesto Slovenije?"
generator(prompt, max_length=50, num_return_sequences=1)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Ali je Ljubljana glavno mesto Slovenije?,"Dunaj je glavno mesto Velike Britanije, ki je znano po svoji bogati zgodovini, kulturi, gastronomiji in nočnem življenju. Mesto je znano tudi po svoji umetniški in kulturni dediščini'}]