In [None]:
import gc
import re

import datasets as ds
import nltk
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling, AutoConfig

# Load the prompt/reply pairs. The file is semicolon-separated and carries a
# "split" column whose values ("train" / "val" / "test") assign each row to a split.
data = pd.read_csv("../../data/processed/prompt_reply_pairs.csv", sep=";")

# Build one Hugging Face Dataset per split, keeping only the two text columns.
# preserve_index=False stops the filtered frame's (non-contiguous) pandas index
# from being stored as an extra "__index_level_0__" column in the Dataset.
train_data, val_data, test_data = (
    ds.Dataset.from_pandas(
        data.loc[data["split"] == split_name, ["prompt", "reply"]],
        preserve_index=False,
    )
    for split_name in ("train", "val", "test")
)

# truncation_side='left' makes over-long inputs lose their beginning rather
# than their end when truncation is requested at call time.
# NOTE(review): `truncation=True` here is only stored as an init kwarg; actual
# truncation is controlled by the arguments passed when calling the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("cjvt/gpt-sl-base", truncation=True, truncation_side='left')

# Later cells pad batches, which requires a padding token. GPT-style tokenizers
# often ship without one, so fall back to the end-of-sequence token if needed.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("cjvt/gpt-sl-base")

In [None]:
def convert_to_features(examples):
    """Tokenise a batch of prompt/reply pairs for causal-LM fine-tuning.

    A decoder-only model learns a reply only if the reply is part of the
    sequence it is trained on. The previous implementation tokenised the reply
    separately as seq2seq-style ``labels``; the
    ``DataCollatorForLanguageModeling(mlm=False)`` used by this notebook
    rebuilds ``labels`` from ``input_ids``, so those replies were discarded and
    the model was trained on the prompts alone. Here each example becomes one
    sequence: ``"Uporabnik: " + prompt + " " + reply + <eos>``.

    Args:
        examples: Batch mapping (as passed by ``Dataset.map(batched=True)``)
            with equally long lists under ``"prompt"`` and ``"reply"``.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) padded or
        truncated to 512 tokens, plus ``labels``: a copy of ``input_ids`` with
        padding positions set to -100 so they are ignored by the loss.

    Notes:
        Relies on the module-level ``tokenizer``. The input columns are no
        longer modified, so re-running the map cell does not stack prefixes.
        NOTE(review): prompts used at inference time should be formatted the
        same way (same prefix) as the training text built here.
    """
    prefix_in = "Uporabnik: "
    prefix_out = ""  # e.g. "Asistent: " to mark the start of the reply explicitly
    max_length = 512

    # Terminate every reply with EOS so the model can learn where to stop.
    eos = tokenizer.eos_token or ""

    texts = [
        prefix_in + prompt + " " + prefix_out + reply + eos
        for prompt, reply in zip(examples["prompt"], examples["reply"])
    ]

    # padding="max_length" replaces the deprecated pad_to_max_length=True.
    # Plain Python lists are returned; Dataset.map stores them as-is.
    model_inputs = tokenizer(
        texts,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_attention_mask=True,
    )

    # Labels mirror the inputs; padded positions (attention_mask == 0) get -100.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

In [None]:
# Run the feature conversion over every split in batches. The on-disk cache is
# bypassed so that edits to convert_to_features always take effect.
train_data, val_data, test_data = (
    split.map(convert_to_features, batched=True, load_from_cache_file=False)
    for split in (train_data, val_data, test_data)
)

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (scaled to 0-100) and the mean predicted length.

    Args:
        eval_pred: ``(predictions, labels)`` pair as supplied by ``Trainer``.
            ``predictions`` may be raw logits of shape (batch, seq, vocab),
            a tuple whose first element is the logits, or token ids.
            ``labels`` are token ids with -100 at ignored positions.

    Returns:
        Dict mapping each ROUGE variant to its mid F-measure * 100, plus
        ``"gen_len"`` (mean number of non-padding predicted tokens), all
        rounded to 4 decimals.

    Notes:
        Relies on the module-level ``tokenizer``. ``nltk.sent_tokenize`` needs
        the NLTK "punkt" data to be available.
    """
    predictions, labels = eval_pred
    # Some models return (logits, extra outputs...); the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # The plain Trainer passes logits, which cannot be decoded directly:
    # reduce them to the most likely token id at every position.
    from_logits = predictions.ndim == 3
    if from_logits:
        predictions = predictions.argmax(axis=-1)

    # -100 cannot be decoded, so it is replaced by a padding id below.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    if from_logits and predictions.shape == labels.shape:
        # In a causal LM the output at position t predicts the token at t + 1:
        # align the two arrays, then blank predictions wherever the target is
        # ignored so padding noise does not leak into the decoded text.
        predictions = predictions[:, :-1]
        labels = labels[:, 1:]
        predictions = np.where(labels != -100, predictions, pad_id)

    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # `ds.metric` is a module without a `compute` function; the ROUGE metric
    # object has to be loaded first. Its aggregate scores expose `.mid.fmeasure`.
    rouge = ds.load_metric("rouge")
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, keep at
# most three checkpoints, and restore the best one when training finishes.
# NOTE(review): output_dir is relative to "../" while the data and the model
# loaded later use "../../" -- confirm this is the intended location.
training_args = TrainingArguments(
    output_dir="../models/gpt-ft-3",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    push_to_hub=False,
)

# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above (evaluation and
# checkpointing happen once per epoch, per training_args).
trainer.train()

In [None]:
# Persist the trainer's current model; with no argument it is written to
# training_args.output_dir ("../models/gpt-ft-3").
trainer.save_model()

In [None]:
# Free the training objects before loading the model used for inference.
# `trainer` holds its own references to the model and its training state, so
# deleting only `model` would keep those tensors alive and empty_cache() could
# not return their memory to the GPU.
del trainer
del tokenizer
del model
# Collect reference cycles first so the freed tensors are really released.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Decoder-only models continue from the last position of each row, so batched
# generation needs the padding on the left, not between prompt and new tokens.
tokenizer = AutoTokenizer.from_pretrained("cjvt/gpt-sl-base", padding_side="left")

# Batched tokenisation with padding=True requires a padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Forward slashes work on every platform; the previous backslash path relied on
# invalid escape sequences ("\.", "\m") and only resolved on Windows.
# NOTE(review): this loads "gpt-ft-2", not the "gpt-ft-3" directory written by
# the training cells above -- confirm which checkpoint is intended.
model = AutoModelForCausalLM.from_pretrained("../../models/gpt-ft-2")

In [None]:
# Load the first 50 rows of the generation test set. This rebinds `test_data`,
# replacing the tokenised test split created earlier in the notebook.
# NOTE(review): read with the default "," separator, unlike the ";"-separated
# training CSV -- confirm the file format.
test_data = pd.read_csv("../../data/gpt/test_2.csv").head(n = 50)

In [None]:
# Plain Python list of the prompt strings, in row order, for batch tokenisation.
sens = test_data["prompt"].tolist()

In [None]:
# Tokenise all prompts as a single padded batch of PyTorch tensors
# (padding=True pads every row to the longest prompt in the batch).
# NOTE(review): training prepends "Uporabnik: " to each prompt in
# convert_to_features; confirm the prompts in this CSV already carry it.
prompts = tokenizer(sens, return_tensors='pt', padding=True, truncation=True)

In [None]:
# Greedy decoding (do_sample=False) of at most 50 new tokens per prompt;
# gradients are not needed for inference.
with torch.no_grad():
    gen_tokens = model.generate(
        **prompts,
        do_sample=False,
        max_new_tokens=50,
    )
# NOTE(review): for a decoder-only model the returned sequences presumably
# contain the prompt followed by the continuation -- confirm that is wanted.
gen_text = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)

# Keep only letters, digits, spaces and basic punctuation. The pattern is now a
# raw string (no invalid escape sequences) and also whitelists the upper-case
# Slovenian letters; previously only lower-case "čšž" survived, so "Č", "Š" and
# "Ž" were silently stripped from the generated text.
gen_text = [re.sub(r'[^a-zA-Z0-9čšžČŠŽ\ \.!?,]+', '', text) for text in gen_text]

In [None]:
# Attach the cleaned generations as a new "generated" column (one entry per
# prompt row) and write the result next to the other GPT outputs.
test_data = test_data.assign(generated=gen_text)
test_data.to_csv("../../data/gpt_results/test_with_generated_2.csv", index=False)

In [None]:
# Spot check: show the prompt stored under index label 10.
test_data["prompt"][10]

In [None]:
# Spot check: show the generated text for the same row (index label 10).
test_data["generated"][10]