# **Preparation**

In [None]:
!pip install datasets sentencepiece accelerate bitsandbytes evaluate xformers deepspeed
!pip install git+https://github.com/huggingface/transformers.git@main

In [None]:
# Print the NVIDIA driver/GPU status table to check which GPU this runtime has.
!nvidia-smi

# **Fine-tuning**

In [None]:
!pip install wandb

import wandb
wandb.login()
wandb.init(project="opisy")

%env WANDB_PROJECT=develop

In [None]:
import wandb

wandb.login()
# Open a run in the "opisy" project; the artifact below is requested through it.
run = wandb.init(project="opisy")
# Reference the latest version of the `descriptions` dataset artifact.
artifact = run.use_artifact('sovern-development/opisy/descriptions:latest', type='dataset')
# Download the artifact's files; `artifact_dir` is the value returned by download().
# NOTE(review): no later cell in view reads `artifact_dir` — the CSV is loaded
# from a hard-coded /content path instead; confirm the file ends up there.
artifact_dir = artifact.download()

In [None]:
import torch
import transformers
from torch import nn
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoTokenizer, AutoConfig, AutoModelForCausalLM
from datasets import load_dataset

**Load model and tokenizer**

In [None]:
# Prefer the GPU when one is available; `device` is reused by the inference cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model = "sdadas/polish-gpt2-small"  # Use small (176M params) model for testing on colab

# `device_map='auto'` already places the weights on the available device(s).
# The model is therefore NOT moved again with `.to(device)`: that call is
# redundant on a single device and conflicts with the automatic placement
# when the weights are spread over several devices or offloaded.
# `use_cache=False` is forwarded to the model config (presumably because the
# training cell enables gradient checkpointing — confirm).
model = AutoModelForCausalLM.from_pretrained(base_model, device_map='auto', use_cache=False)

tokenizer = AutoTokenizer.from_pretrained(base_model)
# Padding needs a pad token; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

**Pre-process data**

In [None]:
def generate(data_point):
    """Build one training prompt per row of a batched example.

    Args:
        data_point: Mapping with parallel sequences under the keys
            'name', 'category' and 'description'.

    Returns:
        A dict with a single key 'text' holding the list of formatted
        prompts, one per row, each terminated with the literal "[END]".
    """
    columns = (data_point['name'], data_point['category'], data_point['description'])

    prompts = []
    for name, category, description in zip(*columns):
        prompts.append(f"Produkt:\n{name}, {category}\n\n###\n\n{description}[END]")
    return {'text': prompts}

def tokenize(examples):
    """Tokenize a batch of prompts, truncating each to at most 2048 tokens.

    No padding is applied here. Padding every example to 2048 tokens makes
    all sequences the same length, which wastes compute on short prompts and
    leaves nothing for length-based batching (`group_by_length=True` in the
    training arguments) to group; the language-modeling data collator pads
    each batch to its own longest sequence instead.

    Args:
        examples: Batched mapping containing the prompts under 'text'.

    Returns:
        The tokenizer's output for the batch (variable-length sequences).
    """
    return tokenizer(examples['text'], truncation=True, max_length=2048)

# Read the raw CSV and add the formatted prompt column ('text') to every row.
dataset = load_dataset("csv", data_files='/content/descriptions.csv').map(generate, batched=True)

# Tokenize the prompts, drop the raw CSV columns together with the intermediate
# 'text' column, and expose the result as torch tensors.
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=['name', 'category', 'description', 'code', 'brand', 'text'],
).with_format("torch")

# Fixed seed so the 90/10 train/test split is the same on every run.
split_datasets = tokenized_dataset['train'].train_test_split(test_size=0.1, seed=42, shuffle=True)

# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
# train_dataloader = DataLoader(
#     split_datasets["train"], batch_size=16, shuffle=True, collate_fn=data_collator
# )
# test_dataloader = DataLoader(
#     split_datasets["test"], batch_size=16, shuffle=True, collate_fn=data_collator
# )


**Fine-tuning**

In [None]:
import inspect
import os
import transformers
from transformers import TrainingArguments

# `load_metric` no longer exists in recent `datasets` releases, and the install
# cell does not pin a version, so an unconditional import can abort this cell.
# Nothing below uses it; the name is kept available for older installs only.
try:
    from datasets import load_metric
except ImportError:
    load_metric = None

output_dir = "/content/portia-ai"

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Pick whichever name the
# installed version accepts so the cell runs on both.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    # paths
    output_dir=output_dir,
    overwrite_output_dir=True,
    # optimization: 4 samples per device x 16 accumulation steps per update
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    # training params
    num_train_epochs=3,
    # NOTE(review): 5e-3 is far above commonly used fine-tuning rates — confirm.
    learning_rate=5e-3,
    optim="adamw_torch",
    fp16=True,
    warmup_steps=400,
    # smart batching
    # NOTE(review): only effective if examples are not all padded to the same
    # length at tokenization time — confirm against the tokenize step.
    group_by_length=True,
    # eval
    # load_best_model_at_end=True,
    **{eval_strategy_arg: "steps"},
    eval_steps=200,
    # save_strategy="steps",
    # save_steps=1000,
    # wandb
    report_to="wandb",
    run_name="develop",
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=split_datasets['train'],
    eval_dataset=split_datasets['test'],
    args=training_args,
    data_collator=data_collator,
)

# Always start from the current weights rather than a checkpoint in output_dir.
result = trainer.train(resume_from_checkpoint=False)
# Persist the fine-tuned weights; the inference cell reloads them from this path.
model.save_pretrained(output_dir)

# Close the active W&B run.
wandb.finish()

In [None]:
from google.colab import drive
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): no cell in view reads from or writes to /content/drive —
# presumably used to copy the saved model manually; confirm.
drive.mount('/content/drive')

# **Inference**

In [None]:
# Directory the fine-tuned model is loaded from (same path the training cell saves to).
model_path = "/content/portia-ai"

def generate_text(sequence, max_length):
    """Sample a continuation of `sequence` from the fine-tuned model and print it.

    Args:
        sequence: Prompt text to continue.
        max_length: Value passed to `generate` as `max_length`.

    Returns:
        None. The decoded text is printed.
    """
    # The checkpoint is reloaded from disk on every call.
    inference_model = AutoModelForCausalLM.from_pretrained(model_path)
    inference_model.to(device)
    inference_model.eval()

    # Put the prompt on the same device as the model. An unconditional
    # `.cuda()` would crash on a CPU-only runtime even though `device`
    # falls back to CPU.
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt').to(device)

    final_outputs = inference_model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        # Reuse EOS as the pad token id for generation.
        pad_token_id=inference_model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.2
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

# NOTE(review): this prompt does not follow the
# "Produkt:\n<name>, <category>\n\n###\n\n" template the training examples are
# built with — confirm the mismatch is intended.
generate_text("Napisz reklamę jednego produktu.\n\n<Nazwa>\nLaptop Acer\n\n<Reklama>\n", 2048)