# **Preparation**

In [None]:
!pip install datasets sentencepiece accelerate bitsandbytes evaluate xformers
!pip install git+https://github.com/huggingface/transformers.git@main

In [None]:
!pip install wandb

import wandb
wandb.login()

%env WANDB_PROJECT=opisy

In [None]:
# Print the NVIDIA driver/GPU status for this runtime (sanity check before training).
!nvidia-smi

# **Fine-tuning**

In [None]:
import os

import torch
import transformers
from datasets import load_dataset
from torch import nn
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoTokenizer, AutoConfig, AutoModelForCausalLM

**Load model and tokenizer**

In [None]:
# Device used later in the notebook for inference-time model/tensor placement.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model = "ai-forever/mGPT"

# device_map='auto' already places the weights on the available device(s), so
# the model is not moved again with .to(): a second move is redundant on a
# single device and conflicts with the dispatch when weights are spread out.
# use_cache=False disables the generation KV cache, which is not used during
# training (the training arguments in this notebook enable gradient checkpointing).
model = AutoModelForCausalLM.from_pretrained(base_model, device_map='auto', use_cache=False)

tokenizer = AutoTokenizer.from_pretrained(base_model)
# Padding is required for batching; fall back to the EOS token when the
# tokenizer ships without a dedicated pad token.
if tokenizer.pad_token_id is None:
  tokenizer.pad_token_id = tokenizer.eos_token_id

Downloading (…)lve/main/config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.45G [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.89M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

**Pre-process data**

In [None]:
import wandb

# Fully-qualified name (entity/project/name:alias) of the dataset artifact.
DATASET_ARTIFACT = 'apollo-development/opisy/descriptions:latest'

# Start a W&B run and declare the dataset artifact as one of its inputs.
wandb.login()
run = wandb.init(project="opisy")
artifact = run.use_artifact(DATASET_ARTIFACT, type='dataset')

# Fetch the artifact files locally; the returned value is the download directory.
artifact_dir = artifact.download()

In [None]:
def generate(data_point):
    """Build the training prompt text for every row of a batched example.

    Args:
        data_point: mapping with parallel sequences under the keys
            'category', 'name' and 'description' (one entry per row).

    Returns:
        A dict with a single key 'text' holding one prompt string per row:
        the instruction line, the "category, name" line, a '###' separator
        and the description.
    """
    rows = zip(data_point['category'], data_point['name'], data_point['description'])
    prompts = []
    for category, name, description in rows:
        prompts.append(
            f"Jako copywriter opisz podany produkt\n\n{category}, {name}\n\n###\n\n{description}"
        )
    return {'text': prompts}

def tokenize(examples):
    """Tokenize the 'text' column of a batch, truncating to the model's context size.

    `truncation=True` alone has no effect here because the tokenizer reports no
    maximum length (the tokenizer then falls back to no truncation), so the
    limit is taken explicitly from the loaded model's configuration.

    Args:
        examples: batched mapping containing a 'text' list of prompt strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )

# Join the artifact directory and the file name with a proper path separator;
# plain string concatenation only works if the directory ends with one.
dataset = load_dataset("csv", data_files=os.path.join(artifact_dir, 'description.csv'))
dataset = dataset.map(generate, batched=True)

# Drop every pre-tokenization column (the raw CSV fields and the generated
# 'text') so that only the tokenizer outputs remain; leftover string columns
# cannot be turned into tensors by the collator when the DataLoaders below
# are iterated.
tokenized_dataset = dataset.map(
    tokenize, batched=True, remove_columns=dataset['train'].column_names
)
tokenized_dataset = tokenized_dataset.with_format("torch")
# Fixed seed keeps the 90/10 train/test split reproducible.
split_datasets = tokenized_dataset['train'].train_test_split(test_size=0.1, seed=42, shuffle=True)

# mlm=False selects causal-LM collation (labels are derived from the inputs).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
train_dataloader = DataLoader(
    split_datasets["train"], batch_size=16, shuffle=True, collate_fn=data_collator
)
# Evaluation does not benefit from shuffling; keep a deterministic order.
test_dataloader = DataLoader(
    split_datasets["test"], batch_size=16, shuffle=False, collate_fn=data_collator
)

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-6da952fc86a7d14c/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-6da952fc86a7d14c/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


**Fine-tuning run**

In [None]:
import transformers
from transformers import TrainingArguments
from datasets import load_metric

# Directory for checkpoints and the final saved model; the inference cell
# reloads the model from this path.
output_dir = "/mgpt"

training_args = TrainingArguments(
    #paths
    output_dir=output_dir,
    overwrite_output_dir=True,
    #optimization
    # Effective batch size per device = 4 * 16 = 64 sequences per optimizer step.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    #training params
    num_train_epochs=3,
    learning_rate=3e-4,
    optim="adamw_torch",
    fp16=True,
    # NOTE(review): if the training split is ~9k rows (as the captured output
    # suggests), 3 epochs at an effective batch of 64 is roughly 420 optimizer
    # steps, so a 500-step warmup would never complete — confirm this value.
    warmup_steps=500,
    #smart batching
    group_by_length=True,
    #eval
    # load_best_model_at_end requires matching evaluation and save strategies.
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    #wandb
    # A comma was missing after report_to, which made this cell a SyntaxError.
    report_to="wandb",
    # NOTE(review): a W&B run is already started earlier in this notebook, so
    # this name may not be applied to it — verify in the W&B UI.
    run_name="mgpt-1",
)

trainer = transformers.Trainer(
    model=model,
    train_dataset=split_datasets['train'],
    eval_dataset=split_datasets['test'],
    args=training_args,
    data_collator=data_collator,
)

# Always start from scratch instead of resuming from a checkpoint in output_dir.
result = trainer.train(resume_from_checkpoint=False)
# Persist the final weights so the inference cell can reload them from output_dir.
model.save_pretrained(output_dir)

# Close the W&B run so that metrics are flushed.
wandb.finish()

# **Inference**

In [None]:
def generate_text(sequence, max_length):
    """Sample a continuation of `sequence` from the fine-tuned model and print it.

    The model is reloaded from `output_dir` on every call.

    Args:
        sequence: prompt text (converted to a string before tokenization).
        max_length: maximum total length in tokens (prompt + continuation)
            passed to `generate`.

    Returns:
        None. The decoded text (prompt included, special tokens stripped) is
        printed.
    """
    inference_model = AutoModelForCausalLM.from_pretrained(output_dir)
    inference_model.to(device)
    inference_model.eval()

    # Tokenize via __call__ to also obtain the attention mask, and move the
    # tensors to the same device as the model instead of calling .cuda()
    # unconditionally (which fails on CPU-only runtimes).
    inputs = tokenizer(f'{sequence}', return_tensors='pt').to(device)

    final_outputs = inference_model.generate(
        inputs['input_ids'],
        # An explicit mask is needed because the pad token equals the EOS
        # token, so padding cannot be inferred from the ids alone.
        attention_mask=inputs['attention_mask'],
        do_sample=True,
        max_length=max_length,
        pad_token_id=inference_model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.2
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

# The prompt must match the template used to build the training data
# ("... opisz podany produkt"); the previous spelling "podaney" did not.
generate_text("Jako copywriter opisz podany produkt\n\nSmartfon, OnePlus Nord 5G 8GB/128GB Blue Marble\n\n###\n\n", 128)

Napisz reklamę jednego produktu.

<Nazwa>
Komputer Acer Legion

<Reklama>
Nie trać czasu! Kup teraz laptop ASUS, który idealnie nadaje się do pracy i rozrywki - procesor Intel Core oraz pamięć RAM o pojemności 64 GB sprawią że praca stanie sie czystą przyjemnością!. Idealny dla osób pracujących w biurze lub na uczelni czy też użytkowników sprzętu domowego użytku (np: telewizor). Dzięki zastosowaniu najnowszych technologii zyskasz dostęp online wszędzie tam gdzie potrzebujesz szybkiego dostępu z dowolnego miejsca dzięki czemu oszczędzisz cenny czas poświęcany przez komputer a Ty będziesz miał możliwość zaoszczędzenia sobie cennego wydatku np zakup nowego tabletu bądź smartfona(...). Zobacz
