# Generating text and Fine-tuning
* https://imsdb.com/

In [None]:
!pip install transformers

In [2]:
from transformers import GPT2LMHeadModel, AutoTokenizer, Trainer, TrainingArguments, LineByLineTextDataset, DataCollatorForLanguageModeling
import argparse
import os
from pathlib import Path

def load_model_and_tokenizer(pre_trained):
    """Load a GPT-2 language model together with its tokenizer.

    Args:
        pre_trained: Checkpoint identifier or path, forwarded unchanged to
            both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(pre_trained)
    # Reuse the end-of-sequence token as the padding token.
    tok.pad_token = tok.eos_token

    lm = GPT2LMHeadModel.from_pretrained(pre_trained)
    return lm, tok

def prepare_text(sentence, tokenizer):
    """Tokenize ``sentence`` and return the encoding as PyTorch tensors.

    Args:
        sentence: Text to encode.
        tokenizer: Callable tokenizer; invoked with the text and
            ``return_tensors='pt'``.

    Returns:
        Whatever the tokenizer returns for that call.
    """
    encoded = tokenizer(sentence, return_tensors='pt')
    return encoded

## Generating Text

In [None]:
# Load the 'pierreguillou/gpt2-small-portuguese' checkpoint and its tokenizer
# for the text-generation demo.
model, tokenizer = load_model_and_tokenizer(
    pre_trained='pierreguillou/gpt2-small-portuguese',
)

In [None]:
# Prompt to continue; prepare_text() turns it into PyTorch tensors that are
# unpacked as keyword arguments for generate().
sentence = 'Opa, eu gostaria de comprar'
sentence = prepare_text(sentence, tokenizer)
output = model.generate(
    **sentence,
    do_sample=True,
    max_length=150,
    early_stopping=True,
    temperature=0.5,
    # generate() names the beam-count option `num_beams`; `n_beams` is not a
    # recognized generation parameter, so beam search was never configured.
    num_beams=5,
    no_repeat_ngram_size=2,
)
# Decode the first returned sequence back to text (last expression, so the
# notebook displays it).
tokenizer.decode(output[0], skip_special_tokens=True)

## Fine-tuning!

In [5]:
# Fine-tuning configuration, read by the training cell that follows.
language = 'pt'  # 'pt' selects the Portuguese checkpoint; any other value (e.g. 'en') selects 'gpt2'
train_file = Path('./biblia.txt')  # text file the training dataset is built from
dir = Path('cache')  # folder for checkpoints and the saved model
epochs = 1  # number of training epochs
bsz = 32  # per-device batch size (train and eval)

In [None]:
# Folder that receives training checkpoints and the final fine-tuned model.
SAVE_FOLDER = Path(dir)
# Create the folder (and any missing parents) in one step; exist_ok avoids the
# check-then-create race of a separate existence test followed by mkdir.
SAVE_FOLDER.mkdir(parents=True, exist_ok=True)

print('Loading tokenizer and model')
# 'pt' picks the Portuguese checkpoint; every other value falls back to 'gpt2'.
if language == 'pt':
    model, tokenizer = load_model_and_tokenizer('pierreguillou/gpt2-small-portuguese')
else:
    model, tokenizer = load_model_and_tokenizer('gpt2')
print('Loaded')

print('Building Datasets')

# Dataset built from the training file with block_size=128.
# The path is passed as str, the type the transformers API documents.
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=str(train_file),
    block_size=128,
)

# mlm=False: collate for causal language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

print('Define arguments')
training_args = TrainingArguments(
    output_dir=str(SAVE_FOLDER),  # documented as a str path
    overwrite_output_dir=True,
    num_train_epochs=epochs,
    per_device_train_batch_size=bsz,
    per_device_eval_batch_size=bsz,
    logging_steps=100,
)
print('Create Trainer')
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

# Write the fine-tuned model into SAVE_FOLDER so it can be reloaded with
# from_pretrained().
trainer.save_model(str(SAVE_FOLDER))

In [None]:
# Reload the model from the 'cache' folder, the directory the fine-tuning cell
# saves to with trainer.save_model().
model = GPT2LMHeadModel.from_pretrained('cache')

In [None]:
# Prompt to continue with the reloaded model; swap in the English prompt
# below when an English checkpoint was fine-tuned.
sentence = 'Opa, eu gostaria de comprar'
#sentence = 'Hello, how are we gonna '
# prepare_text() returns PyTorch tensors that are unpacked as keyword
# arguments for generate().
sentence = prepare_text(sentence, tokenizer)
output = model.generate(
    **sentence,
    do_sample=True,
    max_length=150,
    early_stopping=True,
    temperature=0.5,
    # generate() names the beam-count option `num_beams`; `n_beams` is not a
    # recognized generation parameter, so beam search was never configured.
    num_beams=5,
    no_repeat_ngram_size=2,
)
# Decode the first returned sequence back to text (last expression, so the
# notebook displays it).
tokenizer.decode(output[0], skip_special_tokens=True)