In [1]:
!pip install transformers
!pip install sentencepiece
!pip install datasets



In [2]:
import torch
import os
import sentencepiece as sp
from transformers import AlbertTokenizer, AlbertTokenizerFast
from transformers import CamembertModel, CamembertConfig, CamembertForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline
import math

from transformers import (
                          CONFIG_MAPPING,
                          MODEL_FOR_MASKED_LM_MAPPING,
                          MODEL_FOR_CAUSAL_LM_MAPPING,
                          PreTrainedTokenizer,
                          TrainingArguments,
                          AutoConfig,
                          AutoTokenizer,
                          AutoModelWithLMHead,
                          AutoModelForCausalLM,
                          AutoModelForMaskedLM,
                          LineByLineTextDataset,
                          TextDataset,
                          DataCollatorForLanguageModeling,
                          DataCollatorForWholeWordMask,
                          DataCollatorForPermutationLanguageModeling,
                          PretrainedConfig,
                          Trainer,
                          set_seed,
                          )
from datasets import load_dataset
set_seed(42)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

Device: cuda:0


# Data

### Pretraining-data
NORWEGIAN: A small section of the OSCAR corpus (1.1GB roughly)

In [5]:
vocab_size = 20000
#nor_dataset_train_path = '/content/oscar_deduplicatd_no_part_3.txt'
nor_dataset_train_path = './content/no_bokmaal-ud-train.txt'
nor_dataset_eval_path = './content/no_bokmaal-ud-dev.txt'
tokenizer_path = 'norwegian_model/tokenizer'
model_path = 'norwegian_model/model'

if not os.path.exists('norwegian_model'):
    os.makedirs('norwegian_model')
    os.makedirs('norwegian_model/tokenizer')
    os.makedirs('norwegian_model/model')

In [6]:
prefix = 'nor_spiece'
sp.SentencePieceTrainer.train(input=nor_dataset_train_path, model_prefix=prefix, vocab_size=vocab_size)
os.rename(prefix+'.model',tokenizer_path+'/spiece.model')
os.rename(prefix+'.vocab',tokenizer_path+'/spiece.vocab')
#nor_dataset = load_dataset('text', data_files={'train': nor_dataset_train_path})


FileExistsError: [WinError 183] Kan ikke opprette en fil når filen allerede finnes: 'nor_spiece.model' -> 'norwegian_model/tokenizer/spiece.model'

In [7]:
norwegian_tokenizer = AlbertTokenizer.from_pretrained(tokenizer_path)
norwegian_tokenizer.save_pretrained(tokenizer_path)


('norwegian_model/tokenizer\\tokenizer_config.json',
 'norwegian_model/tokenizer\\special_tokens_map.json',
 'norwegian_model/tokenizer\\spiece.model',
 'norwegian_model/tokenizer\\added_tokens.json')

In [8]:
batch_size = 32

configuration = CamembertConfig(
    hidden_size = 512,
    num_attention_heads = 8,
    num_hidden_layers = 10,
    max_position_embeddings = 256,
    vocab_size = vocab_size,
)

training_args = TrainingArguments(
    output_dir=model_path+'/output',
    overwrite_output_dir=True,
    do_train=True, 
    do_eval=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    logging_steps=500,
    prediction_loss_only=True,
    learning_rate = 5e-5,
    num_train_epochs = 5,
    save_steps = 20000/batch_size,
)


In [9]:
model = CamembertForMaskedLM(configuration).to(device)
print(model.config)
print("Num parameters", model.num_parameters())

norwegian_tokenizer = AlbertTokenizerFast.from_pretrained(tokenizer_path)

# Resize model to fit all tokens in tokenizer.
model.resize_token_embeddings(len(norwegian_tokenizer))

CamembertConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 256,
  "model_type": "camembert",
  "num_attention_heads": 8,
  "num_hidden_layers": 10,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 20000
}

Num parameters 52676640


Embedding(20004, 512)

In [10]:
train_dataset = LineByLineTextDataset(
    tokenizer=norwegian_tokenizer,
    file_path=nor_dataset_train_path,
    block_size=256,
)
eval_dataset = LineByLineTextDataset(
    tokenizer=norwegian_tokenizer,
    file_path=nor_dataset_eval_path,
    block_size=256,
)
data_collator = DataCollatorForWholeWordMask(
    tokenizer=norwegian_tokenizer, 
    mlm_probability=0.15,
)



In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

In [12]:
trainer.train()
trainer.save_model()

***** Running training *****
  Num examples = 17820
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2785


Epoch,Training Loss,Validation Loss
1,7.44,6.973102
2,6.8308,6.806986
3,6.6911,6.686832
4,6.6057,6.681787
5,6.5403,6.662507


***** Running Evaluation *****
  Num examples = 2621
  Batch size = 32
Saving model checkpoint to norwegian_model/model/output\checkpoint-625
Configuration saved in norwegian_model/model/output\checkpoint-625\config.json
Model weights saved in norwegian_model/model/output\checkpoint-625\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2621
  Batch size = 32
Saving model checkpoint to norwegian_model/model/output\checkpoint-1250
Configuration saved in norwegian_model/model/output\checkpoint-1250\config.json
Model weights saved in norwegian_model/model/output\checkpoint-1250\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2621
  Batch size = 32
Saving model checkpoint to norwegian_model/model/output\checkpoint-1875
Configuration saved in norwegian_model/model/output\checkpoint-1875\config.json
Model weights saved in norwegian_model/model/output\checkpoint-1875\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2621
  Batch size = 32
Savin

In [13]:
eval_output = trainer.evaluate()
# compute perplexity from model loss.

perplexity = math.exp(eval_output["eval_loss"])
print('\nEvaluate Perplexity: {:10,.2f}'.format(perplexity))

***** Running Evaluation *****
  Num examples = 2621
  Batch size = 32



Evaluate Perplexity:     745.06
