<a href="https://colab.research.google.com/github/TurkuNLP/Deep_Learning_in_LangTech_course/blob/master/hf_trainer_mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget -nc https://raw.githubusercontent.com/TurkuNLP/sentiment-target-corpus/main/sentiment-target-fi.tsv
!pip3 install transformers datasets

# Prep data to a suitable format

* You really only need to do this once
* Make a *json lines* file with one json-encoded example per line
* Each example has the `text` and the `label` as an integer
* We have four different labels in this particular data

In [2]:
import re
import json
import random

label_names=["positive","negative","reject","neither"]
data=[]
with open("sentiment-target-fi.tsv") as f:
    for line in f:
        line=line.rstrip("\n")
        if not line or line.startswith("#"): #skip empty and comments
            continue
        cols=line.split("\t")
        if len(cols)!=5: #skip weird lines that don't have the right number of columns
            continue
        data.append(cols)
random.shuffle(data) #shake well
with open("sentiment-data.jsonl","wt") as f: #write out as jsonl
    for cols in data:
        txt=cols[1]
        item={"label":label_names.index(cols[2]),"text":cols[1]} #note here we translate from label strings to integers
        print(json.dumps(item,ensure_ascii=False,sort_keys=True),file=f)

#One line looks like this:
# {"label": 0, "text": "En tiedä mitä kuvanvalmistamoa käytät, mutta ainakin <TARGET>Fotoyksillä</TARGET> onnistuu helposti."}


# Datasets

Every popular framework has its own preferred idea of how to represent data. Let us look into the Hugging Face datasets which is very popular, so it makes sense to be acquainted with it.



In [2]:
import datasets

fname="sentiment-data.jsonl"
dset=datasets.load_dataset('json',                             # Format of the data
                           data_files={"everything":fname},    # All data files, here we only have one
                           split={"train":"everything[:80%]",  # First 80% is the train set
                                  "validation":"everything[80%:90%]",   # Next 10% is the validation/dev set
                                  "test":"everything[90%:]"},           # last 10% is the test set
                           features=datasets.Features({ #And here we tell how to interpret the data attributes
                               "label":datasets.ClassLabel(names=["positive","negative","neither","reject"]),
                               "text":datasets.Value("string")})
                           )                           


Using custom data configuration default-54a75f354cd815b3


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-54a75f354cd815b3/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-54a75f354cd815b3/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

# Tokenize and translate into integers

* One can use a pre-existing tokenizer
* It will, by default, produce `input_ids` which translates text tokens to integers
               

In [9]:
import transformers
tokenizer=transformers.AutoTokenizer.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1")

tokenized=tokenizer("Minulla on simpukkakoira",add_special_tokens=False) #nevermind special tokens, their time will come :)
print(tokenized)

#nevermind token_type_ids and attention_mask, their time will come :)
#

print(tokenizer.convert_ids_to_tokens(tokenized["input_ids"]))

{'input_ids': [3668, 145, 22966, 1233, 16323], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}
['Minulla', 'on', 'simp', '##ukka', '##koira']


In [10]:
# Apply the tokenizer to the whole dataset

dset=dset.map(lambda x: tokenizer(x["text"],add_special_tokens=False))

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [12]:
print(dset["train"][0])

{'label': 0, 'text': 'En tiedä mitä kuvanvalmistamoa käytät, mutta ainakin <TARGET>Fotoyksillä</TARGET> onnistuu helposti.', 'input_ids': [555, 1632, 382, 4053, 13615, 23229, 16818, 119, 304, 1120, 5571, 16307, 50051, 50073, 12355, 2377, 9825, 178, 18406, 5571, 499, 16307, 50051, 50073, 12355, 2377, 6500, 2698, 111], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


# Input encoding

* The simplest way is to set every input that is present to 1, rest at 0
* So e.g. if our input has vocab size of 5 and tokens `[0,3]` present, we would like to get `[1,0,0,1,0]` on the input
* The simple code below does just that:

In [41]:
import torch
# These are the ids which we want to set to 1
input_ids=torch.tensor([[0,0,1],[0,2,3]])
# These are the 1s we will be copying over
ones=torch.ones_like(input_ids,dtype=torch.float)
# This is the target, initialized to zeros
zeros=torch.zeros((2,5))
# Scatter says: 
#   work on dimension 1
#   `input_ids` are the indices to set
#   `ones` are the values to set
zeros=zeros.scatter(1,input_ids,ones)
print(zeros)
# see how in the first row indices 0 and 1 are set to 1
# and in the second row indices 0,2,3 are set to 1
# exactly as it was supposed to be!

tensor([[1., 1., 0., 0., 0.],
        [1., 0., 1., 1., 0.]])


# Build the model

* Model in its simplest form has `__init__()` which instantiates the layers and `forward()` which implements the actual computation

In [None]:
import torch

# A model wants a config, I can simply inherit from the base
# class for pretrained configs
class MLPConfig(transformers.PretrainedConfig):
    pass

# This is the model
class MLP(transformers.PreTrainedModel):

    # In the initialization method, one instantiates the layers
    # these will be the parameters of the model
    def __init__(self,config):
        super().__init__(config)
        self.vocab_size=config.vocab_size
        # Hidden layer: input size x hidden size
        self.hidden=torch.nn.Linear(in_features=self.vocab_size,out_features=config.hidden_size)
        # Output layer: hidden size x output size
        self.output=torch.nn.Linear(in_features=config.hidden_size,out_features=config.nlabels)
        
    # The computation of the model is put into the forward() function
    # it receives a batch of data and optionally the correct `labels`
    #
    # If given `labels` it returns (loss,output)
    # if not, then it returns (output,)
    def forward(self,input_ids,labels=None,**kwargs):
        # The batch is in input_ids
        batch_size=input_ids.shape[0] #this is how many examples we have
        # The following block converts the input ids into a suitable input for
        # the input layer
        input=torch.zeros((batch_size,self.vocab_size),dtype=torch.float,device=input_ids.device)
        ones=torch.ones_like(input_ids,dtype=torch.float)
        input.scatter_(1,input_ids,ones)
        #print(input.sum(-1))
        #print(input)
        projected=torch.tanh(self.hidden(input))
        logits=self.output(projected)
        #print(logits)
        if labels is not None:
            loss=torch.nn.CrossEntropyLoss()
            return (loss(logits,labels),logits)
        else:
            return (logits,)

# Configure the model:
#   these parameters are used in the model's __init__()
mlp_config=MLPConfig(vocab_size=tokenizer.vocab_size,hidden_size=10,nlabels=4)

# Instantiate the model  
mlp=MLP(mlp_config)


# Model training

* Hugging Face trainer
  * Loads of arguments that control the training
  * data collator builds the batches
  * early stopping callback stops when eval loss no longer improves
  

In [15]:
trainer_args=transformers.TrainingArguments("xxx",
                                            evaluation_strategy="steps",
                                            logging_strategy="steps",
                                            eval_steps=100,
                                            logging_steps=100,
                                            learning_rate=5e-4,
                                            max_steps=10000,
                                            load_best_model_at_end=True)
data_collator=transformers.DataCollatorWithPadding(tokenizer)
early_stopping=transformers.EarlyStoppingCallback(5) #5 steps worth of patience before early stopping
trainer=transformers.Trainer(model=mlp,
                             args=trainer_args,
                             train_dataset=dset["train"],
                             eval_dataset=dset["validation"],
                             data_collator=data_collator,
                             callbacks=[early_stopping])
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
max_steps is given, it will override any value given in num_train_epochs
The following columns in the training set  don't have a corresponding argument in `MLP.forward` and have been ignored: text, token_type_ids, attention_mask. If text, token_type_ids, attention_mask are not expected by `MLP.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1864
  Num Epochs = 43
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10000


Step,Training Loss,Validation Loss
100,0.9457,0.985943
200,0.8812,0.960062
300,0.7921,0.941356
400,0.7369,0.92833
500,0.7417,0.92494
600,0.6526,0.914526
700,0.5936,0.9051
800,0.4987,0.907102
900,0.4986,0.915164
1000,0.4207,0.920289


The following columns in the evaluation set  don't have a corresponding argument in `MLP.forward` and have been ignored: text, token_type_ids, attention_mask. If text, token_type_ids, attention_mask are not expected by `MLP.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 233
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `MLP.forward` and have been ignored: text, token_type_ids, attention_mask. If text, token_type_ids, attention_mask are not expected by `MLP.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 233
  Batch size = 8
The following columns in the evaluation set  don't have a corresponding argument in `MLP.forward` and have been ignored: text, token_type_ids, attention_mask. If text, token_type_ids, attention_mask are not expected by `MLP.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2

TrainOutput(global_step=1600, training_loss=0.5424937748908997, metrics={'train_runtime': 34.9112, 'train_samples_per_second': 2291.53, 'train_steps_per_second': 286.441, 'total_flos': 3530974662336.0, 'train_loss': 0.5424937748908997, 'epoch': 6.87})

In [16]:
p=trainer.predict(dset["test"])


The following columns in the test set  don't have a corresponding argument in `MLP.forward` and have been ignored: text, token_type_ids, attention_mask. If text, token_type_ids, attention_mask are not expected by `MLP.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 233
  Batch size = 8


In [18]:
print(p.predictions)

[[ 1.04914296e+00  5.72185159e-01 -1.91130626e+00  5.11026263e-01]
 [ 9.78207827e-01  1.43302119e+00 -2.65995216e+00  1.97828934e-01]
 [ 5.31775296e-01  5.72758913e-01 -1.70715511e+00  8.05471420e-01]
 [ 1.14321506e+00  8.61916065e-01 -2.04135013e+00  2.33791605e-01]
 [ 1.22191906e+00  5.57776093e-01 -1.86938167e+00  3.29600751e-01]
 [ 2.50705147e+00 -1.94108486e-02 -2.24513102e+00  8.73181671e-02]
 [ 1.05232072e+00  5.06227016e-01 -1.75907099e+00  4.51166511e-01]
 [ 4.51592386e-01  9.32101369e-01 -1.81697226e+00  5.79419374e-01]
 [-9.62725639e-01  2.27181244e+00 -1.90531743e+00  7.14016080e-01]
 [ 1.90168309e+00  2.26994038e-01 -1.87090278e+00  1.43071696e-01]
 [ 1.41481280e+00  5.27270436e-01 -1.96029711e+00  3.15003723e-01]
 [ 5.73736668e-01  8.75775576e-01 -1.75681579e+00  5.11473536e-01]
 [ 9.03503895e-01  7.41652608e-01 -1.72133577e+00  3.15692782e-01]
 [ 1.15450799e+00  7.70880580e-01 -1.94459653e+00  2.53675342e-01]
 [ 5.81374168e-02  1.87212539e+00 -2.09299469e+00  2.71690309e

In [26]:
predictions=p.predictions.argmax(-1)
print("Predicted",predictions)
print(p.label_ids)
print(sum(p.label_ids==predictions)/len(predictions))


Predicted [0 1 3 0 0 0 0 1 1 0 0 1 0 0 1 0 0 1 3 0 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1 0 3 0 0 1 1 1 1 1 0 3
 1 1 0 1 1 1 1 0 0 1 0 3 0 1 1 0 1 1 1 0 1 1 1 0 1 0 1 3 0 1 0 3 1 0 1 1 0
 0 0 0 0 0 0 0 0 0 1 0 3 1 0 3 1 1 0 0 1 1 3 0 1 0 1 1 1 1 0 0 0 1 1 0 0 3
 0 1 1 0 1 3 1 1 1 1 1 3 0 1 0 1 0 0 1 1 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 1 0
 3 0 0 0 1 1 1 0 3 1 0 0 0 1 0 1 0 0 1 1 1 0 0 0 0 0 1 1 1 0 1 0 0 1 1 1 1
 0 0 1 3 0 1 1 1 1 1 1]
[0 1 1 1 0 1 0 3 1 0 0 1 1 1 1 0 0 1 1 3 0 1 3 0 3 0 0 1 1 3 1 3 3 1 0 0 0
 1 0 3 0 1 1 0 1 0 0 0 1 3 3 0 3 0 0 0 1 1 0 0 0 3 0 0 1 0 1 1 1 1 3 1 3 0
 1 1 1 1 3 1 1 0 0 3 0 3 0 0 1 1 3 1 1 3 1 3 1 0 1 0 1 1 0 1 0 0 1 0 3 1 0
 0 1 0 0 0 1 0 0 3 3 0 3 3 0 3 1 0 0 1 3 3 3 0 0 0 0 0 1 0 1 0 0 0 3 0 0 0
 1 1 1 3 1 1 1 1 1 1 1 0 0 3 0 1 1 1 1 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 0 1 3
 3 3 0 0 0 1 0 0 3 3 0 0 1 1 0 1 0 3 1 0 0 0 1 0 0 1 1 0 1 0 0 3 0 1 3 3 0
 0 0 0 3 0 0 3 1 3 0 0]
0.5836909871244635
