<a href="https://colab.research.google.com/github/TurkuNLP/Deep_Learning_in_LangTech_course/blob/master/ex4_parameters.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [6]:
!pip3 install -q transformers datasets evaluate accelerate

In [7]:
from pprint import pprint
import logging

# Suppress every log record of severity INFO and below, for all loggers
logging.disable(level=logging.INFO)

---
# Download and prepare data

In [8]:
import datasets

# Fixed seed, so that the shuffled order is the same on every run
SHUFFLE_SEED = 42

dataset = datasets.load_dataset('imdb')
# Delete the unlabeled part of the dataset first to make things faster
# (this way it is not shuffled either)
del dataset["unsupervised"]
# This is never a bad idea, datasets may have ordering to them, which is not what we want
dataset = dataset.shuffle(seed=SHUFFLE_SEED)

---

# Tokenize and vectorize data

In [9]:
import transformers

model_name = "bert-base-cased"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# Define a simple function that applies the tokenizer
def tokenize(example):
    """Apply the tokenizer to the "text" field, truncating to at most 128 tokens.

    ``example["text"]`` may be a single string (one example) or a list of
    strings (a batch, as passed by ``.map(..., batched=True)``); it is handed
    to the tokenizer unchanged.
    """
    return tokenizer(
        example["text"],
        max_length=128,
        truncation=True,
    )

# Apply the tokenizer to the whole dataset using .map()
# batched=True passes many texts to the tokenizer per call instead of one at a time,
# which gives the same result considerably faster
dataset = dataset.map(tokenize, batched=True)



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

---

# Define model

(Note that here we define the model structure and computation without setting any parameters yet!)

In [10]:
import torch


# This gives a new name to the config class, just for convenience
BasicConfig = transformers.PretrainedConfig


# This is the model
class SimpleCNN(transformers.PreTrainedModel):
    """A small convolutional text classifier.

    Computation: embedding -> 1D convolution -> ReLU -> global max pooling
    -> linear output layer.

    The config must provide ``vocab_size``, ``embedding_dim``, ``num_filters``,
    ``filter_size`` and ``num_labels``.
    """

    config_class = BasicConfig

    # In the initialization method, one instantiates the layers
    # these will be the parameters of the model
    def __init__(self, config):
        super().__init__(config)
        # Embedding layer: vocab size x embedding dim
        self.embeddings = torch.nn.Embedding(
            num_embeddings=config.vocab_size,
            embedding_dim=config.embedding_dim
        )
        # Convolution layer: embedding dim -> num filters, window of filter_size tokens.
        # padding="same" keeps the output as long as the input for any filter size
        # (for filter_size=3 this is exactly the previous padding=1), so the
        # convolution also works on inputs shorter than the filter.
        self.convolution = torch.nn.Conv1d(
            in_channels=config.embedding_dim,
            out_channels=config.num_filters,
            kernel_size=config.filter_size,
            padding="same"
        )
        # Activation function following convolution
        self.activation = torch.nn.ReLU()
        # Pooling layer: global max pooling, regardless of input length
        self.pooling_layer = torch.nn.AdaptiveMaxPool1d(
            output_size=1
        )
        # Output layer: num filters to output size
        self.output_layer = torch.nn.Linear(
            in_features=config.num_filters,
            out_features=config.num_labels
        )
        # Loss function: standard loss for classification
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, labels=None, attention_mask=None):
        """Run the model on a batch.

        Args:
            input_ids: token ids, shape [batch_size, maxlen].
            labels: optional gold labels; when given, the loss is computed too.
            attention_mask: accepted so the collated batch can be passed in as is.
                NOTE(review): it is not used below, so padded positions take part
                in the convolution and the pooling.

        Returns:
            ``(loss, output)`` if ``labels`` is given, otherwise ``(output,)``.
        """
        #shape of input: [batch_size, maxlen]
        x = self.embeddings(input_ids)
        #shape of x: [batch_size, maxlen, embedding_dim]
        # Conv1d expects the channel (embedding) dimension before the sequence dimension
        x = x.permute((0,2,1))
        #shape of x: [batch_size, embedding_dim, maxlen]
        x = self.convolution(x)
        #shape of x: [batch_size, filters, maxlen]
        x = self.activation(x)
        #shape of x: [batch_size, filters, maxlen]
        x = self.pooling_layer(x)
        #shape of x: [batch_size, filters, 1]
        x = x.flatten(start_dim=1)
        #shape of x: [batch_size, filters]
        output = self.output_layer(x)

        # Return value computed as in the MLP:
        if labels is not None:
            # We have labels, so we can calculate the loss
            return (self.loss(output,labels), output)
        else:
            # No labels, so just return the output
            return (output,)

---
# Define training support

(Collator, evaluation, Callbacks)

In [11]:
import evaluate

# Evaluation: the accuracy metric, loaded through the evaluate library
accuracy = evaluate.load("accuracy")

def compute_accuracy(outputs_and_labels):
    """Compute the accuracy for a (model outputs, gold labels) pair."""
    logits, gold_labels = outputs_and_labels
    # The predicted label is the index with the highest score on the last axis
    predicted = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predicted, references=gold_labels)

# Collator: built from the tokenizer defined earlier
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)

# Callbacks / logging
from collections import defaultdict

class LogSavingCallback(transformers.TrainerCallback):
    """Trainer callback that keeps the values logged during training.

    ``self.logs`` maps every logged key to the list of values logged for it,
    in logging order. Values are recorded only between ``on_train_begin`` and
    ``on_train_end``.
    """

    def __init__(self):
        super().__init__()
        # Initialize the state here as well, so that on_log (or reading .logs)
        # does not raise AttributeError when nothing has been trained yet
        self.logs = defaultdict(list)
        self.training = False

    def on_train_begin(self, *args, **kwargs):
        # Every training run starts with empty logs
        self.logs = defaultdict(list)
        self.training = True

    def on_train_end(self, *args, **kwargs):
        self.training = False

    def on_log(self, args, state, control, logs=None, model=None, **kwargs):
        # Nothing to record outside of training, or when no values were logged
        if not self.training or not logs:
            return
        for k, v in logs.items():
            # An "epoch" value that was already recorded is not stored again;
            # every other key keeps all of its values
            if k != "epoch" or v not in self.logs[k]:
                self.logs[k].append(v)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

---
# Hyperparameter search – First option

* Manual grid search over the learning rate with a plain `for` loop

In [12]:
# The number of labels does not depend on the learning rate, so compute it once
num_labels = len(set(dataset['train']['label']))

# Train one freshly initialized model per learning rate and report its accuracy
for lr in [0.000005, 0.00005, 0.0005, 0.005, 0.05, 0.5]:

    # create the model
    config = BasicConfig(
        vocab_size = tokenizer.vocab_size,
        num_labels = num_labels,
        embedding_dim = 64,
        filter_size = 3,
        num_filters = 10,
    )

    model = SimpleCNN(config)

    # Set training arguments
    # (eval_strategy is the current name of the argument formerly called
    # evaluation_strategy, which is deprecated since transformers 4.41)
    trainer_args = transformers.TrainingArguments(
        "checkpoints",
        eval_strategy="steps",
        logging_strategy="steps",
        load_best_model_at_end=True,
        eval_steps=500,
        logging_steps=500,
        learning_rate=lr, # <--- parameter goes here
        per_device_train_batch_size=8,
        max_steps=2500,
    )

    # NOTE(review): the test split is used both during training (evaluation,
    # best-checkpoint selection) and for the accuracy reported below; a separate
    # validation split would keep the reported number unbiased.
    # NOTE(review): max_steps=2500 with eval_steps=500 gives 5 evaluations in
    # total, so a patience of 5 looks like it can never be exhausted - confirm.
    trainer = transformers.Trainer(
        model=model,
        args=trainer_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        compute_metrics=compute_accuracy,
        data_collator=data_collator,
        callbacks=[transformers.EarlyStoppingCallback(early_stopping_patience=5), LogSavingCallback()]
    )

    trainer.train()
    eval_results = trainer.evaluate(dataset["test"])
    print('Learning rate:', lr, 'Accuracy:', eval_results['eval_accuracy'])

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss,Accuracy
500,0.9646,0.964276,0.5
1000,0.96,0.937627,0.5
1500,0.9492,0.919602,0.5
2000,0.916,0.909444,0.5
2500,0.9066,0.90623,0.5


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 5e-06 Accuracy: 0.5


Step,Training Loss,Validation Loss,Accuracy
500,0.6967,0.695665,0.50992
1000,0.6956,0.693803,0.51804
1500,0.6934,0.692734,0.52336
2000,0.6918,0.691876,0.52556
2500,0.6918,0.691665,0.52612


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 5e-05 Accuracy: 0.52612


Step,Training Loss,Validation Loss,Accuracy
500,0.6881,0.673649,0.57884
1000,0.6642,0.652278,0.59936
1500,0.638,0.629875,0.63924
2000,0.6194,0.619752,0.6478
2500,0.6168,0.616634,0.65312


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 0.0005 Accuracy: 0.65312


Step,Training Loss,Validation Loss,Accuracy
500,0.6865,0.637228,0.62624
1000,0.6159,0.664938,0.62276
1500,0.5826,0.574918,0.70132
2000,0.5491,0.561285,0.70672
2500,0.5492,0.534774,0.72672


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 0.005 Accuracy: 0.72672


Step,Training Loss,Validation Loss,Accuracy
500,0.6865,0.637228,0.62624
1000,0.6159,0.664938,0.62276
1500,0.5826,0.574918,0.70132
2000,0.5491,0.561285,0.70672
2500,0.5492,0.534774,0.72672


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 0.005 Accuracy: 0.72672


Step,Training Loss,Validation Loss,Accuracy
500,1.332,1.047678,0.55192
1000,1.2468,0.884091,0.5746
1500,0.9887,0.730061,0.61072
2000,0.8373,0.830893,0.60936
2500,0.6146,0.571852,0.7134


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 0.05 Accuracy: 0.7134


Step,Training Loss,Validation Loss,Accuracy
500,373.8849,464.389679,0.50108
1000,434.3002,237.639359,0.52312
1500,248.0971,204.130096,0.5044
2000,118.0842,35.678982,0.49376
2500,30.1166,1.654839,0.5198


Learning rate: 0.5 Accuracy: 0.5198


---
# Hyperparameter search – Second option

* Hyperparameter search using [Optuna](https://optuna.org/)

In [13]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.0.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.8/362.8 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.13.2-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.0/233.0 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [18]:
import optuna

def objective(trial):
    """Train and evaluate one SimpleCNN for the hyperparameters of an Optuna trial.

    Args:
        trial: the Optuna trial; the learning rate and the number of filters
            are drawn from it.

    Returns:
        The ``eval_accuracy`` reported by ``trainer.evaluate`` on the test split.
    """
    # Define the search space for hyperparameters
    # (log=True: the learning rate is sampled on a logarithmic scale)
    learning_rate = trial.suggest_float("learning_rate", 5e-4, 5e-2, log=True)
    num_filters = trial.suggest_categorical("num_filters", [10, 16, 24])

    # create the model
    config = BasicConfig(
        vocab_size = tokenizer.vocab_size,
        num_labels = len(set(dataset['train']['label'])),
        embedding_dim = 64,
        filter_size = 3,
        num_filters = num_filters, # <--- parameter goes here
    )

    model = SimpleCNN(config)

    # Set training arguments
    # (eval_strategy is the current name of the argument formerly called
    # evaluation_strategy, which is deprecated since transformers 4.41)
    trainer_args = transformers.TrainingArguments(
        "checkpoints",
        eval_strategy="steps",
        logging_strategy="steps",
        load_best_model_at_end=True,
        eval_steps=500,
        logging_steps=500,
        learning_rate=learning_rate, # <--- parameter goes here
        per_device_train_batch_size=8,
        max_steps=2500,
    )

    trainer = transformers.Trainer(
        model=model,
        args=trainer_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        compute_metrics=compute_accuracy,
        data_collator=data_collator,
        callbacks=[transformers.EarlyStoppingCallback(early_stopping_patience=5), LogSavingCallback()]
    )

    trainer.train()
    eval_results = trainer.evaluate(dataset["test"])
    print('Learning rate:', learning_rate, 'Filters:', num_filters, 'Accuracy:', eval_results['eval_accuracy'])
    # This is the value the study optimizes
    return eval_results['eval_accuracy']



# The objective returns an accuracy, so the study is set up to maximize it
study = optuna.create_study(direction="maximize")
# n_trials: how many trials we run, more would be needed in real case!
study.optimize(func=objective, n_trials=3)

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss,Accuracy
500,0.6722,0.611683,0.65632
1000,0.609,0.637774,0.63488
1500,0.5757,0.579635,0.69488
2000,0.5529,0.561009,0.70944
2500,0.5433,0.546472,0.72072


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 0.0032891344286570525 Filters: 10 Accuracy: 0.72072


Step,Training Loss,Validation Loss,Accuracy
500,0.6851,0.636767,0.62736
1000,0.6069,0.614509,0.65696
1500,0.5654,0.556262,0.7114
2000,0.5218,0.534269,0.72456
2500,0.5216,0.522814,0.73228


max_steps is given, it will override any value given in num_train_epochs


Learning rate: 0.002434791974958158 Filters: 16 Accuracy: 0.73228


Step,Training Loss,Validation Loss,Accuracy
500,1.6043,0.883112,0.59232
1000,1.5142,1.527417,0.61364
1500,1.3915,0.802837,0.65484
2000,0.7695,0.660751,0.68248
2500,0.6183,0.536751,0.7332


Learning rate: 0.03601436163120907 Filters: 24 Accuracy: 0.7332
