In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [20]:
from datasets.filesystems import S3FileSystem
from datasets import DatasetDict, load_metric, Dataset
import torch
import transformers as tr
from tqdm import tqdm
from pathlib import Path
import shutil

In [91]:
model = tr.AutoModelForSeq2SeqLM.from_pretrained("t5-small")
tokenizer = tr.AutoTokenizer.from_pretrained("t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
s3fs = S3FileSystem()

In [86]:
dataset = DatasetDict.load_from_disk("s3://traal-storage/datasets/conll2003", fs=s3fs)['train']

In [87]:
import numpy as np

In [88]:
def calculate_mnlp_score(probas) -> np.ndarray:
    return np.array([-np.sum(np.log(np.max(i, axis=1))) / len(i) for i in probas])

In [89]:
def apply_model_fn(examples):
    inputs = tokenizer(examples['tokens'], truncation=True, padding='longest', return_tensors='pt', is_split_into_words=True)
    with torch.no_grad():
        outputs = model(**inputs).logits
        outputs = torch.softmax(outputs, -1).numpy()
    
    uncertainty_estimates = calculate_mnlp_score(outputs)
    return {'score': uncertainty_estimates}


In [90]:
dataset = dataset.map(apply_model_fn, batched=True, batch_size=16)

  0%|          | 0/878 [00:00<?, ?ba/s]

ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds

In [84]:
dataset[np.argsort(dataset['score'])[:2]]

{'tokens': [['Police',
   'said',
   'a',
   'number',
   'of',
   'crew',
   'members',
   'had',
   'left',
   'the',
   'aircraft',
   'and',
   'said',
   'details',
   'would',
   'be',
   'given',
   'at',
   'a',
   'news',
   'conference',
   'expected',
   'to',
   'be',
   'held',
   'in',
   'the',
   'next',
   'few',
   'minutes',
   'by',
   'the',
   'local',
   'police',
   'chief',
   '.'],
  ['At',
   'about',
   '4',
   'a.m.',
   'EDT',
   '(',
   '0800',
   'GMT',
   ')',
   ',',
   'a',
   'group',
   'of',
   'teenaged',
   'girls',
   'were',
   'having',
   'the',
   'overnight',
   'party',
   'in',
   'the',
   'Camelot',
   'subdivision',
   'of',
   'this',
   'eastern',
   'Virginia',
   'city',
   ',',
   'when',
   'a',
   'man',
   'entered',
   'the',
   'house',
   ',',
   'wielding',
   'a',
   'knife',
   ',',
   'threatening',
   'to',
   'sexually',
   'assault',
   'the',
   'girls',
   '.']],
 'ner': [['O',
   'O',
   'O',
   'O',
   'O',
   'O'

In [10]:
id2label = ['O', 'B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER']
label2id = {l:i for i, l in enumerate(id2label)}

In [11]:
def model_init():
    return tr.AutoModelForTokenClassification.from_pretrained("distilbert-base-cased", num_labels=len(id2label))

In [12]:
def convert_label_to_ids(item):
    return {
        'ner': [label2id[x] for x in item['ner']]
    }

In [13]:
dataset = dataset.map(convert_label_to_ids, batched=False, num_proc=2)

    

#0:   0%|          | 0/7021 [00:00<?, ?ex/s]

#1:   0%|          | 0/7021 [00:00<?, ?ex/s]

    

#0:   0%|          | 0/1626 [00:00<?, ?ex/s]

#1:   0%|          | 0/1625 [00:00<?, ?ex/s]

    

#0:   0%|          | 0/1727 [00:00<?, ?ex/s]

#1:   0%|          | 0/1727 [00:00<?, ?ex/s]

In [14]:
tokenizer = tr.AutoTokenizer.from_pretrained("distilbert-base-cased")

In [15]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [16]:
dataset = dataset.map(tokenize_and_align_labels, batched=True, batch_size=128, num_proc=2)

    

#0:   0%|          | 0/55 [00:00<?, ?ba/s]

#1:   0%|          | 0/55 [00:00<?, ?ba/s]

    

#1:   0%|          | 0/13 [00:00<?, ?ba/s]

#0:   0%|          | 0/13 [00:00<?, ?ba/s]

    

#1:   0%|          | 0/14 [00:00<?, ?ba/s]

#0:   0%|          | 0/14 [00:00<?, ?ba/s]

In [17]:
def count_n_tokens(dataset: Dataset) -> int:
    counter = 0
    for row in dataset:
        counter += len(row['tokens'])

    return counter

In [18]:
count_n_tokens(dataset['train'])

203621

In [49]:
data_collator = tr.DataCollatorForTokenClassification(tokenizer=tokenizer, pad_to_multiple_of=8)

In [13]:
metric = load_metric("seqeval")
import numpy as np

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [47]:
from uuid import uuid4

def create_experiment_id() -> str:
    base_uuid = str(uuid4()).split("-")[-1]
    return base_uuid

In [55]:
create_experiment_id()

'd151ffe85bac'

In [1]:
s3fs

NameError: name 's3fs' is not defined

In [49]:
from datetime import datetime

def get_time_for_saving():
    current_datatime = datetime.now()
    return current_datatime.strftime("%m-%d-%a-%H-%M")

In [50]:
get_time_for_saving()

'05-16-Mon-04-58'

In [21]:
experiment_id = create_experiment_id()
experiment_time = get_time_for_saving()
experiment_type = "conll2003"
experiment_seed = 42
output_dir = Path(f"../experiments/{experiment_type}/{experiment_seed}/{experiment_time}-{experiment_id}")
hp_search_dir = output_dir / "hp_search"

training_args = tr.TrainingArguments(
    output_dir=hp_search_dir,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    disable_tqdm=False,
    group_by_length=True,
    seed=experiment_seed,
    fp16=True, fp16_opt_level="O2",
    metric_for_best_model='eval_f1',
    load_best_model_at_end=True
)
trainer = tr.Trainer(
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=data_collator,
    model_init=model_init,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /home/rexhaif/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "lab

In [22]:
import optuna

def custom_search_objective(trial: optuna.Trial):
    possible_batch_sizes = [4, 8, 16, 32, 64]
    possible_batch_sizes = [x for x in possible_batch_sizes if (len(dataset['train']) / x) >= 20.0]
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
        "warmup_ratio": trial.suggest_float("warmup_ratio", 0.0, 0.3, step=0.01),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.1, step=0.01),
        "lr_scheduler_type": trial.suggest_categorical("lr_scheduler_type", ['linear', 'constant', 'cosine']),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", possible_batch_sizes)
    }

In [23]:
def compute_objective(metrics):
    return metrics['eval_f1']

In [None]:
best_trial = trainer.hyperparameter_search(
    hp_space=custom_search_objective,
    compute_objective=compute_objective,
    direction="maximize",
    backend="optuna",
    n_trials=10 # number of trials
)

In [46]:
create_experiment_id()

NameError: name 'create_experiment_id' is not defined

In [25]:
def get_checkpoint_dir(base_dir: str, best_trial: tr.trainer_utils.BestRun):
    best_run_dir = Path(base_dir, f"run-{best_trial.run_id}")
    max_step = -1
    max_step_path = None
    for dir in best_run_dir.iterdir():
        step_n = int(dir.parts[-1].split("-")[1])
        if step_n > max_step:
            max_step = step_n
            max_step_path = dir

    return max_step_path


In [26]:
model_dir = get_checkpoint_dir(hp_search_dir, best_trial)

In [27]:
save_dir = Path(output_dir) / "models" / "full"

In [28]:
save_dir.mkdir(parents=True, exist_ok=True)

In [29]:
model_dir.replace(save_dir)

PosixPath('../experiments/conll2003/42/05-15-Sun-18-10-7bfd6675c8d0/models/full')

In [36]:
shutil.rmtree(hp_search_dir)

In [32]:
!ls /home/rexhaif/traal/experiments/conll2003/42/05-15-Sun-18-10-7bfd6675c8d0/models/full

config.json	   rng_state.pth  special_tokens_map.json  trainer_state.json
optimizer.pt	   scaler.pt	  tokenizer_config.json    training_args.bin
pytorch_model.bin  scheduler.pt   tokenizer.json	   vocab.txt
