In [14]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig, Trainer, TrainingArguments

import sys
sys.path.append('')
import time
from sklearn.model_selection import train_test_split
from transformers_interpret import SequenceClassificationExplainer
import torch
from typing import Dict, List, Optional, Tuple, Union
from transformers import PreTrainedModel, PreTrainedTokenizer

args = {}
args['SCRIPTDIR'] = ''
args['MODEL_DIR'] = ''
args['MODEL_NAME'] = 'xlm-roberta-base'
args['DATA_PATH'] = 'data/raw/'
args['DATA_NAME'] = 'exalt_emotion_train.tsv'

args['learning_rate'] = 4e-5
args['per_device_train_batch_size'] = 4
args['per_device_eval_batch_size'] = 4
args['num_train_epochs'] = 1
args['max_steps'] = 2
args['weight_decay'] = 0.01
args['evaluation_strategy'] = 'steps'
args['eval_steps'] = 2
args['save_strategy'] = 'no'
args['gradient_accumulation_steps'] = 1
args['bf16'] = 'false'
args['trainable'] = 'output'
args['deepspeed'] = ''

args['save_steps'] = -1

SAVE_DIR = args["SCRIPTDIR"] + 'resultT/' + \
    (
        args["MODEL_NAME"] +'='+ \
        args["DATA_NAME"][:-4] + '=' + \
        args["trainable"] + 'PT=' + \
        f'lr{args["learning_rate"]:0.0e}='.replace('-0','') + \
        f'bs{args["per_device_train_batch_size"]*8*args["gradient_accumulation_steps"]}=' + \
        ('bf16' if args["bf16"].lower()=='true' else 'fp16') + '=' + \
        time.strftime('%m%d-%H%M%S', time.localtime())
    )

train_args = dict(
    output_dir = SAVE_DIR,
    learning_rate = args["learning_rate"],
    per_device_train_batch_size = args["per_device_train_batch_size"],
    per_device_eval_batch_size = args["per_device_eval_batch_size"],
    num_train_epochs = args["num_train_epochs"],
    max_steps = args["max_steps"],
    weight_decay = args["weight_decay"],
    evaluation_strategy = args["evaluation_strategy"],
    eval_steps = args["eval_steps"],
    save_strategy = args["save_strategy"],
    # load_best_model_at_end=True,
    # save_total_limit=2,
    overwrite_output_dir=True,

    gradient_accumulation_steps = args["gradient_accumulation_steps"],
    logging_strategy = 'epoch',
    bf16 = args["bf16"].lower()=='true',
)

if len(args["deepspeed"]) > 0 :
    train_args['deepspeed'] = args["deepspeed"]
if args["save_steps"]>=0:
    train_args['save_steps'] = args["save_steps"]

MODEL_NAME = args["MODEL_DIR"] + args["MODEL_NAME"]
MODEL_NAME = ''


In [9]:
import pandas as pd
import os

from datasets import Dataset, DatasetDict, load_metric
traindata = pd.read_csv(args["SCRIPTDIR"] + "data/raw/exalt_triggers_train_token.tsv", sep="\t")
traindata['Labels'] = traindata['Labels'].apply(eval)
traindata['tokens'] = traindata['tokens'].apply(eval)
traindf, devdf = train_test_split(traindata, test_size=0.0001, random_state=42)

datasets = DatasetDict({
    "train": Dataset.from_pandas(traindf),
    "dev" : Dataset.from_pandas(devdf),
    "test": Dataset.from_pandas(devdf)
    })

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
label_all_tokens = True
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["Labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# a=tokenize_and_align_labels(datasets['train'][:1])
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/2999 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
                                                                 

In [15]:
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME,num_labels=2)

def freeze(module,status=False):
    for parameter in module.parameters():
        parameter.requires_grad = status

if args["trainable"].lower()=='output':
    freeze(model.base_model)
elif args["trainable"].lower()=='inner':
    freeze(model.base_model.embeddings)

from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at /work/share/public/weights/xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import numpy as np

# Errors occur
# metric = load_metric("seqeval")
# def compute_metrics(p): 
#     predictions, labels = p
#     predictions = np.argmax(predictions, axis=2)

#     # Remove ignored index (special tokens)
#     true_predictions = [
#         [p for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     true_labels = [
#         [l for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]

#     results = metric.compute(predictions=true_predictions, references=true_labels)
#     return {
#         "precision": results["overall_precision"],
#         "recall": results["overall_recall"],
#         "f1": results["overall_f1"],
#         "accuracy": results["overall_accuracy"],
#     }

# Note that the results don't match those on codalab
def compute_metrics(eval_pred): 
    metric1 = load_metric(args["SCRIPTDIR"] + 'starters_kit/metric/' + "precision.py", trust_remote_code=True)
    metric2 = load_metric(args["SCRIPTDIR"] + 'starters_kit/metric/' + "recall.py", trust_remote_code=True)
    metric3 = load_metric(args["SCRIPTDIR"] + 'starters_kit/metric/' + "f1.py", trust_remote_code=True)
    metric4 = load_metric(args["SCRIPTDIR"] + 'starters_kit/metric/' + "accuracy.py", trust_remote_code=True)
    
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=2)

    true_predictions = [
        p for prediction, label in zip(predictions, labels) for (p, l) in zip(prediction, label) if l != -100
    ]
    true_labels = [
        l for prediction, label in zip(predictions, labels) for (p, l) in zip(prediction, label) if l != -100
    ]


    precision = metric1.compute(predictions=true_predictions, references=true_labels, average="macro")["precision"]
    recall = metric2.compute(predictions=true_predictions, references=true_labels, average="macro")["recall"]
    f1 = metric3.compute(predictions=true_predictions, references=true_labels, average="macro")["f1"]
    accuracy = metric4.compute(predictions=true_predictions, references=true_labels)["accuracy"]

    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}


In [19]:
training_args = TrainingArguments(
    **train_args
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=2, training_loss=0.4004236161708832, metrics={'train_runtime': 0.1334, 'train_samples_per_second': 59.976, 'train_steps_per_second': 14.994, 'total_flos': 200055329376.0, 'train_loss': 0.4004236161708832, 'epoch': 0.0})