In [5]:

import os, subprocess
from datasets import Dataset, DatasetDict, load_metric
import pandas as pd
import os
import numpy as np
# Optional accelerator support: these imports are attempted for their side
# effects only (presumably `transfer_to_npu` redirects CUDA calls to the NPU
# backend -- confirm against the torch_npu documentation).
try:
    import torch
    import torch_npu
    from torch_npu.contrib import transfer_to_npu
except Exception:
    # Deliberately best-effort: the notebook must keep working on machines
    # without torch_npu. Catching `Exception` instead of using a bare `except`
    # lets KeyboardInterrupt / SystemExit propagate as they should.
    pass

import time
# ---------------------------------------------------------------------------
# Run configuration. Every tunable lives in `args`; the values below describe
# a tiny smoke run (max_steps = 2), not a full fine-tuning job.
# ---------------------------------------------------------------------------
args = {}
args['SCRIPTDIR'] = ''    # plain string prefix for result/metric/data paths
args['MODEL_DIR'] = ''    # plain string prefix put in front of MODEL_NAME
args['MODEL_NAME'] = 'xlm-roberta-base'
args['DATA_PATH'] = 'data/raw/'
args['DATA_NAME'] = 'exalt_emotion_train.tsv'

args['learning_rate'] = 4e-5
args['per_device_train_batch_size'] = 4
args['per_device_eval_batch_size'] = 4
args['num_train_epochs'] = 1
args['max_steps'] = 2
args['weight_decay'] = 0.01
args['evaluation_strategy'] = 'steps'
args['eval_steps'] = 2
args['save_strategy'] = 'no'
args['gradient_accumulation_steps'] = 1
args['bf16'] = 'false'       # string flag; compared case-insensitively with 'true'
args['trainable'] = 'full'   # 'output' and 'inner' trigger partial freezing later on
args['deepspeed'] = ''       # forwarded to the trainer only when non-empty

args['save_steps'] = -1      # forwarded to the trainer only when >= 0

# Multiplier used for the "bs" tag of the run name.
# NOTE(review): presumably the number of devices of the target machine -- confirm.
RUN_NAME_DEVICE_COUNT = 8

# Parse the string flag once so the run name and `train_args` cannot disagree.
_use_bf16 = args["bf16"].lower() == 'true'

# Strip the extension properly instead of assuming it is exactly 4 characters.
_dataset_stem = os.path.splitext(args["DATA_NAME"])[0]

# e.g. 4e-05 -> "lr4e5": the "-0" of the exponent is dropped on purpose.
_lr_tag = f'lr{args["learning_rate"]:0.0e}'.replace('-0', '')

_batch_tag = 'bs{}'.format(
    args["per_device_train_batch_size"]
    * RUN_NAME_DEVICE_COUNT
    * args["gradient_accumulation_steps"]
)

# NOTE(review): the 'fp16' tag is only a label -- `train_args` below never
# enables fp16, it only sets `bf16`.
_precision_tag = 'bf16' if _use_bf16 else 'fp16'

# Output directory: result/<model>=<data>=<trainable>PT=<lr>=<bs>=<precision>=<timestamp>
SAVE_DIR = args["SCRIPTDIR"] + 'result/' + '='.join([
    args["MODEL_NAME"],
    _dataset_stem,
    args["trainable"] + 'PT',
    _lr_tag,
    _batch_tag,
    _precision_tag,
    time.strftime('%m%d-%H%M%S', time.localtime()),
])

# Keyword arguments later unpacked into `TrainingArguments`.
train_args = dict(
    output_dir = SAVE_DIR,
    learning_rate = args["learning_rate"],
    per_device_train_batch_size = args["per_device_train_batch_size"],
    per_device_eval_batch_size = args["per_device_eval_batch_size"],
    num_train_epochs = args["num_train_epochs"],
    max_steps = args["max_steps"],
    weight_decay = args["weight_decay"],
    evaluation_strategy = args["evaluation_strategy"],
    eval_steps = args["eval_steps"],
    save_strategy = args["save_strategy"],
    # load_best_model_at_end=True,
    # save_total_limit=2,
    overwrite_output_dir=True,
    gradient_accumulation_steps = args["gradient_accumulation_steps"],
    logging_strategy = 'epoch',
    bf16 = _use_bf16,
)

# Optional settings are only passed on when they were actually configured.
if len(args["deepspeed"]) > 0 :
    train_args['deepspeed'] = args["deepspeed"]
if args["save_steps"]>=0:
    train_args['save_steps'] = args["save_steps"]

In [6]:
# Training data: <SCRIPTDIR><DATA_PATH><DATA_NAME>, tab-separated.
path_to_trainfile = args["SCRIPTDIR"] + args["DATA_PATH"] + args["DATA_NAME"]
train_file = pd.read_csv(path_to_trainfile, sep="\t")

# Participants' dev file. `os.path.join` keeps the path relative when SCRIPTDIR
# is empty (a hard-coded leading "/" would point at the filesystem root) and
# works whether or not SCRIPTDIR ends with a separator.
path_to_dev_file = os.path.join(args["SCRIPTDIR"], "data/raw/exalt_emotion_dev_participants.tsv")
dev_file = pd.read_csv(path_to_dev_file, sep="\t")

# Distinct label strings found in the training file.
unique_labels = train_file["Labels"].unique().tolist()
print(f"{len(unique_labels)} Unique Labels: {unique_labels}")

# Two-way mapping between label strings and integer class ids (position in
# `unique_labels`).
id2label = dict(enumerate(unique_labels))
label2id = {name: idx for idx, name in id2label.items()}

from sklearn.model_selection import train_test_split

# Hold out 10% of the training file as the "dev" split (fixed seed), and use
# the separately loaded dev file as the "test" split.
traindf, devdf = train_test_split(train_file, test_size=0.1, random_state=42)
testdf = dev_file

datasets = DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (("train", traindf), ("dev", devdf), ("test", testdf))
})


6 Unique Labels: ['Joy', 'Neutral', 'Sadness', 'Love', 'Anger', 'Fear']


In [8]:
from transformers import AutoTokenizer

# Checkpoint identifier: optional directory prefix followed by the model name
# (e.g. "xlm-roberta-base" when MODEL_DIR is empty).
MODEL_NAME = "{}{}".format(args["MODEL_DIR"], args["MODEL_NAME"])
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    """Tokenize a batch of texts and attach integer class ids when labels exist.

    Args:
        examples: batch mapping column names to lists of values (the function
            is applied with ``batched=True``). Must contain "Texts"; "Labels"
            is optional so that unlabeled splits can be processed as well.

    Returns:
        The tokenizer output for ``examples["Texts"]`` (truncation enabled),
        extended with a "label" entry holding ``label2id`` ids whenever the
        batch has a "Labels" column.

    Raises:
        KeyError: if a label string is not a key of ``label2id``.
    """
    encoded = tokenizer(examples["Texts"], truncation=True)
    if "Labels" in examples:
        # Return the ids as part of the output rather than writing them into
        # the input batch: the new column then no longer depends on `map`
        # propagating in-place changes made to its argument.
        encoded["label"] = [label2id[x] for x in examples["Labels"]]
    return encoded


tokenized_datasets = datasets.map(preprocess_function, batched=True)

                                                                  

In [10]:
from transformers import DataCollatorWithPadding
# Collator that pads the tokenized examples of each batch using `tokenizer`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Size the classification head from the labels actually found in the training
# data instead of a hard-coded 6, and store the label names in the model
# config so predictions can be mapped back to label strings.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
)

def freeze(module, status=False):
    """Assign ``status`` to ``requires_grad`` on every parameter of ``module``.

    Args:
        module: object exposing ``parameters()`` (e.g. a model or sub-module).
        status: value written to each parameter's ``requires_grad``; the
            default ``False`` excludes the parameters from gradient updates.
    """
    for weight in module.parameters():
        weight.requires_grad = status

# Partial fine-tuning: 'output' freezes the whole base model, 'inner' freezes
# only its embeddings; any other value (e.g. 'full') freezes nothing.
_trainable_mode = args["trainable"].lower()
if _trainable_mode == 'output':
    freeze(model.base_model)
elif _trainable_mode == 'inner':
    freeze(model.base_model.embeddings)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at /work/share/public/weights/xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Metric objects already loaded from the starter kit, keyed by metric name.
_METRIC_CACHE = {}


def _get_metric(name):
    """Load the starter-kit metric script ``<name>.py`` once and reuse it afterwards."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = load_metric(
            args["SCRIPTDIR"] + 'starters_kit/metric/' + name + ".py",
            trust_remote_code=True,
        )
    return _METRIC_CACHE[name]


def custom_metrics(eval_pred):
    """Compute macro precision/recall/F1 and accuracy for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys "precision", "recall", "f1" (macro-averaged) and
        "accuracy".
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # Predictions may arrive as a tuple when a model emits extra outputs;
        # the first element is taken as the logits in that case.
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)

    # The metric scripts are cached, so repeated evaluations do not reload
    # them from disk every time.
    scores = {}
    for name in ("precision", "recall", "f1"):
        scores[name] = _get_metric(name).compute(
            predictions=predictions, references=labels, average="macro"
        )[name]
    scores["accuracy"] = _get_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]

    return scores

In [14]:
# Turn the plain `train_args` dict into a TrainingArguments object.
training_args = TrainingArguments(**train_args)

# Gather everything the Trainer needs before constructing it. Evaluation runs
# on the "dev" split, i.e. the 10% held out from the training file.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=custom_metrics,
)
trainer = Trainer(**trainer_kwargs)

trainer.train()

Step,Training Loss,Validation Loss


  metric1 = load_metric(args["SCRIPTDIR"] + 'starters_kit/metric/' + "precision.py", trust_remote_code=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=2, training_loss=1.6755812168121338, metrics={'train_runtime': 1.3719, 'train_samples_per_second': 5.831, 'train_steps_per_second': 1.458, 'total_flos': 283676794272.0, 'train_loss': 1.6755812168121338, 'epoch': 0.0})