In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig, Trainer, TrainingArguments

import time
from sklearn.model_selection import train_test_split
from transformers_interpret import SequenceClassificationExplainer
import torch
from typing import Dict, List, Optional, Tuple, Union
from transformers import PreTrainedModel, PreTrainedTokenizer

# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# `args` is the single source of truth for every tunable in this notebook.
# String-typed switches ('bf16', 'deepspeed') are kept as strings and are
# interpreted further down.
args = {
    # locations / checkpoint
    'SCRIPTDIR': '',
    'MODEL_DIR': '',
    'MODEL_NAME': 'xlm-roberta-base',
    'DATA_PATH': 'data/raw/',
    'DATA_NAME': 'exalt_emotion_train.tsv',

    # optimisation / schedule
    'learning_rate': 4e-5,
    'per_device_train_batch_size': 4,
    'per_device_eval_batch_size': 4,
    'num_train_epochs': 1,
    'max_steps': 2,
    'weight_decay': 0.01,
    'evaluation_strategy': 'steps',
    'eval_steps': 2,
    'save_strategy': 'no',
    'gradient_accumulation_steps': 1,
    'bf16': 'false',          # the string 'true' (any case) enables bf16
    'trainable': 'output',
    'deepspeed': '',          # empty string: no deepspeed config is passed on

    # negative value: `save_steps` is not forwarded to the trainer arguments
    'save_steps': -1,
}

use_bf16 = args["bf16"].lower() == 'true'

# Output directory name: the run's key settings joined by '=' plus a timestamp.
# The learning rate is written compactly ('4e-05' -> '4e5').
# NOTE(review): the factor 8 in the batch-size tag is presumably the number of
# devices - confirm, it is not derived from the actual hardware.
SAVE_DIR = args["SCRIPTDIR"] + 'resultT/' + '='.join([
    args["MODEL_NAME"],
    args["DATA_NAME"][:-4],                      # file name without its 4-char extension
    args["trainable"] + 'PT',
    f'lr{args["learning_rate"]:0.0e}'.replace('-0', ''),
    f'bs{args["per_device_train_batch_size"] * 8 * args["gradient_accumulation_steps"]}',
    'bf16' if use_bf16 else 'fp16',
    time.strftime('%m%d-%H%M%S', time.localtime()),
])

# Keyword arguments later unpacked into `TrainingArguments`.
# Settings copied verbatim from `args`:
train_args = {
    key: args[key]
    for key in (
        'learning_rate',
        'per_device_train_batch_size',
        'per_device_eval_batch_size',
        'num_train_epochs',
        'max_steps',
        'weight_decay',
        'evaluation_strategy',
        'eval_steps',
        'save_strategy',
        'gradient_accumulation_steps',
    )
}
# Settings that are fixed or derived.
# (Previously considered but left disabled: load_best_model_at_end=True,
#  save_total_limit=2.)
train_args.update(
    output_dir=SAVE_DIR,
    overwrite_output_dir=True,
    logging_strategy='epoch',
    bf16=use_bf16,
)

# Optional settings are only forwarded when they are actually configured.
if len(args["deepspeed"]) > 0:
    train_args['deepspeed'] = args["deepspeed"]
if args["save_steps"] >= 0:
    train_args['save_steps'] = args["save_steps"]

# Derived from the config so the checkpoint name cannot drift from `args`.
MODEL_NAME = args['MODEL_NAME']


  from .autonotebook import tqdm as notebook_tqdm



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-11.7/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /usr/local/lib/python3.9/dist-packages/bitsandbytes/libbitsandbytes_cuda117.so...


  warn(msg)


[2024-05-19 13:07:16,914] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
import torch
from typing import Dict, List, Optional, Tuple, Union
from transformers import PreTrainedModel, PreTrainedTokenizer

# to run this on the entire dataset
import pandas as pd
import os

# pick the model you want to use
from datasets import Dataset, DatasetDict, load_metric
# Directory holding the raw TSV files, taken from the shared run config rather
# than repeating the 'data/raw/' literal.
raw_dir = args["SCRIPTDIR"] + args["DATA_PATH"]

# SECURITY NOTE(review): `eval` executes whatever expression is stored in the
# TSV cells. The columns presumably hold plain Python list literals; if that is
# confirmed, `ast.literal_eval` is the safe drop-in. Only load trusted files.
traindata = pd.read_csv(raw_dir + "exalt_triggers_train_token.tsv", sep="\t")
traindata['Labels'] = traindata['Labels'].apply(eval)
traindata['tokens'] = traindata['tokens'].apply(eval)

# Hold out 10% of the training file as a dev split (fixed seed for repeatability).
traindf, devdf = train_test_split(traindata, test_size=0.1, random_state=42)

# The participants' dev file is used as the test set; only its `tokens` column
# is parsed here.
testdf = pd.read_csv(raw_dir + "exalt_triggers_dev_participants_token.tsv", sep="\t")
testdf['tokens'] = testdf['tokens'].apply(eval)

datasets = DatasetDict({
    "train": Dataset.from_pandas(traindf),
    "dev": Dataset.from_pandas(devdf),
})
datasets2 = DatasetDict({
    "test": Dataset.from_pandas(testdf),
})

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# When True, every sub-token of a word carries the word's label; when False,
# only the first sub-token does and the rest are masked with -100.
label_all_tokens = True
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and expand word labels to sub-tokens.

    Args:
        examples: batch with a "tokens" entry (one word list per example) and
            a "Labels" entry (one label list per example, indexed by word).

    Returns:
        The tokenizer output with an added "labels" entry holding one label
        per sub-token. Positions without a source word (special tokens) get
        -100. Continuation sub-tokens get the word's label when the global
        `label_all_tokens` is true, otherwise -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for batch_idx, word_labels in enumerate(examples["Labels"]):
        row = []
        last_word = None
        for word in encoded.word_ids(batch_index=batch_idx):
            if word is None:
                # Special token: masked with -100.
                row.append(-100)
            elif word == last_word and not label_all_tokens:
                # Continuation piece of a word, and continuations are masked.
                row.append(-100)
            else:
                # First piece of a word, or a continuation that inherits the label.
                row.append(word_labels[word])
            last_word = word
        aligned.append(row)

    encoded["labels"] = aligned
    return encoded
def tokenize_and_align_labels2(examples):
    """Tokenize one pre-split example and record each sub-token's word index.

    Args:
        examples: mapping with a "tokens" entry containing the words to encode.

    Returns:
        The tokenizer output with an added "Labels" entry. Despite the key
        name, it stores the result of `word_ids()` (no labels are involved),
        which later code uses to map sub-token predictions back onto words.
    """
    encoded = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True)
    sub_token_word_ids = encoded.word_ids()
    encoded["Labels"] = sub_token_word_ids
    return encoded

# Train/dev splits: encoded in batches, producing aligned "labels".
tokenized_datasets = datasets.map(
    function=tokenize_and_align_labels,
    batched=True,
)
# Test split: encoded one example at a time, so each row stores its own word ids.
tokenized_datasets2 = datasets2.map(function=tokenize_and_align_labels2)


                                                                 

In [5]:
# Load the token-classification model (2 labels) on top of the checkpoint.
# Fall back to the CPU when no CUDA device is available instead of crashing
# on a hard-coded 'cuda' target.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

from transformers import DataCollatorForTokenClassification

# Collator built from the same tokenizer that encoded the datasets.
data_collator = DataCollatorForTokenClassification(tokenizer)

def compute_metrics(eval_pred,ep=[1]):
    """Compute token-level precision, recall, F1 and accuracy.

    Args:
        eval_pred: pair `(logits, labels)`. The argmax is taken over axis 2 of
            `logits`; label positions equal to -100 are excluded from scoring.
        ep: unused; kept only so the existing signature stays unchanged.

    Returns:
        dict with keys "precision", "recall", "f1" (macro-averaged) and
        "accuracy".
    """
    # Imported locally so the function does not depend on a later cell having
    # already brought `np` into the namespace.
    import numpy as np

    # `args` is a dict, so the script directory must be read by key
    # (attribute access raises AttributeError).
    metric_dir = args["SCRIPTDIR"] + 'starters_kit/metric/'
    metric1 = load_metric(metric_dir + "precision.py", trust_remote_code=True)
    metric2 = load_metric(metric_dir + "recall.py", trust_remote_code=True)
    metric3 = load_metric(metric_dir + "f1.py", trust_remote_code=True)
    metric4 = load_metric(metric_dir + "accuracy.py", trust_remote_code=True)

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=2)

    # Flatten every sequence into two parallel lists, skipping masked (-100)
    # positions.
    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        for p, l in zip(prediction, label):
            if l != -100:
                true_predictions.append(p)
                true_labels.append(l)

    precision = metric1.compute(predictions=true_predictions, references=true_labels, average="macro")["precision"]
    recall = metric2.compute(predictions=true_predictions, references=true_labels, average="macro")["recall"]
    f1 = metric3.compute(predictions=true_predictions, references=true_labels, average="macro")["f1"]
    accuracy = metric4.compute(predictions=true_predictions, references=true_labels)["accuracy"]

    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at /work/share/public/weights/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Materialise the keyword arguments assembled in the configuration cell.
training_args = TrainingArguments(**train_args)

# Everything the Trainer needs, gathered in one place before construction.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# NOTE(review): no `trainer.train()` call is visible before this prediction -
# confirm the classifier head is not still at its fresh initialisation here.
resultss = trainer.predict(tokenized_datasets2["test"])


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [8]:
import numpy as np

def softmax(z):
    """Two-class softmax over every position of every sequence.

    Args:
        z: sequence of sequences of score pairs; only elements 0 and 1 of
            each innermost item are read.

    Returns:
        Nested list with the same outer/inner lengths as `z`, each innermost
        item being `[p0, p1]` with `p0 + p1 == 1`.
    """
    hat_z = []
    for sequence in z:
        probs = []
        for k in sequence:
            # Shift by the larger score before exponentiating: the result is
            # mathematically unchanged, but large scores no longer overflow
            # to inf and turn the ratio into nan.
            shift = max(k[0], k[1])
            e0 = np.exp(k[0] - shift)
            e1 = np.exp(k[1] - shift)
            total = e0 + e1
            probs.append([e0 / total, e1 / total])
        hat_z.append(probs)
    return hat_z

import os

# Probability above which a word is marked as a trigger in the binary output.
BINARY_THRESHOLD = 0.1


def _pool_word_scores(token_outputs, word_ids, n_words):
    """Max-pool the class-1 score of each sub-token onto its source word.

    Args:
        token_outputs: per-position score pairs for one example; element 1 of
            each pair is the value that gets pooled.
        word_ids: for each sub-token position, the index of the word it came
            from, or None for positions that belong to no word.
        n_words: number of words in the example; fixes the result length.

    Returns:
        List of length `n_words`. Words that produced no sub-token keep 0.
    """
    # NOTE(review): pooling starts from 0, so negative values (e.g. raw
    # logits) are floored at 0 - confirm this is intended for the numerical
    # output.
    scores = [0] * n_words
    for position, word in enumerate(word_ids):
        if word is not None:
            scores[word] = max(scores[word], token_outputs[position][1])
    return scores


# The "Labels" column of the tokenized test set holds the sub-token word ids.
test_word_ids = tokenized_datasets2["test"]['Labels']
test_tokens = testdf['tokens'].tolist()

# map sub-tokens to token, numerical output
resultN = resultss.predictions.tolist()
aveN = [
    _pool_word_scores(output, ids, len(tokens))
    for output, ids, tokens in zip(resultN, test_word_ids, test_tokens)
]

# map sub-tokens to token, binary output
resultB = softmax(resultss.predictions)
aveB = [
    _pool_word_scores(output, ids, len(tokens))
    for output, ids, tokens in zip(resultB, test_word_ids, test_tokens)
]

# Report examples whose trailing words received no sub-token (for instance an
# empty-string token). Sizing the pooled lists by the word count above already
# leaves those words at 0, however many of them there are.
for i, (ids, tokens) in enumerate(zip(test_word_ids, test_tokens)):
    covered = [word for word in ids if word is not None]
    n_covered = covered[-1] + 1 if covered else 0
    if n_covered != len(tokens):
        print(i)
        print(tokens[-10:])

# NOTE: this reassigns `testdf`, so the cell cannot be re-run without first
# re-running the data-loading cell.
testdf = testdf.drop(columns='tokens')
os.makedirs(SAVE_DIR,exist_ok=True)
testdf["Labels"] = aveN
print(testdf.iloc[0])
# testdf.to_csv(SAVE_DIR + "/NumericalTriggers.tsv", sep="\t", index=False)
avedB = [[int(score > BINARY_THRESHOLD) for score in word_scores] for word_scores in aveB]
testdf["Labels"] = avedB
print(testdf.iloc[0])
# testdf.to_csv(SAVE_DIR + "/BinaryTriggers.tsv", sep="\t", index=False)



73
['this', 'season', '👨', '🏼', '👨', '🏻', '👩', '🏻', '@user', '']
291
['.', 'Смотреть', '!', 'Юмор', '.', 'http', 'с', 'помощью', '@user', '']
ID                                                     8133
Texts     There is something wrong with your mind . Nobo...
Labels        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Name: 0, dtype: object
ID                                                     8133
Texts     There is something wrong with your mind . Nobo...
Labels        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Name: 0, dtype: object
