In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM at /content/drive.
drive.mount('/content/drive')

# Folder on the mounted drive that holds the data files. File names are appended to it
# by plain string concatenation (e.g. path+"train.csv"), so the trailing slash is required.
path = "/content/drive/My Drive/TFG/Exist_2023/"

In [None]:
# Which task to train for: "a" = sexism classification, "b" = intention classification.
TASK = "b" # "a"

model_name = "bert-base-cased"
#model_name = 'bert-base-multilingual-uncased'
#model_name = 'bert-base-multilingual-cased'
#model_name = "bert-base-uncased"
#model_name = "roberta-base"
#model_name = "cardiffnlp/twitter-roberta-base-emotion"
#model_name = "xlm-roberta-base"
#model_name = "xlm-roberta-large"


# Per task: (number of classes, column holding the target, column of the *other* task).
_TASK_SETTINGS = {
    "a": (2, "task1", "task2"),   # sexism classification
    "b": (3, "task2", "task1"),   # intention classification
}

# Fail right here on an unsupported TASK instead of hitting a NameError further down,
# where NUM_CLASSES / remove_columns would otherwise simply not exist.
if TASK not in _TASK_SETTINGS:
    raise ValueError(
        "Unsupported TASK {!r}; expected one of {}".format(TASK, sorted(_TASK_SETTINGS))
    )

NUM_CLASSES, CLASS_FIELD, _other_task_field = _TASK_SETTINGS[TASK]

# Columns that are neither the model input nor the target of the selected task.
remove_columns = [
    'id_tweet', 'lang', _other_task_field, 'size',
    'IDEOLOGICAL_INEQUALITY', 'STEREOTYPING_DOMINANCE', 'OBJECTIFICATION',
    'MISOGYNY_NON_SEXUALVIOLENCE', 'UNKNOWN', 'NONE', 'SEXUAL_VIOLENCE',
]

print("Classification: ", CLASS_FIELD, ", number of classes: ", NUM_CLASSES)

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

In [None]:
!pip install datasets transformers

In [None]:
pip install --upgrade accelerate

In [None]:
from datasets import load_dataset

# Each CSV is read with its own load_dataset call, in the order train, validation, test.
# The rows of every result sit under the "train" key (the key popped below).
dataset, val_set, test_set = (
    load_dataset("csv", data_files=path + file_name)
    for file_name in ("train.csv", "validation.csv", "test.csv")
)

# Move the validation and test rows into `dataset` under their own split names.
dataset['validation'] = val_set.pop("train")
dataset['test'] = test_set.pop("train")
dataset

In [None]:
# Drop every column listed in `remove_columns` from the dataset, then display what is left.
dataset = dataset.remove_columns(column_names=remove_columns)
dataset

In [None]:
# For every task except "a": keep, in every split, only the rows whose CLASS_FIELD value
# is not the "-" placeholder (presumably tweets without an annotation for that task — confirm).
# DatasetDict.filter applies the predicate to all splits and avoids fetching rows one by one
# through dataset[split][i].
if TASK != 'a':
    dataset = dataset.filter(lambda example: example[CLASS_FIELD] != '-')

dataset

In [None]:
# Distinct class names present in the training split, in sorted order.
LABELS = sorted({*dataset["train"][CLASS_FIELD]})
print(LABELS)

In [None]:
from sklearn.preprocessing import LabelEncoder

print('CLASS_FIELD:', CLASS_FIELD)
le = LabelEncoder()

# The encoder is fitted on the training labels only; validation and test labels are mapped
# with that same fitted encoder, so a class name gets the same integer id in every split.
Y_train = dataset["train"][CLASS_FIELD]
y_train = le.fit_transform(Y_train)

Y_val = dataset["validation"][CLASS_FIELD]
y_val = le.transform(Y_val)

Y_test = dataset["test"][CLASS_FIELD]
y_test = le.transform(Y_test)

# Attach the integer ids as a "label" column. A split that already has one (the cell was
# run before) is left untouched; any other add_column failure is allowed to surface
# instead of being swallowed by a bare `except`.
for split_name, encoded_labels in (("train", y_train), ("validation", y_val), ("test", y_test)):
    if "label" in dataset[split_name].column_names:
        print('Split "{}" already has a "label" column; leaving it unchanged'.format(split_name))
        continue
    dataset[split_name] = dataset[split_name].add_column("label", encoded_labels)

dataset

In [None]:
# Drop the CLASS_FIELD column from the dataset and display the result.
dataset = dataset.remove_columns(column_names=CLASS_FIELD)
dataset

In [None]:
from transformers import set_seed, BertTokenizerFast, AutoTokenizer, XLMRobertaXLModel


# Fix the random seed through transformers' set_seed helper.
set_seed(42)

# Load the tokenizer that belongs to `model_name`.
# The same options are used for every checkpoint; lower-casing is requested in addition
# only when the checkpoint name contains "uncased".
tokenizer_options = {"use_fast": True, "normalization": True}
if 'uncased' in model_name:
    tokenizer_options["do_lower_case"] = True
tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_options)

In [None]:
# Maximum sequence length used for padding/truncation: the longest tokenized training
# tweet, capped at 512 (presumably the position limit of the BERT-style checkpoints listed
# in the configuration — confirm for other models).
# All training tweets are tokenized in one batched call rather than one call per tweet;
# counting input ids gives the same length as counting tokens.
train_tweets = list(dataset['train']['tweet'])
token_counts = [len(ids) for ids in tokenizer(train_tweets)['input_ids']]

MAX_LENGTH = min(max(token_counts), 512)
print('MAX_LENGTH:', MAX_LENGTH)


In [None]:
def tokenize_func(example):
    """Tokenize the ``tweet`` field of ``example`` with the notebook's tokenizer.

    Every sequence is truncated and padded to exactly ``MAX_LENGTH`` tokens.

    Args:
        example: mapping with a ``"tweet"`` entry (a single text or a list of texts).

    Returns:
        Whatever the tokenizer returns for that input.
    """
    encoding_options = dict(truncation=True, padding='max_length', max_length=MAX_LENGTH)
    return tokenizer(example["tweet"], **encoding_options)

# Apply tokenize_func to the dataset in batched mode, then display the encoded result.
encoded_dataset = dataset.map(function=tokenize_func, batched=True)
encoded_dataset

In [None]:
from transformers import BertForSequenceClassification, TrainingArguments, BertForMultipleChoice, XLMRobertaForSequenceClassification, RobertaForSequenceClassification, XLMRobertaXLModel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import tensorflow as tf

# Build the sequence-classification model and move it to the GPU.
# The number of output labels comes from NUM_CLASSES so that it follows the selected TASK
# (a hard-coded 3 is wrong for task "a", which has 2 classes).

# Variant for RoBERTa-based checkpoints (roberta base, twitter roberta, xlm roberta):
# model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=NUM_CLASSES, ignore_mismatched_sizes=True).to("cuda")

# Variant for BERT-based checkpoints (bert base cased/uncased, multilingual):
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=NUM_CLASSES).to("cuda")

# Training configuration handed to the Trainer.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to `eval_strategy` —
# confirm against the installed version.
args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./outputs/',
    logging_dir='./logs',
    # optimisation settings
    learning_rate=2e-5,
    weight_decay=0.05,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,  # kept at 1 for a fast run; the author suggests raising it to 3 or 5
    # evaluate and checkpoint once per epoch, then reload the best checkpoint
    # as ranked by the "loss" metric
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)


In [None]:
# define metrics
import numpy as np

def compute_metrics(eval_pred):
    """Turn raw model scores into accuracy and macro-averaged precision/recall/F1.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` holds one row of
            per-class scores per sample, ``labels`` the reference class ids.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    scores, gold = eval_pred
    # the predicted class is the index of the highest score in each row
    predicted = np.argmax(scores, axis=1)
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        gold, predicted, average='macro'
    )
    return dict(
        accuracy=accuracy_score(gold, predicted),
        f1=macro_f1,
        precision=macro_precision,
        recall=macro_recall,
    )

In [None]:
from datasets import concatenate_datasets
# Short aliases for the three tokenized splits.
encoded_train, encoded_val, encoded_test = (
    encoded_dataset[split_name] for split_name in ("train", "validation", "test")
)

encoded_train

In [None]:
from transformers import Trainer

# Wire the model, the training arguments, the data splits and the metric function into a Trainer.
# (Author's note: with this setup trainer.data_collator was observed to be a DataCollatorWithPadding.)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train,      # split used for the weight updates
    eval_dataset=encoded_val,         # split passed as evaluation data
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # produces the reported evaluation metrics
)

# Run the fine-tuning.
trainer.train()

In [None]:
import os
# Set to False to skip writing the fine-tuned model and tokenizer to Drive.
SAVE_MODEL = True
if SAVE_MODEL:
    # Build paths with os.path.join: plain concatenation dropped the separator between
    # "results" and the model name, so the model landed next to the results folder
    # instead of inside it.
    models_dir = os.path.join(path, 'results')
    os.makedirs(models_dir, exist_ok=True)  # no error if the folder already exists

    # One folder per (model, task) pair. A model name containing "/" yields a nested folder.
    model_path = os.path.join(models_dir, "{}_{}".format(model_name, TASK))
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

In [None]:
# Evaluate the trained model through the Trainer. No dataset is passed here, so presumably
# the eval_dataset given to the Trainer (the validation split) is used — confirm.
trainer.evaluate()

In [None]:
def get_prediction(text):
    """Predict the class id of a single text with the fine-tuned model.

    The text is tokenized exactly like the training data (truncated and padded to
    ``MAX_LENGTH``) and pushed through ``model`` in inference mode.

    Args:
        text: the text to classify.

    Returns:
        int: index of the class with the highest probability.
    """
    import torch  # local import: torch is not imported at notebook level

    # Inference mode: disable dropout so repeated calls give the same answer,
    # whatever mode the model was left in by earlier cells.
    model.eval()
    # Put the inputs on whatever device the model lives on (not a hard-coded "cuda").
    inputs = tokenizer(text, truncation=True, padding='max_length', max_length=MAX_LENGTH, return_tensors="pt").to(model.device)
    # No gradients are needed for prediction; skipping them saves memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    # First output holds the per-class scores; softmax over the class axis gives probabilities.
    probs = outputs[0].softmax(1)
    # argmax() is a tensor; return its plain Python value.
    return probs.argmax().item()

In [None]:
# Predicted class id for every test tweet, one get_prediction call per tweet, in dataset order.
y_pred = list(map(get_prediction, dataset['test']['tweet']))

In [None]:
# ConfusionMatrixDisplay is used below, so it has to be imported too.
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, confusion_matrix

# Integer ids 0..len(LABELS)-1, assumed to follow the sorted order of LABELS (verify against
# the label encoder). Passing them explicitly keeps the report rows and the matrix size
# aligned with LABELS even if a class never shows up in the test labels or predictions.
label_ids = list(range(len(LABELS)))
test_labels = dataset['test']['label']

print(classification_report(y_true=test_labels, y_pred=y_pred, labels=label_ids, target_names=LABELS))
cm = confusion_matrix(test_labels, y_pred, labels=label_ids)

# Plot the confusion matrix with the class names on both axes.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=LABELS)
disp.plot()