In [None]:
#step 1: install the required packages
!pip install transformers==4.28.0
!pip install seqeval
!pip install -q datasets
!pip install -q evaluate


In [None]:
#step 1.1 import everything
import seqeval
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
import json
import glob
import numpy as np
from datasets import load_dataset, Dataset, concatenate_datasets, DatasetDict
import evaluate

In [None]:
#step 1.2 is to upload the files
#I cloned the entire repo and manually uploaded to Colab
#then we open the files up
#I don't use the fce german files bc they aren't binary, I only use falko for german

In [None]:
#step 1.3 use glob to gather all file names
#We are using the shared-task dev data as our test set, because the shared-task
#test set has no gold labels released; our own dev split is cut from the
#training files later on.
#glob returns paths in arbitrary (filesystem-dependent) order, so the lists are
#sorted to keep the later concatenation and seeded shuffling reproducible.
train_file_list = sorted(glob.glob("/content/*train.tsv"))
test_file_list = sorted(glob.glob("/content/*dev.tsv"))
print("TRAIN FILES:", train_file_list)
print("TEST FILES:", test_file_list)

TRAIN FILES: ['/content/cs_geccc_train.tsv', '/content/it_merlin_train.tsv', '/content/de_falko-merlin_train.tsv', '/content/en_fce_train.tsv', '/content/sv_swell_train.tsv']
TEST FILES: ['/content/cs_geccc_dev.tsv', '/content/it_merlin_dev.tsv', '/content/en_realec_dev.tsv', '/content/sv_swell_dev.tsv', '/content/en_fce_dev.tsv', '/content/de_falko-merlin_dev.tsv']


In [None]:
#Lookup tables between the string labels found in the data files and the
#numerical ids used for training (presumably "c" = correct, "i" = incorrect)
label2id = dict(c=1, i=0)
#the reverse table is derived from label2id so the two can never disagree
id2label = {idx: name for name, idx in label2id.items()}

Transformer models make contextual embeddings based on sentence input. Because of this, we need to split the data into sentences, where each token in a sentence has a label attached. This is easy to do because the files we are working with have newlines in between sentences.

In [None]:
def get_sentences(fin):
  """Read a token-per-line TSV file and group its lines into sentences.

  Every non-blank line holds a token and its label separated by a tab; blank
  lines mark sentence boundaries.

  Args:
    fin: path of the file to read. The language code is the part of the file
      name before the first underscore, e.g. "/content/it_merlin_train.tsv"
      gives "it".

  Returns:
    (lang, sentences): the language code and a list of sentences, where each
    sentence is a list of (token, label) tuples.
  """
  import os

  #look at the file name only, so the result does not depend on how deep the
  #directory is or on underscores appearing in directory names
  lang = os.path.basename(fin).split("_")[0]

  sentences = []
  sentence = []
  #the data is multilingual, so do not rely on the platform default encoding
  with open(fin, "r", encoding="utf-8") as inf:
    for line in inf:
      line = line.rstrip("\r\n")
      if not line.strip():
        #blank line = sentence boundary; repeated blank lines add nothing
        if sentence:
          sentences.append(sentence)
          sentence = []
      else:
        #the label is the last tab-separated field on the line
        token, label = line.rsplit("\t", 1)
        sentence.append((token, label))

  #keep the final sentence when the file does not end with a blank line
  if sentence:
    sentences.append(sentence)

  return lang, sentences

For a multilingual experiment, we combine all the languages together. The function below is not actually used in this implementation (I only tested it on Italian); it is kept for reference.

In [None]:
#not used in the multilingual setting
def combine_all_sentences(train_list, test_list):
  """
  Pool the sentences of all files into two flat lists.

  train_list and test_list are lists of file paths. Every file is read with
  get_sentences and its sentences are appended in file order. The language
  returned for each file is thrown away, so a monolingual setting would need
  extra bookkeeping to keep track of languages.

  Returns (train_sents, test_sents).
  """
  def _pool(paths):
    #flatten the per-file sentence lists into one list
    pooled = []
    for path in paths:
      _lang, file_sents = get_sentences(path)
      pooled.extend(file_sents)
    return pooled

  return _pool(train_list), _pool(test_list)

In [None]:
def list_to_dataset(lang, sent_list, test=False):
  """Turn a list of sentences into a Hugging Face dataset.

  Args:
    lang: language code stored in the "lang" column of every row.
    sent_list: list of sentences, each a list of (token, label) tuples with
      string labels that are keys of label2id.
    test: when True the whole data is returned as a single Dataset; otherwise
      it is split 80/20 (shuffled, fixed seed) into "train" and "dev".

  Returns:
    A Dataset (test=True) or a DatasetDict with "train" and "dev" splits.
  """
  #one record per sentence; the running index serves as the "id" column
  records = [
      {
          "id": idx,
          "lang": lang,
          "tokens": [pair[0] for pair in sent],
          #"c" / "i" string labels become their numerical ids
          "labels": [label2id[pair[1]] for pair in sent],
      }
      for idx, sent in enumerate(sent_list)
  ]
  full = Dataset.from_list(records)

  if test:
    return full

  split = full.train_test_split(test_size=0.2, shuffle=True, seed=34)
  #the held-out part is our dev set, so store it under "dev" instead of "test"
  split["dev"] = split.pop("test")
  return split

In [None]:
def combine_datasets_multiling(train_list, test_list):
  """Read every file as its own dataset (with the language recorded) and
  concatenate them into one DatasetDict.

  The "test" split is built from the shared-task dev files (since the shared
  task test set doesn't have gold labels released). The "dev" split is the
  random .2 split that list_to_dataset takes from each shared-task training
  file; the remainder forms "train".

  Args:
    train_list: paths of the shared-task training files.
    test_list: paths of the shared-task dev files.

  Returns:
    DatasetDict with the shuffled "train", "dev" and "test" splits.
  """
  train_ds_list = []
  for fin in train_list:
    lang, sents = get_sentences(fin)
    train_ds_list.append(list_to_dataset(lang, sents))

  test_ds_list = []
  for fin2 in test_list:
    lang, sents = get_sentences(fin2)
    print(lang)
    test_ds_list.append(list_to_dataset(lang, sents, test=True))

  trains = concatenate_datasets([d["train"] for d in train_ds_list]).shuffle(seed=420)
  devs = concatenate_datasets([d["dev"] for d in train_ds_list]).shuffle(seed=420)
  tests = concatenate_datasets(test_ds_list).shuffle(seed=420)

  #spot check that the languages got mixed by the shuffle; the indices are
  #guarded so a small test set does not raise an IndexError
  print(*[tests[i]["lang"] for i in (0, 2000) if i < len(tests)])
  print(trains, devs, tests)

  return DatasetDict({
      "train": trains,
      "dev": devs,
      "test": tests
  })

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the word labels.

    `examples` is a batch with a "tokens" column (lists of words) and a
    "labels" column (one numerical label per word). Only the first sub-token
    of each word receives the word's label; special tokens and the remaining
    sub-tokens are set to -100, the value that is filtered out again when
    predictions are post-processed.

    Returns the tokenizer output with a "labels" entry added.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for row, word_labels in enumerate(examples["labels"]):
        # word index of every sub-token in this row (None for special tokens)
        word_ids = encoded.word_ids(batch_index=row)
        row_labels = []
        last_word = None
        for word_idx in word_ids:
            if word_idx is None or word_idx == last_word:
                # special token, or a later sub-token of the same word
                row_labels.append(-100)
            else:
                # first sub-token of a new word carries the label
                row_labels.append(word_labels[word_idx])
            last_word = word_idx
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [None]:
def create_files_from_dataset(dev_set):
  """
  create ref and hyp files for eval script
  this will write files to the colab directory, you can download them and then run eval.py
  each language is written to its own pair of files:
    /content/<lang>_output_hyp4t.tsv  (token <TAB> predicted label)
    /content/<lang>_output_ref4t.tsv  (token <TAB> gold label)
  sentences are separated by a blank line.

  Args:
    dev_set: tokenized dataset with "lang" and "tokens" columns; it is passed
      to trainer.predict, so it also needs the model inputs and labels.

  Returns:
    None
  """
  from contextlib import ExitStack

  predictions = trainer.predict(dev_set)
  true_predictions, true_labels = return_predictions(predictions)

  #files for these languages are always created, even when they stay empty
  known_langs = ("cs", "en", "de", "it", "sv")

  #ExitStack closes every opened file on exit, also when an error occurs
  with ExitStack() as stack:
    def open_out(lang, kind):
      #kind is "hyp" or "ref"
      path = f"/content/{lang}_output_{kind}4t.tsv"
      return stack.enter_context(open(path, "w", encoding="utf-8"))

    hypes = {lang: open_out(lang, "hyp") for lang in known_langs}
    refs = {lang: open_out(lang, "ref") for lang in known_langs}

    for i, example in enumerate(dev_set):
      lang = example["lang"]
      if lang not in hypes:
        #language outside the default list: open its files on first use
        hypes[lang] = open_out(lang, "hyp")
        refs[lang] = open_out(lang, "ref")
      hyp_file = hypes[lang]
      ref_file = refs[lang]

      #a sentence truncated by the tokenizer has fewer labels/predictions
      #than tokens; zip stops at the shortest sequence, so such a sentence is
      #written up to its last labelled token instead of raising IndexError
      for tok, p, l in zip(example["tokens"], true_predictions[i], true_labels[i]):
        ref_file.write(tok + "\t" + l + "\n")
        hyp_file.write(tok + "\t" + p + "\n")
      ref_file.write("\n")
      hyp_file.write("\n")


In [None]:
def compute_metrics(p, positive_id=0):
    """Token-level precision, recall, F1 and accuracy for the Trainer.

    seqeval scores chunks and expects IOB-style tags; the bare "c"/"i" labels
    used here do not form any chunks, which is why it reported 0.0
    precision/recall/F1. The scores are therefore computed per token with
    numpy, taking one label as the positive class.

    Args:
        p: (predictions, labels) pair; predictions are the logits with the
            label dimension on axis 2, labels are the label ids with -100 at
            the positions to ignore.
        positive_id: label id treated as the positive class; the default 0 is
            the id of "i" in label2id.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy" as floats. A
        score whose denominator is zero is reported as 0.0.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    labels = np.asarray(labels)

    # drop special tokens and non-initial sub-tokens
    keep = labels != -100
    pred = predictions[keep]
    gold = labels[keep]

    pred_pos = pred == positive_id
    gold_pos = gold == positive_id
    true_pos = int(np.sum(pred_pos & gold_pos))
    n_pred_pos = int(np.sum(pred_pos))
    n_gold_pos = int(np.sum(gold_pos))

    precision = true_pos / n_pred_pos if n_pred_pos else 0.0
    recall = true_pos / n_gold_pos if n_gold_pos else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    accuracy = float(np.mean(pred == gold)) if gold.size else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

In [None]:
def return_predictions(p):
    """
    Convert the output of a prediction run into string labels.

    `p` unpacks into (logits, label ids, metrics). For every sentence the
    positions whose gold label is -100 are dropped, and the remaining
    predicted and gold ids are mapped to strings through label_list.
    Used for writing the hyp and ref files for testing.

    Returns (true_predictions, true_labels), two parallel lists of lists.
    """
    logits, gold_ids, _metrics = p
    pred_ids = np.argmax(logits, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # keep only the positions that carry a real label
        kept = [(pr, go) for pr, go in zip(pred_row, gold_row) if go != -100]
        true_predictions.append([label_list[pr] for pr, _go in kept])
        true_labels.append([label_list[go] for _pr, go in kept])

    return true_predictions, true_labels

Now that the functions are done, here is the actual experiment

In [None]:
#position in this list == numerical label id, consistent with id2label (0 -> "i", 1 -> "c")
label_list = ["i","c"]

In [None]:
#the tokenizer is loaded from the same checkpoint name as the model further down
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base") #change to "bert-base-multilingual-cased" if you want to try mBERT

In [None]:
#batches are padded on the fly by the collator instead of padding the whole dataset up front
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) #used for dynamic padding of batches

In [None]:
#NOTE: this rebinds the name `seqeval` (imported as a module at the top) to the
#metric object loaded through `evaluate`; compute_metrics uses this object
seqeval = evaluate.load("seqeval")

In [None]:
#here we actually load in the files into a huggingface dataset, and then tokenize the dataset
#(despite its name, train_ds is a DatasetDict holding the "train", "dev" and "test" splits)
train_ds = combine_datasets_multiling(train_file_list, test_file_list)
#batched=True hands tokenize_and_align_labels a batch of examples per call
tokenized_dataset = train_ds.map(tokenize_and_align_labels, batched=True)

In [None]:
#sanity check: language tag of the first example of the shuffled test split
tokenized_dataset["test"][0]['lang'] #just for taking a look

In [None]:
#binary token classification: two labels, with the id <-> label tables defined above handed to the model
model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base", num_labels=2, id2label=id2label, label2id=label2id
) #change to "bert-base-multilingual-cased" if you want to try mBERT

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-st

In [None]:
#training args for the trainer, modify hyperparams if desired
#evaluation and checkpointing both happen once per epoch; load_best_model_at_end
#needs the two strategies to match so that every evaluation has a checkpoint
training_args = TrainingArguments(
    output_dir="test_model",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.02,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False
) 

In [None]:
#instantiating the trainer with tokenizer, data collator, and train/dev datasets from above
#the "dev" split is used for the per-epoch evaluation; the "test" split is only
#used afterwards, when the hyp and ref files are written
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
#actually runs the training. Will take several hours depending on GPU usage
#because load_best_model_at_end=True, the best checkpoint (judged on validation
#loss, the Trainer default) is loaded back into the model when training ends
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.2391,0.248782,0.0,0.0,0.0,0.91122
2,0.2172,0.228125,0.0,0.0,0.0,0.918907
3,0.1866,0.230955,0.0,0.0,0.0,0.920423
4,0.1602,0.246775,0.0,0.0,0.0,0.921152
5,0.1419,0.269524,0.0,0.0,0.0,0.921777
6,0.1261,0.277462,0.0,0.0,0.0,0.921916
7,0.1126,0.299732,0.0,0.0,0.0,0.9218
8,0.1013,0.302014,0.0,0.0,0.0,0.922118
9,0.0941,0.320503,0.0,0.0,0.0,0.922207
10,0.0884,0.336628,0.0,0.0,0.0,0.922153


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


TrainOutput(global_step=90520, training_loss=0.1493178774548799, metrics={'train_runtime': 6616.3213, 'train_samples_per_second': 109.441, 'train_steps_per_second': 13.681, 'total_flos': 1.5678243280365624e+16, 'train_loss': 0.1493178774548799, 'epoch': 10.0})

In [None]:
#runs prediction on the test split and writes one hyp and one ref .tsv file per language under /content
create_files_from_dataset(tokenized_dataset["test"]) #creates the hyp and ref files for the test_dataset