In [5]:
from google.colab import drive 
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [6]:
#step 1: download everything
!pip install transformers==4.28.0
!pip install seqeval
!pip install -q datasets
!pip install -q evaluate


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
#step 1.1 import everything
import seqeval
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
import json
import glob
import numpy as np
from datasets import load_dataset, Dataset, concatenate_datasets, DatasetDict
import evaluate

In [8]:
language = "en"

In [9]:
import os 
BASE_DIR = "/content/drive/MyDrive/NLP/multiged-2023-main/"
if language=='en':
  trainData_en_path = os.path.join(BASE_DIR,"english/en_fce_train.tsv")
  devData_en_path = os.path.join(BASE_DIR,"english/en_fce_dev.tsv")
  testData_en_path = os.path.join(BASE_DIR,"english/en_fce_test_unlabelled.tsv")
elif language =="it":
  trainData_en_path = os.path.join(BASE_DIR,"italian/it_merlin_train.tsv")
  devData_en_path = os.path.join(BASE_DIR,"italian/it_merlin_dev.tsv")
  testData_en_path = os.path.join(BASE_DIR,"italian/it_merlin_test_unlabelled.tsv")


In [10]:
#step 1.2 is to upload the files
#I cloned the entire repo and manually uploaded to Colab
#then we open the files up
#I don't use the fce german files bc they aren't binary, I only use falko for german

In [11]:
#step 1.3 use glob to gather all file names
#We are using the dev data as 
train_file_list = glob.glob(trainData_en_path)
test_file_list = glob.glob(devData_en_path)
print("TRAIN FILES:", train_file_list)
print("TEST FILES:", test_file_list)

TRAIN FILES: ['/content/drive/MyDrive/NLP/multiged-2023-main/english/en_fce_train.tsv']
TEST FILES: ['/content/drive/MyDrive/NLP/multiged-2023-main/english/en_fce_dev.tsv']


In [12]:
#These dictionaries show the correspondence between the numerical label and the real label
label2id = {"c": 1, "i": 0}
id2label = {1: "c", 0: "i"}

Transformer models make contextual embeddings based on sentence input. Because of this, we need to split the data into sentences, where each token in a sentence has a label attached. This is easy to do because the files we are working with have newlines in between sentences.

In [27]:
def get_sentences(fin, language):
  """returns a list of sentences given a file"""

  sentences = []
  sentence = []
  with open(fin, "r") as inf:
    lang = language
    for line in inf:
      if line == "\n":
        #if we reach a newline, append the sentence to list and start over
        sentences.append(sentence)
        sentence = []
      else:
        #if we aren't at a newline, split into token and label and add to sent
        line = line.strip("\n")
        token, label = line.split("\t")
        token_label = (token, label)
        sentence.append(token_label)
        
  return lang, sentences


For a multilingual experiment, we are going to combine all the languages together. I'm not actually using this function for this implementation (I just tested on Italian)

In [14]:
def list_to_dataset(lang, sent_list, test=False):
  """This function takes a list of lines with each token paired with its label in a tuple
  It returns a transformers dataset
  """
  data_dicts = []
  #we need an id value for a huggingface dataset
  id = 0
  for sent in sent_list:
    sent_dict = {}
    sent_dict['id'] = id
    sent_dict['lang'] = lang
    id += 1

    tokens = [s[0] for s in sent]
    labels = [s[1] for s in sent]

    #convert "c" and "i" labels to numerical ones
    nums = [label2id[t] for t in labels]
    sent_dict["tokens"] = tokens
    sent_dict["labels"] = nums
    #append sentence dict to list
    data_dicts.append(sent_dict)

  #convert list of dictionaries into Dataset
  combined_dataset = Dataset.from_list(data_dicts)

  #this part is broken for test I think
  if not test:
    dataset = combined_dataset.train_test_split(test_size=0.2, shuffle=True, seed=34)
    dataset["dev"] = dataset["test"]
    del dataset["test"]

  return dataset

In [15]:
def combine_datasets(train_list, test_list):
    """
    This function reads in each file as a dataset individually with the language recorded.
    Then it concatenates all the datasets. The dev set for the dataset is a random 20% split from the training set.
    """

    train_ds_list = []
    dev_ds_list =[]
    for fin in train_list:
        lang, sents = get_sentences(fin, language)
        train_ds = list_to_dataset(lang, sents)
        train_ds_list.append(train_ds)

    for fin in test_list:
        lang, sents = get_sentences(fin, language)
        dev_ds = list_to_dataset(lang, sents)
        dev_ds_list.append(dev_ds)    

    trains = concatenate_datasets([d["train"] for d in train_ds_list]).shuffle(seed=420)
    devs = concatenate_datasets([d["dev"] for d in train_ds_list]).shuffle(seed=420)

    full_train_ds = {
        "train": trains,
        "dev": devs
    }

    return DatasetDict(full_train_ds)

In [16]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [34]:
def create_files_from_dataset(dev_set):
  """
  create ref and hyp files for eval script
  this will write files to the colab directory, you can download them and then run eval.py
  this takes the language and writes each language to a diff file
  """
  predictions = trainer.predict(dev_set)
  true_predictions, true_labels = return_predictions(predictions)

  lang = dev_set[0]["lang"]
  hyp_file_path = os.path.join(BASE_DIR, "outputs", f"{lang}_output_hyp.tsv")
  ref_file_path = os.path.join(BASE_DIR, "outputs", f"{lang}_output_ref.tsv")
  hyp_file = open(hyp_file_path, "w", encoding="utf-8")
  ref_file = open(ref_file_path, "w", encoding="utf-8")

  for i in range(len(dev_set)):
    tokens = dev_set[i]["tokens"]
    labs = true_labels[i]
    preds = true_predictions[i]

    for j in range(len(tokens)):
      tok = tokens[j]
      p = preds[j]
      l = labs[j]
      ref_file.write(tok + "\t" + l + "\n")
      hyp_file.write(tok + "\t" + p + "\n")

    ref_file.write("\n")
    hyp_file.write("\n")

  hyp_file.close()
  ref_file.close()

  return


In [18]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [19]:
def return_predictions(p):
    predictions, labels, metrics = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    return true_predictions, true_labels

Now that the functions are done, here is the actual experiment

In [20]:
label_list = ["i","c"]

In [21]:
if language=="en":
  tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") #replace with mono
else: 
  tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-italian-xxl-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [22]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [23]:
seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [28]:
dataset = combine_datasets(train_file_list, test_file_list)
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/22685 [00:00<?, ? examples/s]

Map:   0%|          | 0/5672 [00:00<?, ? examples/s]

In [29]:
if language=='en':
  model = AutoModelForTokenClassification.from_pretrained(
      "bert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)
if language=="it":
  model = AutoModelForTokenClassification.from_pretrained(
    "dbmdz/bert-base-italian-uncased", num_labels=2, id2label=id2label, label2id=label2id)


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [30]:
training_args = TrainingArguments(
    output_dir="test_model",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.02,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False
)

In [31]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["dev"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [32]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.224,0.212167,0.0,0.0,0.0,0.924567
2,0.1885,0.213995,0.0,0.0,0.0,0.927468
3,0.163,0.216898,0.0,0.0,0.0,0.927424
4,0.1402,0.231513,0.0,0.0,0.0,0.927402
5,0.1201,0.252804,0.0,0.0,0.0,0.926995
6,0.1039,0.259608,0.0,0.0,0.0,0.925545
7,0.0902,0.279235,0.0,0.0,0.0,0.925633
8,0.0809,0.299698,0.0,0.0,0.0,0.926204
9,0.0748,0.317552,0.0,0.0,0.0,0.926182
10,0.0704,0.322763,0.0,0.0,0.0,0.926281


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / 

TrainOutput(global_step=14180, training_loss=0.1255946343641187, metrics={'train_runtime': 926.055, 'train_samples_per_second': 244.964, 'train_steps_per_second': 15.312, 'total_flos': 5029841615348964.0, 'train_loss': 0.1255946343641187, 'epoch': 10.0})

In [35]:
create_files_from_dataset(tokenized_dataset["dev"])