In [None]:
# Install the Python packages this notebook relies on: `datasets` and
# `transformers` are imported below, and `seqeval` backs the metric loaded
# by name in the parameters cell.
!pip install datasets transformers seqeval
# Install git-lfs through apt (shell command executed from the notebook).
!apt install git-lfs


Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.3.4-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 42 not upgraded.


In [None]:
# Mount Google Drive at /content/gdrive; the checkpoint paths used later in
# this notebook all live under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# **Utility Functions**

In [None]:
from huggingface_hub import notebook_login
from datasets import load_dataset, load_metric
import datasets as ds
from pynvml import *
import numpy as np
from transformers import AutoTokenizer

def print_gpu_utilization(device_index=0):
    """Print how much memory is currently in use on one GPU, in MB.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, which
            is the device the original implementation always inspected.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
    finally:
        # Pair every nvmlInit() with an nvmlShutdown() so repeated calls from
        # notebook cells do not leave the NVML session open, even when the
        # device query raises.
        nvmlShutdown()
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

def print_summary(result):
    """Print runtime, throughput and GPU memory usage for a finished run.

    Args:
        result: object exposing a ``metrics`` mapping that contains the keys
            ``train_runtime`` and ``train_samples_per_second``.
    """
    stats = result.metrics
    print(f"Time: {stats['train_runtime']:.2f}")
    print(f"Samples/second: {stats['train_samples_per_second']:.2f}")
    # Finish with the current GPU memory footprint.
    print_gpu_utilization()

def compute_metrics(eval_pred):
    """Score token-level predictions with the module-level ``metric``.

    Args:
        eval_pred: pair ``(predictions, labels)``. The predictions are
            arg-maxed over axis 2 to obtain one class id per position.

    Returns:
        dict with the keys ``precision``, ``recall``, ``f1`` and
        ``accuracy``, taken from the ``overall_*`` entries of the metric
        result.
    """
    scores, gold = eval_pred
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        # Positions labelled -100 are excluded from scoring.
        kept = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        true_predictions.append([p for p, _ in kept])
        true_labels.append([g for _, g in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

def eval_language(trainer,language):
    """Evaluate ``trainer`` on the test split of one language.

    The split is looked up as ``"test." + language`` in the module-level
    ``datasets`` mapping and tokenized with ``preprocess_function`` before
    being handed to ``trainer.evaluate``.

    Args:
        trainer: object whose ``evaluate`` method accepts the tokenized split.
        language: language code appended to ``"test."`` to select the split.

    Returns:
        The ``eval_accuracy`` entry of the evaluation result.
    """
    split_name = "test." + language
    tokenized_split = datasets[split_name].map(preprocess_function, batched=True)
    metrics = trainer.evaluate(tokenized_split)
    print(metrics)
    return metrics["eval_accuracy"]


def preprocess_function(examples):
    """Tokenize pre-split words and align the word-level tags to tokens.

    Reads the module-level ``tokenizer`` and ``task_name``.

    Args:
        examples: batch mapping with a ``"words"`` entry (passed to the
            tokenizer with ``is_split_into_words=True``) and a ``task_name``
            entry holding one tag sequence per example.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For each
        example, the first token of every word carries that word's tag;
        special tokens and the remaining tokens of a word carry -100.
    """
    encoding = tokenizer(
        examples["words"],
        truncation=True,
        is_split_into_words=True,
        padding=True,
    )

    aligned_labels = []
    for row, word_tags in enumerate(examples[task_name]):
        current_ids = encoding.word_ids(batch_index=row)
        # Pair each token's word id with the one before it (None at the
        # start) so a change of word id marks the first token of a word.
        previous_ids = [None, *current_ids[:-1]]
        aligned_labels.append([
            word_tags[cur] if cur is not None and cur != prev else -100
            for cur, prev in zip(current_ids, previous_ids)
        ])

    encoding["labels"] = aligned_labels
    return encoding

# **Parameters for POS/NER training**

In [None]:
# Task and checkpoint selection. `task_name` is both the xglue config name
# and the column that holds the per-word tags.
#model_checkpoint = "/content/gdrive/MyDrive/intermediate_xquad_pawsen"
task_name = "pos"
model_checkpoint = "/content/gdrive/MyDrive/thesis/baseline_intermediate/xquad"
metric_name = "seqeval"
metric = load_metric(metric_name)
batch_size = 32
languages = ['en', 'de', 'es', 'fr', 'ja', 'ko', 'zh']
ner_tags = {'LOC':0, 'O':1, 'ORG':2, 'PER':3}
datasets = load_dataset("xglue",task_name)
#datasets = ds.load_from_disk("/content/gdrive/MyDrive/nerdataset")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# The training and evaluation cells below index `tokenized_datasets`, so it
# has to be built here for the notebook to run top to bottom on a fresh kernel.
tokenized_datasets = datasets.map(preprocess_function, batched=True)
# Label count = highest tag id seen in the training split + 1, read from the
# column named by `task_name` so it follows the selected task.
num_labels = max(tag for tags in datasets["train"][task_name] for tag in tags) + 1

Reusing dataset x_glue (/root/.cache/huggingface/datasets/x_glue/pos/1.0.0/8566eedecd9ab28e01c051c023dadf97bf408e5195f76b06aba70ebd4697ae08)


  0%|          | 0/35 [00:00<?, ?it/s]

Didn't find file /content/gdrive/MyDrive/thesis/baseline_intermediate/xquad/added_tokens.json. We won't load it.
loading file /content/gdrive/MyDrive/thesis/baseline_intermediate/xquad/vocab.txt
loading file /content/gdrive/MyDrive/thesis/baseline_intermediate/xquad/tokenizer.json
loading file None
loading file /content/gdrive/MyDrive/thesis/baseline_intermediate/xquad/special_tokens_map.json
loading file /content/gdrive/MyDrive/thesis/baseline_intermediate/xquad/tokenizer_config.json


# **Training Configuration**

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer,DataCollatorForTokenClassification


# Load the checkpoint with a token-classification head sized for
# `num_labels` and move it to the GPU.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
).to("cuda")

# The last path component of the checkpoint names the output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-POSNER",
    # Schedule and optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    gradient_accumulation_steps=8,
    fp16=True,
    # Batch sizes
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best accuracy.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    push_to_hub=False,
)

Some weights of the model checkpoint at /content/gdrive/MyDrive/thesis/baseline_intermediate/xquad were not used when initializing DistilBertForTokenClassification: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at /content/gdrive/MyDrive/thesis/baseline_intermediate/xquad and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

In [None]:
# Fine-tune on the training split, validating on the English validation split.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation.en"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
# Write the trained model to the local "pos-train-en" directory.
trainer.save_model("pos-train-en")

In [None]:
# Copy the locally saved "pos-train-en" model directory to Google Drive.
!cp -r '/content/pos-train-en' '/content/gdrive/MyDrive/thesis/trained_intermediate/xquad_pos'

In [None]:
# Rebuild the trainer around the current `model` and score it on the
# Chinese validation split.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation.en"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.evaluate(tokenized_datasets["validation.zh"])

# **Evaluation of the model**

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer, DataCollatorForTokenClassification

# Fine-tuned checkpoints to compare.
model_checkpoints = ["/content/gdrive/MyDrive/thesis/trained_intermediate/mnli_pos",
                     "/content/gdrive/MyDrive/thesis/trained_intermediate/ner_pos",
                     "/content/gdrive/MyDrive/thesis/trained_intermediate/xnli_pos",
                     "/content/gdrive/MyDrive/thesis/trained_intermediate/xquad_pos",
                     "/content/gdrive/MyDrive/thesis/baseline_downstream/posen"
                     ]
# Language codes whose "test.<code>" splits are evaluated.
languages =   ["en","de","es","nl","bg","el","fr","pl","tr","vi","zh","ur","hi","it","ar","ru","th"]

# results[checkpoint][language] -> value returned by eval_language
# (the "eval_accuracy" entry of the evaluation).
results = {}
for model_checkpoint in model_checkpoints:
    results[model_checkpoint] = {}
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
    # eval_language -> preprocess_function reads the module-level `tokenizer`,
    # so each checkpoint's own tokenizer is bound to that name before evaluating.
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    trainer = Trainer(
        model,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        # Use the same collator as the training cells so `labels` are padded
        # together with the inputs instead of relying on every eval batch
        # already having uniform length.
        data_collator=DataCollatorForTokenClassification(tokenizer),
    )
    for language in languages:
        result = eval_language(trainer,language)
        results[model_checkpoint][language] = result

