In [None]:
# Install the Hugging Face `datasets` and `transformers` packages into the runtime.
!pip install datasets transformers
# Install git-lfs through apt.
# NOTE(review): presumably only needed for pushing to the Hub; push_to_hub is False in the training arguments - confirm it is still required.
!apt install git-lfs


In [None]:
# Mount Google Drive at /content/gdrive; the model checkpoints used in this
# notebook are read from and copied to paths under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# **Utility Functions**

In [None]:
from huggingface_hub import notebook_login
from datasets import load_dataset, load_metric
import datasets as ds
from pynvml import *
import numpy as np
from transformers import AutoTokenizer

def print_gpu_utilization():
    """Print the amount of memory currently in use on GPU index 0.

    NVML is initialised, the memory info of device 0 is queried, and the
    `used` field is reported after integer division by 1024**2.
    """
    nvmlInit()
    memory = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0))
    used_mb = memory.used // 1024**2
    print(f"GPU memory occupied: {used_mb} MB.")

def print_summary(result):
    """Report runtime, throughput and GPU memory usage for a finished run.

    `result` must expose a `metrics` mapping that contains the keys
    'train_runtime' and 'train_samples_per_second'; both are printed with
    two decimals, followed by the output of `print_gpu_utilization()`.
    """
    stats = result.metrics
    runtime = stats['train_runtime']
    print(f"Time: {runtime:.2f}")
    throughput = stats['train_samples_per_second']
    print(f"Samples/second: {throughput:.2f}")
    print_gpu_utilization()

def compute_metrics(eval_pred):
    """Score model outputs with the globally configured `metric`.

    Args:
        eval_pred: a pair ``(logits, labels)``. ``logits`` is indexed as
            ``[example, class]``; ``labels`` holds the reference class ids.

    Returns:
        Whatever ``metric.compute`` returns for the selected metric.

    Raises:
        ValueError: if the global ``metric_name`` is neither "accuracy" nor
            "roc_auc" (previously this case silently returned ``None``).

    Notes:
        For "roc_auc" the score passed to the metric is the softmax
        probability of class index 1, not the arg-max label: ROC-AUC ranks
        examples by score, and feeding it hard 0/1 decisions collapses the
        curve to a single operating point. This assumes a binary task with
        class 1 as the positive class, matching ``num_labels = 2`` in the
        parameter cell.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits, dtype=np.float64)

    if metric_name == "accuracy":
        return metric.compute(predictions=np.argmax(logits, axis=1), references=labels)

    if metric_name == "roc_auc":
        # Numerically stable softmax: subtract the row maximum before exponentiating.
        shifted = logits - logits.max(axis=1, keepdims=True)
        probabilities = np.exp(shifted)
        probabilities /= probabilities.sum(axis=1, keepdims=True)
        return metric.compute(prediction_scores=probabilities[:, 1], references=labels)

    raise ValueError(
        f"Unsupported metric_name {metric_name!r}; expected 'accuracy' or 'roc_auc'."
    )

def eval_language(trainer, language):
    """Evaluate `trainer` on the PAWS-X test split of one language.

    The "test" split of the "paws-x" configuration named by `language` is
    loaded, tokenized in batches with `preprocess_function`, and passed to
    `trainer.evaluate`. The entry matching the global `metric_name`
    ("eval_accuracy" or "eval_roc_auc") is returned; for any other
    `metric_name` the function returns None.
    """
    test_split = load_dataset("paws-x", language, split="test")
    encoded_split = test_split.map(preprocess_function, batched=True)
    scores = trainer.evaluate(encoded_split)

    for name, key in (("accuracy", "eval_accuracy"), ("roc_auc", "eval_roc_auc")):
        if metric_name == name:
            return scores[key]
    return None


def preprocess_function(examples):
    """Tokenize a batch of sentence pairs with the global `tokenizer`.

    `examples` must provide the columns "sentence1" and "sentence2"; they are
    encoded as pairs with padding and truncation enabled.
    """
    first, second = examples["sentence1"], examples["sentence2"]
    return tokenizer(first, second, padding=True, truncation=True)

# **Parameters for PAWS training**

In [None]:
# Checkpoint to fine-tune: a model directory on the mounted Google Drive.
model_checkpoint = "/content/gdrive/MyDrive/mnli"
# Alternative starting checkpoint (disabled).
#model_checkpoint = "distilbert-base-multilingual-cased"
# Metric selector read as a global by compute_metrics and eval_language,
# which branch on the values "accuracy" and "roc_auc".
metric_name = "accuracy"
metric = load_metric(metric_name)
# Used as both the per-device train and eval batch size in the training arguments.
batch_size = 64
# Size of the classification head requested when the model is loaded.
num_labels = 2
# PAWS-X language configurations evaluated later in the notebook.
languages = ['en', 'de', 'es', 'fr', 'ja', 'ko', 'zh']
# Disabled experiment: concatenate the train splits of several languages.
# NOTE(review): `categories` is not defined anywhere in the visible notebook;
# these lines would need `languages` (or similar) before being re-enabled.
# datasets = [load_dataset("paws-x",lan)["train"] for lan in categories]
# ds3 = ds.concatenate_datasets(datasets)
# datasets = ds.DatasetDict({"train":ds3})
#valtest_ds = ds3.train_test_split(test_size=0.2)
# Disabled alternative: the "labeled_final" configuration of "paws".
#datasets = load_dataset("paws","labeled_final")
# Active data source: the "en" configuration of "paws-x".
datasets = load_dataset("paws-x","en")
# The tokenizer is loaded from the same checkpoint as the model and is read
# as a global by preprocess_function.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Tokenize every split in batches.
# NOTE(review): the recorded output of this cell ends with "ValueError: ignored";
# confirm the cell runs cleanly on a fresh kernel.
tokenized_datasets = datasets.map(preprocess_function,batched=True)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Reusing dataset pawsx (/root/.cache/huggingface/datasets/pawsx/en/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755)


  0%|          | 0/3 [00:00<?, ?it/s]

ValueError: ignored

# **Training Configuration**

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer,AutoModelForSequenceClassification

# Load the checkpoint with a `num_labels`-way classification head.
# `ignore_mismatched_sizes=True` lets loading proceed when the checkpoint's
# weights differ in shape from the requested model (presumably the head of
# the MNLI checkpoint - confirm); such weights are freshly initialised.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
).to("cuda")

# The last path component of the checkpoint names the output directory.
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-PAWS",
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for `load_best_model_at_end` to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    # Gradients are accumulated over 8 steps, so the effective train batch is
    # 8 * batch_size per device.
    gradient_accumulation_steps=8,
    weight_decay=0.01,
    # Select the best checkpoint by the metric that compute_metrics actually
    # reports. This was hard-coded to "accuracy", which breaks best-model
    # selection as soon as `metric_name` is switched to "roc_auc".
    metric_for_best_model=metric_name,
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=True,
)

loading configuration file /content/gdrive/MyDrive/mnli/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/gdrive/MyDrive/mnli",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "vocab_size": 119547
}

loading weights file /content/gdrive/MyDrive/mnli/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForSequenceClassification.

Some weights of DistilBertForSequenceClassification were not initialized from the model checkp

In [None]:
# Fine-tune on the tokenized "train" split and evaluate on the "validation"
# split, scoring each evaluation with compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Write the trained model to the directory "pawx-train-en".
trainer.save_model("pawx-train-en")

Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, id, sentence1. If sentence2, id, sentence1 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 49401
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 8
  Total optimization steps = 288


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.489619,0.771
1,No log,0.38231,0.832
2,No log,0.370069,0.8445


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, id, sentence1. If sentence2, id, sentence1 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to mnli-finetuned-PAWS/checkpoint-96
Configuration saved in mnli-finetuned-PAWS/checkpoint-96/config.json
Model weights saved in mnli-finetuned-PAWS/checkpoint-96/pytorch_model.bin
tokenizer config file saved in mnli-finetuned-PAWS/checkpoint-96/tokenizer_config.json
Special tokens file saved in mnli-finetuned-PAWS/checkpoint-96/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, id, sentence1. If sentence2, id, sentence1 are not expected by `DistilBertF

In [None]:
# Copy the saved model directory to Google Drive; the evaluation cell loads
# the model from this Drive path.
# NOTE(review): assumes the working directory during training was /content,
# so that save_model("pawx-train-en") wrote to /content/pawx-train-en - confirm.
!cp -r '/content/pawx-train-en' '/content/gdrive/MyDrive/thesis/trained_intermediate/mnlien_pawsen_pawsxen'

# **Evaluation of the model**

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer,AutoModelForSequenceClassification,AutoTokenizer

# Checkpoints to evaluate and the PAWS-X language configurations to score them on.
model_checkpoints = ['/content/gdrive/MyDrive/thesis/trained_intermediate/mnlien_pawsen_pawsxen']
languages = ['en', 'de', 'es', 'fr', 'ja', 'ko', 'zh']

# results[checkpoint][language] -> score returned by eval_language.
results = {}
for model_checkpoint in model_checkpoints:
    results[model_checkpoint] = {}
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    # Rebinds the global `tokenizer` that preprocess_function reads, so each
    # checkpoint's data is tokenized with that checkpoint's own tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    # No TrainingArguments are passed: this Trainer is only used for evaluate().
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    for language in languages:
        result = eval_language(trainer, language)
        results[model_checkpoint][language] = result



loading configuration file /content/gdrive/MyDrive/thesis/trained_intermediate/mnlien_pawsen_pawsxen/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/gdrive/MyDrive/thesis/trained_intermediate/mnlien_pawsen_pawsxen",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.19.2",
  "vocab_size": 119547
}

loading weights file /content/gdrive/MyDrive/thesis/trained_intermediate/mnlien_pawsen_pawsxen/pytorch_model.bin
All model checkpoint weights were used when initializi

Downloading and preparing dataset pawsx/de (download: 28.88 MiB, generated: 13.20 MiB, post-processed: Unknown size, total: 42.08 MiB) to /root/.cache/huggingface/datasets/pawsx/de/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755...


Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset pawsx downloaded and prepared to /root/.cache/huggingface/datasets/pawsx/de/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, id, sentence1. If sentence2, id, sentence1 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


Downloading and preparing dataset pawsx/es (download: 28.88 MiB, generated: 13.20 MiB, post-processed: Unknown size, total: 42.08 MiB) to /root/.cache/huggingface/datasets/pawsx/es/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755...


Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset pawsx downloaded and prepared to /root/.cache/huggingface/datasets/pawsx/es/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, id, sentence1. If sentence2, id, sentence1 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


Downloading and preparing dataset pawsx/fr (download: 28.88 MiB, generated: 13.70 MiB, post-processed: Unknown size, total: 42.58 MiB) to /root/.cache/huggingface/datasets/pawsx/fr/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755...


Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset pawsx downloaded and prepared to /root/.cache/huggingface/datasets/pawsx/fr/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, id, sentence1. If sentence2, id, sentence1 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


Downloading and preparing dataset pawsx/ja (download: 28.88 MiB, generated: 15.61 MiB, post-processed: Unknown size, total: 44.49 MiB) to /root/.cache/huggingface/datasets/pawsx/ja/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755...


Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset pawsx downloaded and prepared to /root/.cache/huggingface/datasets/pawsx/ja/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, id, sentence1. If sentence2, id, sentence1 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


Downloading and preparing dataset pawsx/ko (download: 28.88 MiB, generated: 14.35 MiB, post-processed: Unknown size, total: 43.23 MiB) to /root/.cache/huggingface/datasets/pawsx/ko/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755...


Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset pawsx downloaded and prepared to /root/.cache/huggingface/datasets/pawsx/ko/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, id, sentence1. If sentence2, id, sentence1 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


Downloading and preparing dataset pawsx/zh (download: 28.88 MiB, generated: 11.22 MiB, post-processed: Unknown size, total: 40.10 MiB) to /root/.cache/huggingface/datasets/pawsx/zh/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755...


Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset pawsx downloaded and prepared to /root/.cache/huggingface/datasets/pawsx/zh/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: sentence2, id, sentence1. If sentence2, id, sentence1 are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 8


# **Evaluation Scores**
# Accuracy
| Model / Language   | en       | de       | es       | fr       | ja       | ko       | zh       |
|--------------------|----------|----------|----------|----------|----------|----------|----------|
| Baseline Model     | 0.84     | 0.74     | 0.73     | 0.75     | 0.63     | 0.61     | 0.67     |
| Intermediate XQuAD | 0.88     | **0.78** | **0.81** | **0.81** | **0.69** | **0.68** | **0.74** |
| Intermediate XNLI  | **0.89** | 0.75     | 0.76     | 0.78     | 0.57     | 0.55     | 0.65     |

# AUC
| Model / Language   | en       | de       | es       | fr       | ja       | ko       | zh       |
|--------------------|----------|----------|----------|----------|----------|----------|----------|
| Baseline Model     | 0.84     | 0.72     | 0.73     | 0.75     | 0.59     | 0.57     | 0.65     |
| Intermediate XQuAD | 0.89     | **0.78** | **0.81** | **0.81** | **0.67** | **0.65** | **0.73** |
| Intermediate XNLI  | **0.89** | 0.74     | 0.77     | 0.78     | 0.58     | 0.56     | 0.65     |