In [None]:
# Install the Hugging Face libraries imported below, plus git-lfs.
!pip install datasets transformers
!apt install git-lfs


In [None]:
from google.colab import drive
# Mount Google Drive; the checkpoint paths used later in this notebook live under /content/gdrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# **Utility Functions**

In [None]:
from huggingface_hub import notebook_login
from datasets import load_dataset, load_metric
import datasets as ds
from pynvml import *
import numpy as np
from transformers import AutoTokenizer

def print_gpu_utilization():
    """Print the memory currently in use on GPU index 0, in whole MB, as reported by NVML."""
    nvmlInit()
    gpu0 = nvmlDeviceGetHandleByIndex(0)
    used_bytes = nvmlDeviceGetMemoryInfo(gpu0).used
    # Floor-divide bytes down to MB (1024**2 bytes each).
    used_mb = used_bytes // 1024**2
    print(f"GPU memory occupied: {used_mb} MB.")

def print_summary(result):
    """Print the runtime and throughput of a training run, then the GPU memory in use.

    Args:
        result: object whose ``metrics`` mapping holds ``train_runtime`` and
            ``train_samples_per_second``.
    """
    stats = result.metrics
    report_lines = (
        ("Time", "train_runtime"),
        ("Samples/second", "train_samples_per_second"),
    )
    for label, key in report_lines:
        print(f"{label}: {stats[key]:.2f}")
    print_gpu_utilization()

def compute_metrics(eval_pred):
    """Score one evaluation pass with the globally configured ``metric``.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` is treated as a
            2-D array with one row per example and one column per class
            (assumed to be raw logits from the classifier head; TODO confirm).

    Returns:
        The dict returned by ``metric.compute``.

    Raises:
        ValueError: if ``metric_name`` is neither ``"accuracy"`` nor ``"roc_auc"``
            (previously this case silently returned ``None``).
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)

    if metric_name == "accuracy":
        return metric.compute(predictions=np.argmax(logits, axis=1), references=labels)

    if metric_name == "roc_auc":
        # ROC AUC ranks examples by score, so it needs per-class probabilities;
        # passing arg-maxed class ids (as before) discards the ranking.
        shifted = logits.astype(np.float64)
        shifted -= shifted.max(axis=1, keepdims=True)  # numerically stable softmax
        probs = np.exp(shifted)
        probs /= probs.sum(axis=1, keepdims=True)
        if probs.shape[1] == 2:
            # Binary case: score is the probability of the positive class.
            return metric.compute(prediction_scores=probs[:, 1], references=labels)
        # NOTE(review): the multi-class path expects the metric to have been loaded
        # with its "multiclass" configuration, e.g. load_metric("roc_auc", "multiclass").
        return metric.compute(prediction_scores=probs, references=labels, multi_class="ovr")

    raise ValueError(f"Unsupported metric_name: {metric_name!r}")

def eval_language(trainer,language):
    """Evaluate ``trainer`` on the test split of one language.

    The split ``"test.<language>"`` is taken from the global ``datasets``, tokenized
    with ``preprocess_function``, and its ``news_category`` column is renamed to
    ``labels`` before being passed to ``trainer.evaluate``.

    Returns:
        ``eval_accuracy`` or ``eval_roc_auc`` from the evaluation output, chosen by
        the global ``metric_name``; ``None`` for any other ``metric_name``.
    """
    test_split = datasets["test." + language]
    encoded = test_split.map(preprocess_function, batched=True)
    encoded = encoded.rename_column("news_category", "labels")
    scores = trainer.evaluate(encoded)
    # Map the configured metric to the key under which the evaluation reports it.
    result_key = {"accuracy": "eval_accuracy", "roc_auc": "eval_roc_auc"}.get(metric_name)
    return None if result_key is None else scores[result_key]


def preprocess_function(examples):
    """Tokenize a batch as (title, body) pairs using the global ``tokenizer``.

    Args:
        examples: mapping with ``news_title`` and ``news_body`` entries.

    Returns:
        Whatever ``tokenizer`` returns when called with padding and truncation enabled.
    """
    titles = examples["news_title"]
    bodies = examples["news_body"]
    return tokenizer(titles, bodies, padding=True, truncation=True)

# **Training Parameters for News Classification Task**

In [None]:
# Starting checkpoint for fine-tuning (path suggests an MNLI intermediate model; confirm).
model_checkpoint = "/content/gdrive/MyDrive/thesis/baseline_intermediate/mnli"
# Metric used by compute_metrics / eval_language; they handle "accuracy" and "roc_auc".
metric_name = "accuracy"
metric = load_metric(metric_name)
batch_size = 32
num_labels = 10
# XGLUE "nc" provides validation/test splits for these five languages only;
# the previous list (ja/ko/zh) named splits that do not exist in this dataset.
languages = ['en', 'de', 'es', 'fr', 'ru']
datasets = load_dataset("xglue","nc")

Downloading and preparing dataset x_glue/nc (download: 835.33 MiB, generated: 583.60 MiB, post-processed: Unknown size, total: 1.39 GiB) to /root/.cache/huggingface/datasets/x_glue/nc/1.0.0/8566eedecd9ab28e01c051c023dadf97bf408e5195f76b06aba70ebd4697ae08...


Downloading data:   0%|          | 0.00/876M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Generating validation.en split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation.de split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation.es split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation.fr split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation.ru split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test.en split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test.de split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test.es split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test.fr split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test.ru split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset x_glue downloaded and prepared to /root/.cache/huggingface/datasets/x_glue/nc/1.0.0/8566eedecd9ab28e01c051c023dadf97bf408e5195f76b06aba70ebd4697ae08. Subsequent calls will reuse this data.


  0%|          | 0/11 [00:00<?, ?it/s]

# **Training Configuration**

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer,AutoModelForSequenceClassification

# Load the checkpoint with a classification head sized for this task.
# ignore_mismatched_sizes=True lets a checkpoint head of a different size be
# re-initialised instead of raising on the shape mismatch.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
).to("cuda")

# Last path component of the checkpoint names the output directory.
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-NC",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    # Effective train batch per device = batch_size * 8 accumulated steps.
    gradient_accumulation_steps=8,
    weight_decay=0.01,
    # Follow the configured metric rather than hard-coding "accuracy", so
    # best-model selection still works when metric_name is changed.
    metric_for_best_model=metric_name,
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=True,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /content/gdrive/MyDrive/thesis/baseline_intermediate/mnli and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([10]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Build the tokenizer and the tokenized splits here so this cell runs on a fresh
# kernel: no visible cell above defines `tokenizer` or `tokenized_datasets`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Only the two splits used for training are tokenized. As in eval_language,
# the news_category column is renamed to "labels" for the Trainer.
tokenized_datasets = {
    split: datasets[split]
    .map(preprocess_function, batched=True)
    .rename_column("news_category", "labels")
    for split in ("train", "validation.en")
}

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation.en"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
# Save the trained model under ./nc-train-en.
trainer.save_model("nc-train-en")

Using amp half precision backend
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: news_title, news_body. If news_title, news_body are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 100000
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 8
  Total optimization steps = 1170


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.316425,0.894
1,0.651800,0.275076,0.9096
2,0.401700,0.251103,0.9192


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: news_title, news_body. If news_title, news_body are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 32
Saving model checkpoint to mnli-finetuned-NC/checkpoint-390
Configuration saved in mnli-finetuned-NC/checkpoint-390/config.json
Model weights saved in mnli-finetuned-NC/checkpoint-390/pytorch_model.bin
tokenizer config file saved in mnli-finetuned-NC/checkpoint-390/tokenizer_config.json
Special tokens file saved in mnli-finetuned-NC/checkpoint-390/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: news_title, news_body. If news_title, news_body are not expected by `DistilBertForSequenceClassifi

In [None]:
# Copy the saved model directory onto the mounted Google Drive.
!cp -r '/content/nc-train-en' '/content/gdrive/MyDrive/thesis/trained_intermediate/mnli_nc'

# **Evaluation of the model**

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer,AutoModelForSequenceClassification,AutoTokenizer

model_checkpoints = ["/content/gdrive/MyDrive/thesis/baseline_downstream/ncen"
                    ]
languages = ['en', 'de', 'es', 'fr', 'ru']

# results[checkpoint][language] -> score returned by eval_language on that test split.
results = {}
for model_checkpoint in model_checkpoints:
  results[model_checkpoint] = {}
  model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
  # Rebinds the global `tokenizer`, which preprocess_function reads during evaluation.
  tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
  trainer = Trainer(
      model,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics
  )
  for language in languages:
    result = eval_language(trainer,language)
    results[model_checkpoint][language] = result

# Show the collected scores as the cell output (the cell previously ended with a stray `6`).
results


loading configuration file /content/gdrive/MyDrive/thesis/baseline_downstream/ncen/config.json
Model config DistilBertConfig {
  "_name_or_path": "/content/gdrive/MyDrive/thesis/baseline_downstream/ncen",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8,
    "LABEL_9": 9
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_labe

  0%|          | 0/10 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: news_title, news_body. If news_title, news_body are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8


  0%|          | 0/10 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: news_title, news_body. If news_title, news_body are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8


  0%|          | 0/10 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: news_title, news_body. If news_title, news_body are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8


  0%|          | 0/10 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: news_title, news_body. If news_title, news_body are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8


  0%|          | 0/10 [00:00<?, ?ba/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: news_title, news_body. If news_title, news_body are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 8


6