In [None]:
!pip install datasets transformers
!apt install git-lfs


Collecting datasets
  Downloading datasets-2.2.0-py3-none-any.whl (342 kB)
[K     |████████████████████████████████| 342 kB 12.5 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 46.4 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 51.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.9 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 41.6 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64

In [None]:
from google.colab import drive

# Directory inside the Colab VM under which Google Drive is exposed; later
# cells build their /content/gdrive/... paths on top of it.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


# **Utility Functions**

In [None]:
from huggingface_hub import notebook_login
from datasets import load_dataset, load_metric
import datasets as ds
from pynvml import *
import numpy as np
from transformers import AutoTokenizer
    
def print_gpu_utilization(device_index=0):
    """Print how much memory is currently in use on one NVIDIA GPU.

    The value is the ``used`` field returned by ``nvmlDeviceGetMemoryInfo``,
    floor-divided by 1024**2 so it is reported in whole MB.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, which is
            the device this function queried before the parameter existed.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Pair every nvmlInit() with an nvmlShutdown(), even when the query
        # fails, so repeated calls from the notebook do not accumulate
        # outstanding NVML initialisations.
        nvmlShutdown()

def print_summary(result):
    """Report training speed figures, then the current GPU memory usage.

    Args:
        result: Object exposing a ``metrics`` mapping that contains the keys
            ``train_runtime`` and ``train_samples_per_second``.
    """
    stats = result.metrics
    runtime = stats['train_runtime']
    print(f"Time: {runtime:.2f}")
    throughput = stats['train_samples_per_second']
    print(f"Samples/second: {throughput:.2f}")
    # Finish with the memory reading so speed and footprint appear together.
    print_gpu_utilization()

def preprocess_function(examples):
    """Tokenize NLI examples as (hypothesis, premise) text pairs.

    Relies on the module-level ``tokenizer``. Truncation is enabled; no
    padding is requested here.

    Args:
        examples: Mapping with ``"hypothesis"`` and ``"premise"`` entries
            (column batches when called through ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for the paired inputs.
    """
    first_segment = examples["hypothesis"]
    second_segment = examples["premise"]
    # The hypothesis is deliberately passed first and the premise second;
    # anything that feeds this model later must use the same order.
    return tokenizer(first_segment, second_segment, truncation=True)

def compute_metrics(eval_pred):
    """Score model outputs with the module-level ``metric``.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predictions are
            reduced with ``argmax`` along axis 1 to obtain one class index
            per row before scoring.

    Returns:
        The result of ``metric.compute`` for the predicted classes against
        the reference labels.
    """
    scores_per_class, references = eval_pred
    predicted_classes = np.argmax(scores_per_class, axis=1)
    result = metric.compute(predictions=predicted_classes, references=references)
    return result

In [None]:
# Print the GPU memory currently in use (queried through NVML by
# print_gpu_utilization). This cell sits ahead of the model-loading cell, so
# the reading acts as a baseline for the session.
print_gpu_utilization()

GPU memory occupied: 8104 MB.


# **Parameters**

In [None]:
# Checkpoint to fine-tune. A Drive-hosted checkpoint
# ("/content/gdrive/MyDrive/train_squad_en") was tried as an alternative and is
# currently disabled.
model_checkpoint = "distilbert-base-multilingual-cased"

# Per-device batch size and number of target classes for the classifier head.
batch_size = 64
num_labels = 3

# Language codes left over from the disabled XNLI experiments ('ja' and 'ko'
# were dropped from the earlier list). No active code in the visible cells
# reads this list.
categories = ['en', 'de', 'es', 'fr', 'zh']

metric = load_metric("accuracy")

# Active data source: slices of MultiNLI. Passing a list of split specs yields
# the splits in the same order, which is why later cells index the result
# positionally (datasets[0] = train slice, datasets[1] = validation slice).
# Disabled alternatives: loading the XNLI "train"/"validation" splits for each
# language in `categories` and concatenating them with ds.concatenate_datasets
# into a ds.DatasetDict, or taking 50% slices of English XNLI.
nli_splits = ['train[:15%]', 'validation_matched[:20%]']
datasets = load_dataset('multi_nli', split=nli_splits)

# The tokenizer must match the checkpoint being fine-tuned. Tokenization of
# the splits themselves happens in a separate cell.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset multi_nli/default (download: 216.34 MiB, generated: 410.92 MiB, post-processed: Unknown size, total: 627.27 MiB) to /root/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39...


Downloading data:   0%|          | 0.00/227M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Dataset multi_nli downloaded and prepared to /root/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

In [None]:
# Tokenize both splits with identical settings. Position 0 of `datasets` is
# the training slice and position 1 the validation slice.
train, val = (
    datasets[position].map(preprocess_function, batched=True)
    for position in (0, 1)
)

  0%|          | 0/59 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
def _has_label(example):
    """Return True when the example's ``label`` is anything other than -1."""
    return example['label'] != -1


# Drop rows whose label is -1 from both splits. Despite its plural name,
# `tokenized_datasets` holds only the training split.
tokenized_datasets = train.filter(_has_label)
tokenized_validation = val.filter(_has_label)

  0%|          | 0/59 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

# **Model and Training Configuration**

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer,AutoModelForSequenceClassification

# Load the checkpoint with a sequence-classification head sized for
# `num_labels` classes and place it on the GPU.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels).to("cuda")

# The last path component of the checkpoint id names the output directory.
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-XNLI",
    # Evaluate and checkpoint once per epoch; the two strategies are kept in
    # step because the best checkpoint is reloaded at the end of training.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Log the training loss once per epoch as well. The recorded run showed
    # "No log" for training loss on every epoch: the whole run is only 345
    # optimization steps, so step-based logging at the library's default
    # interval (500 steps per the TrainingArguments docs) never fired.
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    # Each optimizer step accumulates 8 batches of `batch_size` examples
    # (64 * 8 = 512 with the current settings).
    gradient_accumulation_steps=8,
    weight_decay=0.01,
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    push_to_hub=False,
    fp16=True,
)

Downloading:   0%|          | 0.00/517M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.bias', 'pre_class

In [None]:
# Bind the model, its training arguments, both tokenized splits and the
# metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_validation,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Run fine-tuning, then write the resulting model to the relative directory
# "xnli-train-en" (a later cell copies that directory to Drive).
trainer.train()
trainer.save_model("xnli-train-en")

Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: genre, premise, pairID, premise_binary_parse, promptID, hypothesis_binary_parse, hypothesis_parse, hypothesis, premise_parse. If genre, premise, pairID, premise_binary_parse, promptID, hypothesis_binary_parse, hypothesis_parse, hypothesis, premise_parse are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 58905
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 8
  Total optimization steps = 345


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.812109,0.636271
1,No log,0.761371,0.660214
2,No log,0.753888,0.671931


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: genre, premise, pairID, premise_binary_parse, promptID, hypothesis_binary_parse, hypothesis_parse, hypothesis, premise_parse. If genre, premise, pairID, premise_binary_parse, promptID, hypothesis_binary_parse, hypothesis_parse, hypothesis, premise_parse are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1963
  Batch size = 64
Saving model checkpoint to distilbert-base-multilingual-cased-finetuned-XNLI/checkpoint-115
Configuration saved in distilbert-base-multilingual-cased-finetuned-XNLI/checkpoint-115/config.json
Model weights saved in distilbert-base-multilingual-cased-finetuned-XNLI/checkpoint-115/pytorch_model.bin
tokenizer config file saved in distilbert-base-multilingual-cased-finetuned-XNLI/checkpoint-115/tokenizer_config.json
Sp

In [None]:
 # Recursively copy the saved model directory from the Colab VM to the mounted
 # Google Drive so it is still available after the runtime is recycled.
 !cp -r '/content/xnli-train-en' '/content/gdrive/MyDrive/mnli'