# <b>Importing libraries:</b>

In [None]:
!pip install transformers[torch]
!pip install accelerate -U



In [None]:
!pip install transformers datasets



In [None]:
import numpy as np
import pandas as pd
# import seaborn as sn
# import matplotlib.pyplot as plt
import torch
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
# from sklearn.model_selection import train_test_split
from datasets import load_dataset

# <b>Dataset:</b>

In [None]:
# Task description:
# The Recognizing Textual Entailment (RTE) datasets come from a series of annual
# textual entailment challenges. We combine the data from RTE1 (Dagan et al.,
# 2006), RTE2 (Bar Haim et al., 2006), RTE3 (Giampiccolo et al., 2007), and RTE5
# (Bentivogli et al., 2009). Examples are constructed based on news and
# Wikipedia text. We convert all datasets to a two-class split, where for
# three-class datasets we collapse neutral and contradiction into not
# entailment, for consistency.

# Load the "rte" configuration of the "glue" dataset; the result is indexed by
# split name (e.g. raw_data['train']) in the cells that follow.
raw_data = load_dataset("glue", "rte")

# <b>Explore the Data:</b>

In [None]:
raw_data

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 2490
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 277
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3000
    })
})

In [None]:
raw_data['train'][0]

{'sentence1': 'No Weapons of Mass Destruction Found in Iraq Yet.',
 'sentence2': 'Weapons of Mass Destruction Found in Iraq.',
 'label': 1,
 'idx': 0}

In [None]:
raw_data['train'].features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['entailment', 'not_entailment'], id=None),
 'idx': Value(dtype='int32', id=None)}

# <b>Tokenizer:</b>

In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; reused later when the model is loaded.
checkpoint = 'distilbert-base-cased'

# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenizer_func(batch):
    """Tokenize a batch of sentence pairs.

    Args:
        batch: Mapping with 'sentence1' and 'sentence2' entries, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the paired sentences, with truncation enabled.
    """
    first_sentences = batch['sentence1']
    second_sentences = batch['sentence2']
    return tokenizer(first_sentences, second_sentences, truncation=True)


# Apply the tokenizer to every split of the dataset, one batch at a time.
tokenized_data = raw_data.map(tokenizer_func, batched=True)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/2490 [00:00<?, ? examples/s]

Map:   0%|          | 0/277 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# Tokenize only the first training pair. The row is indexed first
# (`[0]['sentence1']`) so a single example is read, rather than pulling the
# whole 'sentence1' / 'sentence2' columns just to take element 0.
tokenizer(
    raw_data['train'][0]['sentence1'],
    raw_data['train'][0]['sentence2'],
)

{'input_ids': [101, 1302, 20263, 1104, 8718, 14177, 17993, 17107, 1107, 5008, 6355, 119, 102, 20263, 1104, 8718, 14177, 17993, 17107, 1107, 5008, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Encoding of the FIRST training pair only (keys: input_ids, attention_mask);
# this is not the same object as `tokenized_data`, which holds whole splits.
# Computed explicitly instead of via IPython's `_` history variable, so the
# cell still works after Restart & Run All or out-of-order execution.
result = tokenizer(
    raw_data['train'][0]['sentence1'],
    raw_data['train'][0]['sentence2'],
)

In [None]:
tokenized_data.keys()

dict_keys(['train', 'validation', 'test'])

In [None]:
result.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
tokenizer.decode(result['input_ids'])

'[CLS] No Weapons of Mass Destruction Found in Iraq Yet. [SEP] Weapons of Mass Destruction Found in Iraq. [SEP]'

# <b>Load the model and model arguments:</b>

In [None]:
!ls

sample_data


In [None]:
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification

# Training configuration handed to the Trainer further down.
training_args = TrainingArguments(
    # Directory passed to the trainer as its output location.
    output_dir='my_trainer',
    # Evaluate and save on the same schedule: once per epoch.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — TODO confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=5,
    # Batch sizes are per device; evaluation uses a larger batch than training.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_steps=150
)

# Sequence-classification model built from the same `checkpoint` as the
# tokenizer. num_labels=2 matches the two RTE classes
# ('entailment', 'not_entailment').
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# <b>Model Summary:</b>

In [None]:
type(model)

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification

In [None]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
!pip install torchinfo

from torchinfo import summary
summary(model)



Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              22,268,928
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 65,783,042
Trainable params: 65,783,042
Non-trainable params: 0

# <b>Training:</b>

1. Metrics:

In [None]:
# NOTE(review): `load_metric` is reported as deprecated/removed in recent
# `datasets` releases — TODO confirm against the installed version.
from datasets import load_metric

# Metric object for the GLUE "rte" task. The `compute_metrics` function defined
# in this notebook does not use it; it computes accuracy and F1 directly.
metric = load_metric('glue', 'rte')

# Toy sanity check: 2 of the 3 predictions match the references.
metric.compute(predictions=[1, 0, 1], references=[1, 0, 0])

{'accuracy': 0.6666666666666666}

In [None]:
def compute_metrics(logits_and_labels):
    """Compute accuracy and F1 from model logits and reference labels.

    Args:
        logits_and_labels: A pair ``(logits, labels)``. The predicted class of
            each example is the argmax of ``logits`` over the last axis.

    Returns:
        dict with two entries: ``'accuracy'`` (fraction of predictions equal
        to the labels) and ``'f1'`` (``f1_score`` with its default settings).
    """
    logits, labels = logits_and_labels
    predicted_classes = np.argmax(logits, axis=-1)
    return {
        'accuracy': np.mean(predicted_classes == labels),
        'f1': f1_score(labels, predicted_classes),
    }

2. Trainer:

In [None]:
from transformers import Trainer

# Wire the model, training configuration, tokenized splits, tokenizer and
# metric function together. Evaluation runs on the 'validation' split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6946,0.696124,0.465704,0.559524
2,0.6455,0.705836,0.566787,0.53125
3,0.3958,0.981679,0.566787,0.478261
4,0.1696,1.592913,0.570397,0.540541
5,0.0688,1.946556,0.577617,0.537549


TrainOutput(global_step=780, training_loss=0.3831013055948111, metrics={'train_runtime': 10641.5452, 'train_samples_per_second': 1.17, 'train_steps_per_second': 0.073, 'total_flos': 543824207151168.0, 'train_loss': 0.3831013055948111, 'epoch': 5.0})

# <b>Save Model and Pipeline:</b>

In [None]:
# Write the fine-tuned model to the 'my_saved_rte_model' directory.
trainer.save_model('my_saved_rte_model')

from transformers import pipeline

# Build a text-classification pipeline from the directory saved above.
p = pipeline("text-classification", model='my_saved_rte_model')

# Classify one sentence pair, passed as a dict with 'text' and 'text_pair'.
p({'text': 'I went to the store', 'text_pair': 'I am a bird'})