# <b>Dataset</b>

In [1]:
!pip install transformers[torch]
!pip install accelerate -U

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [2]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [3]:
import numpy as np
from datasets import load_dataset

In [4]:
#Load the data to fine-tune on: either your own custom data,
#or (as done here) the GLUE benchmark
#configured for "sst2" (the sentiment analysis task).
#The result is a DatasetDict with train / validation / test splits.

raw_data = load_dataset("glue", "sst2")

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

# <b>Explore the data:</b>

In [5]:
#Display the DatasetDict: one Dataset per split, each with its features and row count.
raw_data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [6]:
#Select a single split by its name.
raw_data['train']

Dataset({
    features: ['sentence', 'label', 'idx'],
    num_rows: 67349
})

In [7]:
#Check which class a split object is an instance of.
type(raw_data['train'])

datasets.arrow_dataset.Dataset

In [8]:
#List every attribute and method of the split object (private and dunder names included).
dir(raw_data['train'])

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguous',
 '_select_with_indices_mappin

In [9]:
#Look at the table behind the split: the column types followed by a preview of the values.
raw_data['train'].data

MemoryMappedTable
sentence: string
label: int64
idx: int32
----
sentence: [["hide new secretions from the parental units ","contains no wit , only labored gags ","that loves its characters and communicates something rather beautiful about human nature ","remains utterly satisfied to remain the same throughout ","on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ",...,"you wish you were at home watching that movie instead of in the theater watching this one ","'s no point in extracting the bare bones of byatt 's plot for purposes of bland hollywood romance ","underdeveloped ","the jokes are flat ","a heartening tale of small victories "],["suspense , intriguing characters and bizarre bank robberies , ","a gritty police thriller with all the dysfunctional family dynamics one could wish for ","with a wonderful ensemble cast of characters that bring the routine day to day struggles of the working class to life ","nonetheless appreciates the art and reveals a music sc

In [10]:
#Integer indexing returns one example as a dict keyed by feature name.
raw_data['train'][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [11]:
#Slicing returns a dict of lists (one list per feature), here for rows 1 through 9.
raw_data['train'][1:10]

{'sentence': ['contains no wit , only labored gags ',
  'that loves its characters and communicates something rather beautiful about human nature ',
  'remains utterly satisfied to remain the same throughout ',
  'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up ',
  "that 's far too tragic to merit such superficial treatment ",
  'demonstrates that the director of such hollywood blockbusters as patriot games can still turn out a small , personal film with an emotional wallop . ',
  'of saucy ',
  "a depressed fifteen-year-old 's suicidal poetry ",
  "are more deeply thought through than in most ` right-thinking ' films "],
 'label': [0, 1, 0, 0, 0, 1, 1, 0, 1],
 'idx': [1, 2, 3, 4, 5, 6, 7, 8, 9]}

In [12]:
#Inspect the schema; 'label' is a ClassLabel whose names are ['negative', 'positive'].
raw_data['train'].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

# <b>Tokenizer:</b>

In [13]:
#Define the tokenizer.
#It is loaded from the same checkpoint name that is later used to load the model.

from transformers import AutoTokenizer

#checkpoint = "bert-base-uncased"
checkpoint = "distilbert-base-uncased" #trains faster than "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
#Example: tokenize the first training sentence on its own.
#The result holds the token ids ('input_ids') and an 'attention_mask' of the same length.

tokenized_sentences = tokenizer(raw_data['train'][0]['sentence'])

from pprint import pprint
pprint(tokenized_sentences)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102]}


In [15]:
#Tokenize every split of the dataset.
#Rather than tokenizing sentence by sentence, wrap the tokenizer in a function and let "map" apply it batch by batch (batched=True), which speeds up processing.

def tokenizer_func(batch):
  """Tokenize the 'sentence' column of a batch with truncation enabled."""
  sentences = batch['sentence']
  return tokenizer(sentences, truncation=True)

tokenized_dataset = raw_data.map(tokenizer_func, batched=True)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

# <b> Load the model and model arguments:</b>

In [16]:
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments

#Evaluate and save once per epoch; 'my_trainer' is the Trainer's output directory.
training_args = TrainingArguments(
  output_dir='my_trainer',
  num_train_epochs=1, #at the end, increase it and see what happens (apparently it overfits)
  evaluation_strategy='epoch',
  save_strategy='epoch',
)

#Load the checkpoint with a sequence-classification head that has two output labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# <b>Model Summary:</b>

In [17]:
#Check which concrete model class the Auto* factory returned.
type(model)

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification

In [18]:
#Display the module tree of the model (its layers and their sizes).
model #shows the same output as "model.named_parameters"

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [19]:
! pip install torchinfo

from torchinfo import summary

summary(model)

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


Layer (type:depth-idx)                                  Param #
DistilBertForSequenceClassification                     --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              23,440,896
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Linear: 1-2                                           590,592
├─Linear: 1-3                                           1,538
├─Dropout: 1-4                                          --
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0

# <b> Save the model parameters before fine-tuning:</b>

In [20]:
#Save all the model parameters in a list called "params_before" to compare against the updated parameters after fine-tuning the transformer.
#".numpy()" on a CPU tensor returns an array that shares memory with the tensor. Without ".copy()", a model that
#lives on the CPU would have this "before" snapshot silently overwritten as training updates the weights in place.
params_before = []
for name, p in model.named_parameters():
  params_before.append(p.detach().cpu().numpy().copy())

# <b>Training: 1. Metrics</b>

In [21]:
from datasets import load_metric

#GLUE metric configured for the SST-2 task (it reports accuracy).
#NOTE(review): newer "datasets" releases deprecate "load_metric" in favour of the separate "evaluate" package - confirm before upgrading.
metrics = load_metric("glue", "sst2")

def compute_metrics(logits_and_labels):
  """Score one evaluation pass.

  Takes the (logits, labels) pair, picks the class with the largest logit
  for every example, and returns the metric dict for those predictions.
  """
  scores, references = logits_and_labels
  predicted_classes = np.argmax(scores, axis=-1)
  return metrics.compute(predictions=predicted_classes, references=references)

  metrics = load_metric("glue", "sst2")


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

In [22]:
#Example:
#Computing the metric by hand: 2 of the 3 predictions match their reference, so the accuracy is 2/3.

metrics.compute(predictions=[1, 0, 1], references=[1, 0, 0])

{'accuracy': 0.6666666666666666}

# <b>Training: 2. Model</b>

In [23]:
from transformers import Trainer

#Wire the model, the training arguments, the tokenized splits and the metric function together.
#The tokenizer is handed over as well; its files end up next to the saved model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

#Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1877,0.354331,0.904817


TrainOutput(global_step=8419, training_loss=0.26396597687684664, metrics={'train_runtime': 417.7631, 'train_samples_per_second': 161.213, 'train_steps_per_second': 20.153, 'total_flos': 517212489917652.0, 'train_loss': 0.26396597687684664, 'epoch': 1.0})

# <b> Save the model: </b>

In [24]:
#Write the fine-tuned model to a local directory so that it can be reloaded by path later.
trainer.save_model("saved_sst2_fineTuned_model")

In [25]:
#List the working directory to confirm that the output folders were created.
!ls

my_trainer  sample_data  saved_sst2_fineTuned_model


In [26]:
ls 'saved_sst2_fineTuned_model'

config.json        special_tokens_map.json  tokenizer.json     vocab.txt
model.safetensors  tokenizer_config.json    training_args.bin


# <b> Test the model on an example text</b>



In [27]:
from transformers import pipeline

#Reload the fine-tuned model from the saved directory and wrap it in an inference pipeline.
newmodel = pipeline("text-classification", model="saved_sst2_fineTuned_model") #, device=0

#Classify one clearly positive and one clearly negative sentence.
for sample_text in ('This is great!', 'This movie sucks'):
  print(newmodel(sample_text))

[{'label': 'LABEL_1', 'score': 0.9997462630271912}]
[{'label': 'LABEL_0', 'score': 0.9983289837837219}]


# <b>Now, let's replace 'LABEL_0' and 'LABEL_1' in the result above with the readable names 'negative' (0) and 'positive' (1)</b>





In [28]:
!cat saved_sst2_fineTuned_model/config.json #print the saved config file to the cell output

#Result: the config has no "id2label" entry, so the label names are missing; the next cell adds them.

{
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "vocab_size": 30522
}


In [29]:
import json

config_path = 'saved_sst2_fineTuned_model/config.json'
with open(config_path, encoding='utf-8') as f: #open the json file
  j = json.load(f)

#Add an 'id2label' entry that maps each class id to a readable name.
#JSON object keys are always strings, so both ids are spelled as strings instead of mixing int and str keys.
j['id2label'] = {'0': 'negative', '1': 'positive'}

with open(config_path, 'w', encoding='utf-8') as f: #update (aka write) the json file
  json.dump(j, f, indent=2)

In [30]:
#Print the config again to confirm that the "id2label" entry was written.
!cat saved_sst2_fineTuned_model/config.json

{
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "vocab_size": 30522,
  "id2label": {
    "0": "negative",
    "1": "positive"
  }
}

# <b> Test the model on an example text again</b>

In [31]:
#Rebuild the pipeline from the saved directory so that it reads the edited config.json.
newmodel = pipeline("text-classification", model="saved_sst2_fineTuned_model") #, device=0

#Same two sentences as before; the labels should now be readable names.
for sample_text in ('This is great!', 'This movie sucks'):
  print(newmodel(sample_text))

[{'label': 'positive', 'score': 0.9997462630271912}]
[{'label': 'negative', 'score': 0.9983289837837219}]


# <b>Save the model parameters after fine-tuning; and then compare with before:</b>

In [32]:
#Collect every model parameter again, now that fine-tuning has run, as numpy arrays.
params_after = [
  weights.detach().cpu().numpy()
  for _, weights in model.named_parameters()
]

In [33]:
#zip pairs each "before" array with its "after" counterpart.
#Print the summed absolute difference per parameter array; a non-zero value means fine-tuning changed it.
for before, after in zip(params_before, params_after):
  total_change = np.abs(before - after).sum()
  print(total_change)

13503.262
85.16359
1.7868844
1.1405628
1311.9707
1.7368608
1299.0294
0.0033914645
1197.6191
1.0854478
1125.0051
0.8475778
1.6540253
0.84432065
4921.596
5.7749586
4521.381
0.71666455
1.6108947
0.7244039
1284.3597
1.4973706
1288.0431
0.003396339
1114.5853
0.8799112
1060.7526
0.73865235
1.5586846
0.72353995
4872.1167
5.4087296
4420.1177
0.654356
1.4916724
0.76657295
1285.8015
1.5589876
1294.1683
0.0025788331
1119.6481
0.7961755
1092.9095
0.7496387
1.5598629
0.8123528
4936.463
5.6844873
4398.5127
0.71108365
1.4026194
0.6981336
1289.182
1.4603357
1303.9655
0.002862931
1145.7084
0.7271247
1103.9149
0.7586862
1.3631746
0.73996323
4826.6733
5.4694834
4178.634
0.7478663
1.3599589
0.7582741
1206.4437
1.4697571
1203.7186
0.0021012037
1012.8521
0.82621676
1018.6957
0.9332739
1.4186852
0.94495595
4530.53
5.4601974
3650.6047
0.83253145
1.3349187
0.8724362
1139.6276
1.3439008
1156.233
0.0013632653
944.66
0.9359058
947.569
1.0522516
1.3288126
1.1577582
3801.2341
4.895415
3347.195
0.9374357
1.399147
0.