In [9]:
from datasets import load_dataset
from collections import Counter
import pandas as pd
import torch
from transformers.tokenization_utils_base import BatchEncoding
from typing import List, Tuple
from tqdm import tqdm
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification,AdamW, get_linear_schedule_with_warmup

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

In [3]:
labels = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech",
}
 
K = len(labels)

dataset = load_dataset("ag_news")
dataset

Using custom data configuration default
Reusing dataset ag_news (C:\Users\Siri\.cache\huggingface\datasets\ag_news\default\0.0.0\bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [21]:
batch_size = 8
epochs = 4
lr_init = 5e-5
max_len = 256
warmup_steps = 3

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

In [22]:
def batch_data(data: pd.DataFrame, bsize: int) -> List[Tuple[BatchEncoding, List[int], List[str]]]:
    lst = []
    l = len(data)//bsize
    for i in range(l):
        batch_text = data['text'][bsize*i:bsize*(i+1)].tolist()
        X = tokenizer.batch_encode_plus(batch_text, truncation =True, padding= 'max_length',max_length = max_len, add_special_tokens=True, return_tensors='pt')
        Y = torch.LongTensor(data['label'][bsize*i:bsize*(i+1)].tolist())
        s = batch_text
        lst.append((X,Y,s))
    return lst

In [23]:
# subset
df_train = dataset['train'].to_pandas().sample(frac=1).reset_index(drop=True)[:40]
df_test = dataset['test'].to_pandas().sample(frac=1).reset_index(drop=True)[:40]

In [24]:
train_batches = batch_data(df_train, bsize=batch_size)
test_batches = batch_data(df_test, bsize=batch_size)

In [25]:
# Transformer model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=K, output_hidden_states=True)

# The torch `device` on which to execute the model computation
if torch.cuda.is_available():
    device = torch.device('cuda:0') # GPU
else:
    device = torch.device('cpu') # CPU
model.to(device)

# The gradient descent optimizer used for fine tuning
optimizer = AdamW(model.parameters(), lr=lr_init)

# The gradient descent learning rate
lr = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, 
    num_training_steps=len(train_batches))

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.wei

In [26]:
from datasets import load_metric
from tqdm.auto import tqdm


def runner(batches, train=True):
    if train == True:
        # train loop
        model.train()
        
        for epoch in range(epochs):

            progress_bar = tqdm(range(len(batches))) # add tqdm bar
            a = load_metric("accuracy") # load accuracy metric
            prediction = []
            acc = []
            loss_lst = []
            

            for item in batches:
                # get each batch from batches
                batch = {'input_ids':torch.as_tensor(item[0]['input_ids'], device=device),
                         'attention_mask':torch.as_tensor(item[0]['attention_mask'], device=device),
                         'labels': torch.as_tensor(item[1], device=device)}
                
                with torch.enable_grad():
                    
                    # if train is true, so this gradient descent process
                    outputs = model(**batch)
                    loss = outputs.loss
                    loss.backward()
                    optimizer.step()
                    lr.step()
                    optimizer.zero_grad()
                    
                    progress_bar.update(1) # update progree bar
                
                # add the model outputs to corresponding results list
                predict_label = torch.argmax(outputs.logits, dim=1) # get the label with the maximum probability
                prediction.append(predict_label)
                loss_lst.append(loss)
                acc.append(a.compute(predictions = predict_label, references = item[1])['accuracy'])
           
            # compute mean metrics computed over data in batches
            mean_loss = float(sum(loss_lst)/len(loss_lst))
            mean_accuracy = sum(acc)/len(acc)

        
        
    else:
        
        # testing loop
        model.eval()
        
        for epoch in range(epochs):

            progress_bar = tqdm(range(len(batches))) # add tqdm bar
            a = load_metric("accuracy") # load accuracy metric
            prediction = []
            acc = []
            loss_lst = []
            

            for item in batches:
                # get each batch from batches
                batch = {'input_ids':torch.as_tensor(item[0]['input_ids'], device=device),
                         'attention_mask':torch.as_tensor(item[0]['attention_mask'], device=device),
                         'labels': torch.as_tensor(item[1], device=device)}
                
                # if train is not true, just get the results from the current model
                with torch.no_grad():
                    outputs = model(**batch)
                    
                    # update progree bar
                    progress_bar.update(1)
                
                # add the model outputs to corresponding results list
                predict_label = torch.argmax(outputs.logits, dim=1) # get the label with the maximum probability
                prediction.append(predict_label)
                loss_lst.append(outputs.loss)
                acc.append(a.compute(predictions = predict_label, references = item[1])['accuracy'])
            
            # compute mean metrics computed over data in batches
            mean_loss = float(sum(loss_lst)/len(loss_lst))
            mean_accuracy = sum(acc)/len(acc)
            
        
    return (prediction,mean_loss, mean_accuracy)

In [27]:
train_results = runner(train_batches, train = True)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [29]:
print('Traning Set:')

print('loss:',train_results[1])
print('accuracy:',train_results[2])

Traning Set:
loss: 1.2863134145736694
accuracy: 0.35


In [30]:
test_results = runner(test_batches, train = False)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [31]:
print('Testing Set:')

print('loss:',test_results[1])
print('accuracy:',test_results[2])

Testing Set:
loss: 1.3811625242233276
accuracy: 0.2
