In [None]:
#this notebook finetunes dsitilbert-base-uncased classification models on ANLI dataset
#the primary usage was to use this finetuned model for building a new zero shot classification model

In [None]:
!pip install transformers datasets

In [2]:
from datasets import load_dataset
dataset = load_dataset("anli")

Downloading builder script:   0%|          | 0.00/2.00k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading and preparing dataset anli/plain_text (download: 17.76 MiB, generated: 73.55 MiB, post-processed: Unknown size, total: 91.31 MiB) to /root/.cache/huggingface/datasets/anli/plain_text/0.1.0/aabce88453b06dff21c201855ea83283bab0390bff746deadb30b65695755c0b...


Downloading data:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

Generating train_r1 split:   0%|          | 0/16946 [00:00<?, ? examples/s]

Generating dev_r1 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test_r1 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_r2 split:   0%|          | 0/45460 [00:00<?, ? examples/s]

Generating dev_r2 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test_r2 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_r3 split:   0%|          | 0/100459 [00:00<?, ? examples/s]

Generating dev_r3 split:   0%|          | 0/1200 [00:00<?, ? examples/s]

Generating test_r3 split:   0%|          | 0/1200 [00:00<?, ? examples/s]

Dataset anli downloaded and prepared to /root/.cache/huggingface/datasets/anli/plain_text/0.1.0/aabce88453b06dff21c201855ea83283bab0390bff746deadb30b65695755c0b. Subsequent calls will reuse this data.


  0%|          | 0/9 [00:00<?, ?it/s]

In [3]:
import pandas as pd
import time

In [4]:
#train and validation data
df_train1=dataset['train_r1'].to_pandas()
df_val1=dataset['test_r1'].to_pandas()
df_train1.head(2)

Unnamed: 0,uid,premise,hypothesis,label,reason
0,0fd0abfb-659e-4453-b196-c3a64d2d8267,"The Parma trolleybus system (Italian: ""Rete fi...",The trolleybus system has over 2 urban routes,0,
1,7ed72ff4-40b7-4f8a-b1b9-6c612aa62c84,Alexandra Lendon Bastedo (9 March 1946 – 12 Ja...,Sharron Macready was a popular character throu...,1,


In [5]:
df_val1.head(2)

Unnamed: 0,uid,premise,hypothesis,label,reason
0,4aae63a8-fcf7-406c-a2f3-50c31c5934a9,Ernest Jones is a British jeweller and watchma...,The first Ernest Jones store was opened on the...,0,"The first store was opened in London, which is..."
1,c577b92c-78fb-4e1d-ae1d-34133609c142,Old Trafford is a football stadium in Old Traf...,There are only 10 larger football stadiums in ...,0,The text says that it is the 11th largest foot...


In [6]:
df_train1['len_premise']=df_train1['premise'].apply(len)
df_train1['len_hypo']=df_train1['hypothesis'].apply(len)

df_train1['total_len_pre_hypo']=df_train1['len_premise']+df_train1['len_hypo']

In [7]:
pd.DataFrame(df_train1['len_premise'].describe()).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
len_premise,16946.0,326.411247,44.052357,250.0,288.0,327.0,365.0,400.0


In [8]:
pd.DataFrame(df_train1['len_hypo'].describe()).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
len_hypo,16946.0,63.594595,32.682601,15.0,43.0,56.0,74.0,357.0


In [9]:
pd.DataFrame(df_train1['total_len_pre_hypo'].describe()).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_len_pre_hypo,16946.0,390.005842,54.598097,271.0,347.0,390.0,428.0,730.0


In [10]:
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pickle
import os

from transformers import BertForSequenceClassification,BertTokenizer, AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [11]:
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

In [12]:
torch.cuda.get_device_name(0)

'Tesla K80'

In [13]:
device

device(type='cuda')

In [14]:
class AnliDataPrep():

  def __init__(self,
               train_df,
               val_df,
               model='distilbert-base-uncased'):
    
    self.train_df=train_df #user must pass train data
    self.val_df=val_df #user must pass val data
    self.tokenizer=AutoTokenizer.from_pretrained(model) #corresponding tokenizer
    self.train_data_final=self.prep_dataset(self.train_df)
    self.val_data_final=self.prep_dataset(self.val_df)

  def prep_dataset(self,df):#pass train/val data to get them into required format
    #creating dataset to feed the model for both train and validation data
    MAX_LEN=512 #max len for the model.. we will truncate if text is loner than this]
    input_ids_all=[] # append token ids list from tokeinzed  data
    attention_mask_ids_all=[] #ids to separte real tokens from padded ones
    segment_ids_all=[] #create segemnt ids to separate premise and hypothesis

    premise_list=df['premise'].tolist()
    hypothesis_list=df['hypothesis'].tolist()
    labels_list=df['label'].tolist()

    for (premise,hypothesis) in zip(premise_list,hypothesis_list):
      premise_token_ids=self.tokenizer.encode(premise,add_special_tokens = False)
      hypothesis_token_ids=self.tokenizer.encode(hypothesis,add_special_tokens = False)
      #input id structure ->  '[CLS]'+ Premise_ids + '[SEP]' + Hypothesis_ids + '[SEP]
      input_ids=[self.tokenizer.cls_token_id] + premise_token_ids + [self.tokenizer.sep_token_id] + hypothesis_token_ids + [self.tokenizer.sep_token_id]

      #creating segemnt ids and attention mask ids for the model
      premise_len=len(premise_token_ids)
      hypothesis_len=len(hypothesis_token_ids)

      segment_ids=torch.tensor([0]*(premise_len+2)+[1]*(hypothesis_len+1))
      attention_mask_ids=torch.tensor([1]*(premise_len+hypothesis_len+3))

      input_ids_all.append(torch.tensor(input_ids))
      segment_ids_all.append(segment_ids)
      attention_mask_ids_all.append(attention_mask_ids)

    input_ids_all=pad_sequence(input_ids_all,batch_first=True)
    segment_ids_all=pad_sequence(segment_ids_all,batch_first=True)
    attention_mask_ids_all=pad_sequence(attention_mask_ids_all,batch_first=True)
    labels=torch.tensor(labels_list)

    dataset=TensorDataset(input_ids_all,attention_mask_ids_all,segment_ids_all,labels)
    return dataset

  def train_data_loader(self):
    train_data_loader=DataLoader(dataset=self.train_data_final,batch_size=32,shuffle=True)
    return train_data_loader

  def val_data_loader(self):
    val_data_loader=DataLoader(dataset=self.val_data_final,batch_size=32,shuffle=True)
    return val_data_loader
    

In [15]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels=3)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [16]:
#unfreeze nth layer and beyond
for name, param in list(model.named_parameters())[:-20]: 
    param.requires_grad = False

# check which layers are frozen
for name, param in model.named_parameters():
     print(name, param.requires_grad)

distilbert.embeddings.word_embeddings.weight False
distilbert.embeddings.position_embeddings.weight False
distilbert.embeddings.LayerNorm.weight False
distilbert.embeddings.LayerNorm.bias False
distilbert.transformer.layer.0.attention.q_lin.weight False
distilbert.transformer.layer.0.attention.q_lin.bias False
distilbert.transformer.layer.0.attention.k_lin.weight False
distilbert.transformer.layer.0.attention.k_lin.bias False
distilbert.transformer.layer.0.attention.v_lin.weight False
distilbert.transformer.layer.0.attention.v_lin.bias False
distilbert.transformer.layer.0.attention.out_lin.weight False
distilbert.transformer.layer.0.attention.out_lin.bias False
distilbert.transformer.layer.0.sa_layer_norm.weight False
distilbert.transformer.layer.0.sa_layer_norm.bias False
distilbert.transformer.layer.0.ffn.lin1.weight False
distilbert.transformer.layer.0.ffn.lin1.bias False
distilbert.transformer.layer.0.ffn.lin2.weight False
distilbert.transformer.layer.0.ffn.lin2.bias False
distilbe

In [17]:
model=model.to(device)

In [18]:
def multi_acc(y_pred, y_test):
  acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
  return acc

EPOCHS = 10

def train(model, train_loader, val_loader, optimizer):  
  total_step = len(train_loader)

  for epoch in range(EPOCHS):
    start = time.time()
    model.train()
    total_train_loss = 0
    total_train_acc  = 0
    for batch_idx, (pair_token_ids, mask_ids, seg_ids, labels) in enumerate(train_loader):
      optimizer.zero_grad()

      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = labels.to(device)

      loss, prediction = model(pair_token_ids, 
                            #  token_type_ids=seg_ids,
                             attention_mask=mask_ids, 
                             labels=labels).values()

      acc = multi_acc(prediction, labels)

      loss.backward()
      optimizer.step()
      
      total_train_loss += loss.item()
      total_train_acc  += acc.item()

    train_acc  = total_train_acc/len(train_loader)
    train_loss = total_train_loss/len(train_loader)
    model.eval()
    total_val_acc  = 0
    total_val_loss = 0
    with torch.no_grad():
      for batch_idx, (pair_token_ids, mask_ids, seg_ids, labels) in enumerate(val_loader):
        optimizer.zero_grad()

        pair_token_ids = pair_token_ids.to(device)
        mask_ids = mask_ids.to(device)
        seg_ids = seg_ids.to(device)
        labels = labels.to(device)
        
        loss, prediction = model(pair_token_ids, 
                            #  token_type_ids=seg_ids, #token type ids not required in distilbert model
                             attention_mask=mask_ids, 
                             labels=labels).values()
        
        acc = multi_acc(prediction, labels)

        total_val_loss += loss.item()
        total_val_acc  += acc.item()

    val_acc  = total_val_acc/len(val_loader)
    val_loss = total_val_loss/len(val_loader)
    end = time.time()
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)

    print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

In [None]:
anli_dataset=AnliDataPrep(train_df=df_train1,
               val_df=df_val1.sample(100))


train_loader, val_loader = anli_dataset.train_data_loader(),anli_dataset.val_data_loader()

optimizer = AdamW(model.parameters(), lr=2e-5)

train(model, train_loader, val_loader, optimizer)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

