In [1]:
!pip install transformers



In [2]:
from transformers import BertTokenizer
# Load the pretrained 'bert-base-cased' tokenizer used by the demo cells below.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-cased',
)

In [3]:
# Encode one short sentence as PyTorch tensors, padded/truncated to exactly 10 tokens.
sample = 'I will go training tonight'
bert_input = tokenizer(
    sample,
    max_length=10,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

In [4]:
# Token ids: per the recorded outputs, 101 / 102 decode to [CLS] / [SEP] and 0 is [PAD].
print(bert_input['input_ids'])

tensor([[ 101,  146, 1209, 1301, 2013, 3568,  102,    0,    0,    0]])


In [6]:
# Segment ids of the encoding; all zeros for this single-sentence input.
print(bert_input['token_type_ids'])

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [7]:
# Attention mask: 1 at the seven non-padding positions, 0 at the three [PAD] positions.
print(bert_input['attention_mask'])

tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])


In [9]:
# Turn the first (and only) row of token ids back into text.
sample_decode = tokenizer.decode(bert_input['input_ids'][0])

In [10]:
# Show the decoded text, special tokens ([CLS], [SEP], [PAD]) included.
print(sample_decode)

[CLS] I will go training tonight [SEP] [PAD] [PAD] [PAD]


In [12]:
# The mask holds 0/1 flags, not vocabulary ids: decoding it only shows which
# tokens happen to have ids 1 and 0 ([unused1] and [PAD] in the output below).
print(tokenizer.decode(bert_input['attention_mask'][0]))

[unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [PAD] [PAD] [PAD]


In [13]:
# Segment ids are all 0 here, so decoding them as token ids prints only [PAD].
print(tokenizer.decode(bert_input['token_type_ids'][0]))

[PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [14]:
### Dataset Class

import torch
import numpy as np


from transformers import BertTokenizer, BertModel



In [15]:
# The tokenizer checkpoint must match the checkpoint the classifier loads
# ('bert-base-cased'); the uncased vocabulary assigns different token ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Category name -> integer class index used as the training target.
# NOTE(review): 'experiemnt' looks like a typo for 'experiment'. It is kept
# unchanged because the keys must equal the label strings found in the data;
# confirm against the dataset before renaming.
labels = {
    'background': 0,
    'objective': 1,
    'method': 2,
    'experiemnt': 3,
    'conclusion': 4,
}

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [18]:


class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized texts with integer class labels.

    Uses the module-level ``tokenizer`` to encode each text and the
    module-level ``labels`` mapping to turn label names into class indices.

    Args:
        df: Object indexable by column name (e.g. a pandas DataFrame) whose
            columns can be iterated row by row.
        text_column: Name of the column holding the raw text.
        label_column: Name of the column holding the label names, which must
            be keys of ``labels``.
        max_length: Length every text is padded or truncated to.
    """

    def __init__(self, df, text_column='text', label_column='category',
                 max_length=512):
        self.labels = [labels[label] for label in df[label_column]]
        # Each entry is one encoding whose tensors have a leading dimension
        # of 1 (a single text per tokenizer call with return_tensors='pt').
        self.texts = [
            tokenizer(text,
                      padding='max_length',
                      max_length=max_length,
                      truncation=True,
                      return_tensors='pt')
            for text in df[text_column]
        ]

    def classes(self):
        """Return the list of integer class indices, one per example."""
        return self.labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label at ``idx`` as a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenized text at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label)`` pair at ``idx``."""
        batch_text = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_text, batch_y
    
    
    
    

In [None]:
# Dataset preparation: shuffle once, then cut into train / validation / test.

np.random.seed(42)

# Split points at 80% and 90% of the rows -> [0-0.8, 0.8-0.9, 0.9-1].
df_train, df_val, df_test = np.split(
    df.sample(frac=1, random_state=122),
    [int(.8 * len(df)), int(.9 * len(df))],
)


# NOTE(review): this second, stratified split needs `train_test_split` in scope
# (no import for it is visible in this notebook) and reads the 'text_orig' /
# 'sentiment' columns rather than the text / label columns consumed by Dataset;
# confirm that it belongs to this dataset.
X_train, X_test, Y_train, Y_test = train_test_split(df['text_orig'],
            df['sentiment'],
            test_size=0.2,
            random_state=42,
            stratify=df['sentiment'])

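Note on `np.split`:

If `indices_or_sections` is a 1-D array of sorted integers, the entries indicate where along `axis` the array is split. For example, `[2, 3]` would, for `axis=0`, result in
`ary[:2]`
`ary[2:3]`
`ary[3:]`
Applied to the split above, cut points at 80% and 90% of the rows give the train, validation and test parts, in that order.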

In [20]:
# Model Building 

from torch import nn
from transformers import BertModel, BertTokenizer



In [22]:
# Model Architecture

class BertClassifier(nn.Module):
    """BERT encoder followed by dropout, a linear layer and ReLU.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
        num_classes: Number of output scores produced per example.
    """

    def __init__(self, dropout=0.5, num_classes=5):
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')

        self.dropout = nn.Dropout(dropout)
        # Width of the pooled output is read from the loaded checkpoint's
        # config instead of hard-coding 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return one row of class scores per input sequence.

        Args:
            input_id: Token ids passed to BERT as ``input_ids``.
            mask: Attention mask passed to BERT as ``attention_mask``.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output) is used here.
        _, pooled_output = self.bert(input_ids=input_id,
                                     attention_mask=mask,
                                     return_dict=False)

        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)

        # NOTE(review): ReLU clamps negative scores to zero before they reach
        # CrossEntropyLoss in the training loop; consider returning
        # `linear_output` directly.
        final_layer = self.relu(linear_output)

        return final_layer


# Backward-compatible alias for the original (misspelled) class name.
BertClassifer = BertClassifier
    
    
    

In [None]:
# Training Loop

from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune ``model`` and print train/validation metrics after each epoch.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns one row
            of class scores per example.
        train_data: Data passed to ``Dataset`` to build the training set.
        val_data: Data passed to ``Dataset`` to build the validation set.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of passes over the training set.
    """
    # Data loaders
    train_dataset, val_dataset = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=2)

    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):

        # ---- training pass ----
        model.train()  # enable dropout
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            # Every example was tokenized on its own with return_tensors='pt',
            # so batched tensors carry an extra dimension of size 1 at dim 1.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label.long())
            total_loss_train += batch_loss.item()

            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

            model.zero_grad()      # clear gradients left from the previous step
            batch_loss.backward()  # backward pass
            optimizer.step()       # parameter update

        # ---- validation pass: once per epoch, after all training batches ----
        model.eval()  # disable dropout
        total_loss_val, total_acc_val = 0, 0

        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label.long())
                total_loss_val += batch_loss.item()

                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        # Losses are sums of per-batch values divided by the number of
        # examples, matching the denominators of the original report.
        print(
            f'Epochs: {epoch_num + 1} '
            f'| Train_loss: {total_loss_train / len(train_data):.3f} '
            f'| Train Acc: {total_acc_train / len(train_data):.3f} '
            f'| Val_Loss : {total_loss_val / len(val_data):.3f} '
            f'| Val Acc: {total_acc_val / len(val_data):.3f}'
        )


# Run configuration and training (module level, not inside train()).
EPOCHS = 10
LR = 1e-6
# Class name spelled exactly as in the model definition cell.
model = BertClassifer()

train(model, df_train, df_val, LR, EPOCHS)
                    
                
            
            
            
   

In [None]:
# Inference

def evaluate(model, test_data):
    """Print the accuracy of ``model`` on ``test_data``.

    Args:
        model: Module called as ``model(input_id, mask)`` that returns one row
            of class scores per example.
        test_data: Data passed to ``Dataset`` to build the test set.
    """
    test_dataset = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=2)

    # Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.eval()  # disable dropout so predictions are deterministic

    total_test_acc = 0

    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            # Every example was tokenized on its own with return_tensors='pt',
            # so batched tensors carry an extra dimension of size 1 at dim 1.
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            total_test_acc += (output.argmax(dim=1) == test_label).sum().item()

    print(f'test acc: {total_test_acc/len(test_data):.3f}')
        
        
# Report accuracy of the trained model on the held-out test split.
evaluate(model=model, test_data=df_test)
                    
            
            
    
        
    
        
        
    
    
    
    
    