In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from collections import defaultdict
from textwrap import wrap


In [4]:
## Configs
RANDOM_STATE = 78
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 15
ACCUMULATION = 2
BERT_MODEL='bert-base-uncased'
MODEL_PATH = "model.bin"
TRAINING_CSV  = "IMDB Dataset.csv"
TOKENIZER = transformers.BertTokenizer.from_pretrained(BERT_MODEL,do_lower_case=True)

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=28, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Downloading', max=466062, style=ProgressStyle(description_wid…




In [5]:
## Model

class BERT(nn.Module):
    
    def __init__(self):
        super(BERT,self).__init__()
        self.bert = transformers.BertModel.from_pretrained(BERT_MODEL)
        self.bert_dropout = nn.Dropout(0.3)
        self.out = nn.Linear(768,1)
        
        
    def forward(self,ids,mask,token_type_ids):
        #out1 = seq of hidden states for each token
        #out2 = Pooler output from BERT pooler (vector of size 768 for each batch)
        out1,out2 = self.bert(ids,attention_mask=mask,token_type_ids=token_type_ids)
        bert_out = self.bert_dropout(out2)
        output = self.out(bert_out)
        return output


In [6]:
## DataLoader

class BERTDataset:
    def __init__(self,review,target):
        self.review = review
        self.target = target
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN
        
    def __len__(self):
        return len(self.review)
    
    def __getitem__(self,item):
        review = str(self.review[item])
        review = " ".join(review.split())
        #Pepare the inputs for a model.
        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
        )
        
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]
        
        ''' padding_length = self.max_len - len(ids)
        ids = ids + ([0]*padding_length)
        token_type_ids = token_type_ids + ([0]*padding_length)
        mask = mask + ([0]*padding_length)'''
        
        
        return {
            'ids': torch.tensor(ids,dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids,dtype=torch.long),
            'mask': torch.tensor(mask,dtype=torch.long),
            'targets': torch.tensor(self.target[item],dtype=torch.float)
        }
        

In [11]:
## Training and evaluation
def loss_fn(outputs, targets):
    loss = nn.BCEWithLogitsLoss()(outputs,targets.view(-1,1))
    return loss

def train_fn(data_loader,model,optimizer,device,scheduler):
    accumulation_steps = ACCUMULATION
    model.train()    
    for idx,data in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = data["ids"]
        token_type_ids = data["token_type_ids"]
        mask = data["mask"]
        targets = data["targets"]
        
        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)
        
        optimizer.zero_grad()
        outputs = model(
            ids = ids,
            token_type_ids = token_type_ids,
            mask = mask
        )
        
        loss = loss_fn(outputs,targets)
        loss.backward()
        
        if (idx+1) % accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            
def eval_fn(data_loader,model,device):
    model.eval()
    final_targets = []
    final_op = []
    
    with torch.no_grad():
        for idx,data in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = data["ids"]
            token_type_ids = data["token_type_ids"]
            mask = data["mask"]
            targets = data["targets"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            token_type_ids = token_type_ids.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(
                ids = ids,
                token_type_ids = token_type_ids,
                mask = mask
            )
            
            final_targets.extend(targets.cpu().detach().numpy().tolist())
            final_op.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            
    return final_op,final_targets

In [9]:
## Run the model

def run():
    df = pd.read_csv(TRAINING_CSV).fillna("none")
    df.sentiment = df.sentiment.apply(
                    lambda x:1 if x =="positive" else 0
                   )
    df_train, df_valid = train_test_split(
    df,
    test_size=0.1,
    random_state = RANDOM_STATE,
    stratify=df.sentiment.values   #train and valid sets have the same ratio of pos/neg samples
    )
    
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)
    
    train_dataset = BERTDataset(
        review = df_train.review.values,
        target = df_train.sentiment.values
    )
    
    train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    num_workers=4
    )
    
    valid_dataset = BERTDataset(
        review = df_valid.review.values,
        target = df_valid.sentiment.values
    )
    
    valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    num_workers=1
    )
    
    device = torch.device("cpu")
    model = BERT()
    model.to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params' : [p for n,p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay':0.001},
        {'params' : [p for n,p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay':0.0}
    ]
    
    num_train_steps = int(len(df_train)/TRAIN_BATCH_SIZE * EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr = 2e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=num_train_steps)
    
    best_accuracy = 0.0
    
    for epoch in range(EPOCHS):
        train_fn(train_data_loader,model,optimizer,device,scheduler)
        outputs,targets = eval_fn(valid_data_loader,model,device)
        outputs = np.array(outputs) >= 0.5
        acc = accuracy_score(targets,outputs)
        print(f"Accuracy Score : {acc}")
        
        if acc>best_accuracy:
            torch.save(model.state_dict(),MODEL_PATH)
            best_accuracy = acc        
            

In [12]:
run()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BrokenPipeError: [Errno 32] Broken pipe