In [1]:
import torch
from datasets import load_dataset
from transformers import BertForSequenceClassification,AutoTokenizer
from torch.utils.data import DataLoader,Dataset

In [35]:
#Initializig device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
data=load_dataset("imdb")[0]
print(data)

### Custom Dataset Class

We define a custom PyTorch Dataset to wrap our tokenized IMDB data.  
This makes it compatible with DataLoader for batching and shuffling during training.

Each item returned includes:
- input_ids
- attention_mask
- labels


In [53]:
#Custom PyTorch Dataset to work with tokenized data
class CustomDataset(Dataset):
    def __init__(self,tokens):
        self.input_id=tokens["input_ids"]
        self.attention_mask=tokens["attention_mask"]
        self.labels=tokens["labels"]
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self,index):
        return{"input_ids":self.input_id[index],
              "attention_mask":self.attention_mask[index],
              "labels":self.labels[index]}

In [54]:

#A function to tokenize prepare dataLoader
def load_and_tokenize_data(tokenizer, data,mode ="train", max_len=512, sample_size=15,shuffle = True):
    samples = data[mode][0:sample_size]["text"] #slicing few samples only bcos of no gpu
    tokens=tokenizer(samples,max_length=max_len,truncation=True,padding="max_length",return_tensors="pt")
     # Adding labels
    tokens['labels'] = data[mode][0:sample_size]["label"]
    # Wrap tokens with customDataset and return dataLoader
    dataloader_samples = DataLoader(CustomDataset(tokens),shuffle =shuffle)
    return tokens, dataloader_samples

In [42]:
# function to train the model
def train_model_func(model, train_loader, optimizer, device="cpu", epochs=3):
    model.to(device)
    model.train()
    epoch = epochs
    for e in range(epoch):
        total_loss = 0
        for batch in train_loader :
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            output = model(input_ids,attention_mask,labels =labels)
            loss = output.loss 
            total_loss = total_loss + loss.item()
            loss.backward() #here gradients will be calc using backpropagation
            optimizer.step() #Update weights
            optimizer.zero_grad() #Reset gradients
        print(f"epoch {e +1}-------{total_loss}")   

In [58]:
# function to evaluate the model

def eval_func(model,test_loader):
    model.eval()
    actual_label = 0
    pred_labels = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids=batch["input_ids"].to(device)
            attention_mask=batch["attention_mask"].to(device)
            labels=batch["labels"].to(device)
            output = model(input_ids,attention_mask,labels =labels)
            logit = output.logits 
            pred = torch.argmax(logit,dim = 1) 
            actual_label = actual_label + (pred == labels).sum().item()
            pred_labels = pred_labels + labels.size(0)
        accuracy = actual_label / pred_labels
        print("accuracy_score",accuracy)

In [44]:
# function to make prediction on new text
def prediction_func(model,text,tokenizer,max_len=512):
    sentiments={0:"Negative",1:"Positive"}
    tokens=tokenizer(text,max_length=max_len,truncation=True,padding="max_length",return_tensors="pt")
    
    model.to(device)
    
    model.eval()
    attention_mask=tokens["attention_mask"].to(device)
    input_ids=tokens["input_ids"].to(device)
    
    with torch.no_grad():
        output=model(attention_mask=attention_mask,input_ids=input_ids)
        logits=output.logits
        pred=torch.argmax(logits,dim=1).item()
        
        return sentiments[pred]

In [45]:
# Initializing BertFor Classification model here
model=BertForSequenceClassification.from_pretrained("bert-base-uncased",num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [46]:
#Load Tokenizer
tokenizer=AutoTokenizer.from_pretrained("bert-base-uncased",num_labels=2)

In [47]:
#Setting a optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.00002)

In [55]:
# Now we are using our previous func to test a new review

tokens,train_loader = load_and_tokenize_data(tokenizer, data, mode="train", sample_size=50)
tokens,test_loader = load_and_tokenize_data(tokenizer, data, mode="test", sample_size=30, shuffle=False)


# train the model
train_model_func(model, train_loader, optimizer, device=device, epochs=2)


epoch 1-------5.485954730771482
epoch 2-------0.3543743977788836


In [59]:
# evaluate the model
eval_func(model, test_loader)

accuracy_score 1.0


In [57]:
# test prediction
sample_text = "this movie was absolutely fantastic, i loved every part of it"
print("prediction:", prediction_func(model, sample_text, tokenizer))

prediction: Negative
