<a href="https://colab.research.google.com/github/ShashankHebbar006/Natural-Language-Processing/blob/main/TextClassificationUsingBERTTrained.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [39]:
path ="/content/US-Economic-News.csv"

def load_data(path):
    csv_data = pd.read_csv(path,encoding='ISO-8859-1')
    csv_data = csv_data[csv_data['relevance'] != 'not sure']
    csv_data['relevance'] = csv_data['relevance'].map({'yes':1,'no':0})

    text = csv_data['text'].to_list()
    labels = csv_data['relevance'].to_list()
    return text,labels

text, labels = load_data(path)

In [40]:
class TextClassificationDataset(Dataset):
    def __init__(self,text,labels,tokenizer,max_length):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.text)

    def __getitem__(self,idx):
        text = self.text[idx]
        label = self.labels[idx]
        encoding = tokenizer(text,return_tensors='pt',max_length=self.max_length,padding='max_length',truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(),
                'attention_mask':encoding['attention_mask'].flatten(),
                'label':torch.tensor(label)}

In [41]:
class BERTClassifier(nn.Module):
    def __init__(self,checkpoint,num_classes):
        super(BERTClassifier,self).__init__()
        self.bert = AutoModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.bert.config.hidden_size,num_classes)

    def forward(self,input_ids,attention_mask):
        output = self.bert(input_ids,attention_mask)
        pooled_output = output.pooler_output
        X = self.dropout(pooled_output)
        logits = self.linear(X)
        return logits

In [42]:
def train(model,data_loader,optimizer,scheduler,device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids,attention_mask)
        loss = nn.CrossEntropyLoss()(outputs,labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [43]:
def evaluate(model,data_loader,device):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids,attention_mask)
            _, preds = torch.max(outputs,dim=1)
            predictions.extend(preds.cpu())
            actuals.extend(labels.cpu())
    return accuracy_score(actuals,predictions), classification_report(actuals,predictions)

In [44]:
def predict_relevance(text,model,tokenizer,device,max_length=128):
    model.eval()
    encoding = tokenizer(text,max_length=max_length,return_tensors='pt',padding=True,truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids,attention_mask)
        _, preds = torch.max(outputs,dim=1)
    return 'YES' if preds.item() == 1 else 'NO'

In [45]:
# Parameters
checkpoint ='bert-base-uncased'
num_classes = 2
max_length = 128
batch_size = 16
epochs = 4
lr = 2e-5

In [46]:
train_text, tmp_text, train_labels, tmp_labels = train_test_split(text,labels,train_size=0.8)
val_text, test_text, val_labels, test_labels = train_test_split(tmp_text, tmp_labels, train_size=0.5)

In [47]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenized_train_ds = TextClassificationDataset(train_text,train_labels,tokenizer,max_length)
tokenized_val_ds = TextClassificationDataset(val_text,val_labels,tokenizer,max_length)
tokenized_test_ds = TextClassificationDataset(test_text,test_labels,tokenizer,max_length)

tokenized_train_dl = DataLoader(tokenized_train_ds,batch_size,shuffle=True)
tokenized_val_dl = DataLoader(tokenized_val_ds,batch_size,shuffle=True)
tokenized_test_dl = DataLoader(tokenized_test_ds,batch_size,shuffle=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [48]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(checkpoint, num_classes).to(device)
optimizer = AdamW(model.parameters(), lr=lr)
total_steps = len(tokenized_train_dl) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [49]:
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train(model, tokenized_train_dl, optimizer, scheduler, device)
    accuracy, report = evaluate(model, tokenized_val_dl, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4
Validation Accuracy: 0.8198
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       653
           1       0.62      0.03      0.06       146

    accuracy                           0.82       799
   macro avg       0.72      0.51      0.48       799
weighted avg       0.79      0.82      0.75       799

Epoch 2/4
Validation Accuracy: 0.8010
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       653
           1       0.44      0.35      0.39       146

    accuracy                           0.80       799
   macro avg       0.65      0.63      0.64       799
weighted avg       0.78      0.80      0.79       799

Epoch 3/4
Validation Accuracy: 0.7885
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       653
           1       0.42      0.43      0.43       146

    accuracy                           0.79       799
   macro avg  

In [50]:
torch.save(model.state_dict(),'bert_classifier.pth')