<a href="https://colab.research.google.com/github/Savith-02/notebooks/blob/main/rnn_intent_classifier_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

In [4]:
def read_dataset(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            intent, sentence = line.strip().split(',')
            data.append((intent.strip(), sentence.strip()))
    return data

file_path = "drive/MyDrive/Code/rawData/data_small.txt"
dataset = read_dataset(file_path)[1:]
print(dataset[:5])
texts = [text for label, text in dataset]
labels = [label for label, text in dataset]

print(texts[:5], labels[:5])
# Example dataset
# texts = ["This is a good movie", "This is a bad movie", "I love this film", "I hate this film"]
# labels = [1, 0, 1, 0]

[('Greet', 'Hi'), ('Greet', 'Hello'), ('Greet', 'Hey there'), ('Greet', 'Good morning'), ('Greet', 'Howdy')]
['Hi', 'Hello', 'Hey there', 'Good morning', 'Howdy'] ['Greet', 'Greet', 'Greet', 'Greet', 'Greet']


In [5]:
print(len(texts))

365


In [6]:
all_intents = set(intent for intent, _ in dataset[1:])
intent_to_index = {intent: i for i, intent in enumerate(all_intents)}
index_to_intent = {intent: i for i, intent in intent_to_index.items()}
labels = list(map(lambda x: intent_to_index[x], labels))

In [7]:
all_intents

{'Complaint',
 'Farewell',
 'Feedback',
 'Greet',
 'Inquiry',
 'Navigation',
 'Request'}

In [8]:
indexes = set(index for index in labels)
print(f"All unique index count: {len(indexes)}")

All unique index count: 7


In [9]:
# Data Augmentation (Synonym Replacement)
def synonym_replacement(sentence):
    words = sentence.split()
    new_sentence = sentence
    for word in words:
        # Replace with a synonym (simple example, replace 'good' with 'nice')
        if word == "good":
            new_sentence = new_sentence.replace("good", "nice")
    return new_sentence

augmented_texts = [synonym_replacement(text) for text in texts]
texts.extend(augmented_texts)
labels.extend(labels)

In [10]:
num_classes = 7  # Update this number based on your dataset
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [132]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

from torch.nn.utils.rnn import pad_sequence

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=64,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

    def collate_fn(batch):
        input_ids = [item['input_ids'] for item in batch]
        attention_masks = [item['attention_mask'] for item in batch]
        labels = [item['label'] for item in batch]

        padded_input_ids = pad_sequence(input_ids, batch_first=True)
        padded_attention_masks = pad_sequence(attention_masks, batch_first=True)

        return {
            'input_ids': padded_input_ids,
            'attention_mask': padded_attention_masks,
            'label': torch.stack(labels)
        }

# Split dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, shuffle=True)

train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=train_dataset.collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=val_dataset.collate_fn)


In [133]:
print(f"Number of samples in training dataset: {len(train_dataset)}")
print(f"Number of samples in cv dataset: {len(val_dataset)}")

Number of samples in training dataset: 584
Number of samples in cv dataset: 146


In [134]:
train_dataset[5]

{'text': 'The product description did not accurately reflect its capabilities.',
 'input_ids': tensor([  101,  1996,  4031,  6412,  2106,  2025, 14125,  8339,  2049,  9859,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'label': tensor(6)}

In [135]:
# Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Training loop (simplified)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
model.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [136]:
for epoch in range(3):  # Number of epochs
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()


TypeError: TextDataset.collate_fn() takes 1 positional argument but 2 were given

In [None]:
# Validation loop (simplified)
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted = torch.max(outputs.logits, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Validation Accuracy:', correct / total)

In [11]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Data Augmentation (Synonym Replacement)
def synonym_replacement(sentence):
    words = sentence.split()
    new_sentence = sentence
    for word in words:
        # Replace with a synonym (simple example, replace 'good' with 'nice')
        if word == "good":
            new_sentence = new_sentence.replace("good", "nice")
    return new_sentence

augmented_texts = [synonym_replacement(text) for text in texts]
texts.extend(augmented_texts)
labels.extend(labels)

# Preprocessing with BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=64,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

    def collate_fn(batch):
        input_ids = [item['input_ids'] for item in batch]
        attention_masks = [item['attention_mask'] for item in batch]
        labels = [item['label'] for item in batch]

        padded_input_ids = pad_sequence(input_ids, batch_first=True)
        padded_attention_masks = pad_sequence(attention_masks, batch_first=True)

        return {
            'input_ids': padded_input_ids,
            'attention_mask': padded_attention_masks,
            'label': torch.stack(labels)
        }

# Split dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, shuffle=True)

train_dataset = TextDataset(train_texts, train_labels)
val_dataset = TextDataset(val_texts, val_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=TextDataset.collate_fn)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=TextDataset.collate_fn)

# Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=7)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

def train(model, train_loader, optimizer, criterion):
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(train_loader)

def evaluate(model, val_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    return accuracy, precision, recall, f1

# Training loop
NUM_EPOCHS = 3
for epoch in range(NUM_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion)
    val_accuracy, val_precision, val_recall, val_f1 = evaluate(model, val_loader)

    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}, Train Loss: {train_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}, Validation Precision: {val_precision:.4f}, Validation Recall: {val_recall:.4f}, Validation F1-Score: {val_f1:.4f}")




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Train Loss: 1.3194, Validation Accuracy: 0.9760, Validation Precision: 0.9779, Validation Recall: 0.9760, Validation F1-Score: 0.9756
Epoch 2/3, Train Loss: 0.3250, Validation Accuracy: 1.0000, Validation Precision: 1.0000, Validation Recall: 1.0000, Validation F1-Score: 1.0000
Epoch 3/3, Train Loss: 0.0929, Validation Accuracy: 1.0000, Validation Precision: 1.0000, Validation Recall: 1.0000, Validation F1-Score: 1.0000


In [14]:
def predict_text(text, model, tokenizer, device):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}  # Move tensors to device
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=1).tolist()[0]
    predicted_class = torch.argmax(logits, dim=1).item()
    return predicted_class, probabilities

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
text = "I guess this is the end"
predicted_class, probabilities = predict_text(text, model, tokenizer, device)
print("Predicted class:", predicted_class)
print("Class probabilities:", probabilities)
index_to_intent[predicted_class]

Predicted class: 1
Class probabilities: [0.02284310571849346, 0.6685171127319336, 0.01421208493411541, 0.06554695218801498, 0.019922928884625435, 0.1765691637992859, 0.03238866105675697]


'Farewell'