# Text Classification-Classifying the text as Good or Bad

In [1]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
import os

In [9]:
df = pd.read_csv('text_classification.csv')
df.head()

Unnamed: 0,text,label
0,Biting your lip or cheek when stressed,Bad habit
1,Finding ways to reduce plastic waste by using ...,Good Habit
2,Not being mindful of your environmental impact,Bad habit
3,Learning a new skill or hobby to promote perso...,Good Habit
4,Being open to constructive feedback and criticism,Good Habit


In [10]:
len(df['text'])

400

In [11]:
good = 0
bad = 0
for i in df['label']:
    if i == 'Good Habit':
        good+=1
    else:
        bad+=1

print("No. of Good habits: ", good)
print("No. of Bad habits: ", bad)

No. of Good habits:  200
No. of Bad habits:  200


In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

input_ids = []
attention_masks = []

for text in df['text']:
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 64,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = []
for i in df['label']:
  if i == 'Good Habit':
    labels.append(1)
  else:
    labels.append(0)
labels = torch.tensor(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [13]:
dataset = TensorDataset(input_ids, attention_masks, labels)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

In [14]:
batch_size = 32
train_dataloader = DataLoader(train_dataset,
                              sampler = RandomSampler(train_dataset),
                              batch_size = batch_size)

test_dataloader = DataLoader(test_dataset,
                             sampler = SequentialSampler(test_dataset),
                             batch_size = batch_size)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)
model.to(device)

optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8)

epochs = 4
total_steps = len(train_dataloader) * epochs

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        batch_input_ids = batch[0].to(device)
        batch_attention_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(batch_input_ids,
                        token_type_ids=None,
                        attention_mask=batch_attention_masks,
                        labels=batch_labels)

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print("Average training loss: {}".format(avg_train_loss))


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average training loss: 0.670391833782196
Average training loss: 0.401170539855957
Average training loss: 0.2001398652791977
Average training loss: 0.09888949096202851


In [16]:
model.eval()

predictions = []
true_labels = []

for batch in test_dataloader:
    batch_input_ids = batch[0].to(device)
    batch_attention_masks = batch[1].to(device)
    batch_labels = batch[2].to(device)

    with torch.no_grad():
        outputs = model(batch_input_ids,
                        token_type_ids=None,
                        attention_mask=batch_attention_masks)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = batch_labels.to('cpu').numpy()

    batch_predictions = logits.argmax(axis=1).flatten()
    batch_labels = label_ids.flatten()

    predictions.extend(batch_predictions)
    true_labels.extend(batch_labels)

accuracy = accuracy_score(true_labels, predictions)
print("Test accuracy: {}".format(accuracy))


Test accuracy: 1.0


As we can see, it gives a perfect 100% accuracy on out test data. It is important to note that achieving 100% accuracy on test data does not necessarily mean that the BERT model is perfect for text classification on the dataset. While the model may perform well on the specific data it was trained and tested on, its performance may vary when applied to new and unseen data.