# BERT Text Classification

This notebook demonstrates how to use a pre-trained BERT model for text classification. We will use the `transformers` library by Hugging Face to fine-tune a BERT model on the dataset.

In [None]:
# Install necessary packages
%pip install transformers torch pandas scikit-learn

In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [3]:
# Load the dataset
# Data is expected to be in the parent directory
df = pd.read_csv('../cleaned_data.csv')

# We use the 'Content' column as the feature because BERT works best with full sentences/context.
# 'Cleaned_Content' might have removed stopwords/punctuation which BERT can utilize.
X = df['Content'].astype(str).tolist()
y = df['Label'].tolist()

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
num_classes = len(label_encoder.classes_)

print(f"Classes: {label_encoder.classes_}")
print(f"Number of samples: {len(X)}")

Classes: ["Alzheimer's Disease" 'Frontotemporal Dementia' 'Lewy Body Dementia'
 "Parkinson's Disease" 'Vascular Dementia']
Number of samples: 1000


In [4]:
# Load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize data
# We use max_length=128 for speed in this demo, but you can increase up to 512
inputs = tokenizer(X, padding=True, truncation=True, max_length=128, return_tensors="pt")

input_ids = inputs['input_ids']
attention_masks = inputs['attention_mask']
labels = torch.tensor(y_encoded)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
# Create Dataset and DataLoaders
dataset = TensorDataset(input_ids, attention_masks, labels)

# Split into training and validation sets (80-20 shuffle)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

batch_size = 8  # Reduced batch size to avoid OOM on smaller GPUs/CPU

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)

In [6]:
# Load Pre-trained BERT Model for Sequence Classification
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_classes,
    output_attentions=False,
    output_hidden_states=False,
)

model.to(device)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
classifier.bias                            | MISSING    | 
classifier.weight                          | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
# Setup Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 2  # Number of training epochs

In [8]:
# Training Loop
for epoch_i in range(0, epochs):
    print(f'======== Epoch {epoch_i + 1} / {epochs} ========')
    print('Training...')

    model.train()
    total_train_loss = 0

    for step, batch in enumerate(train_dataloader):
        if step % 10 == 0 and not step == 0:
            print(f'  Batch {step}  of  {len(train_dataloader)}.')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()

        outputs = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)

        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"  Average training loss: {avg_train_loss:.2f}")

print("Training complete!")

Training...
  Batch 10  of  100.
  Batch 20  of  100.
  Batch 30  of  100.
  Batch 40  of  100.
  Batch 50  of  100.
  Batch 60  of  100.
  Batch 70  of  100.
  Batch 80  of  100.
  Batch 90  of  100.
  Average training loss: 1.49
Training...
  Batch 10  of  100.
  Batch 20  of  100.
  Batch 30  of  100.
  Batch 40  of  100.
  Batch 50  of  100.
  Batch 60  of  100.
  Batch 70  of  100.
  Batch 80  of  100.
  Batch 90  of  100.
  Average training loss: 1.00
Training complete!


In [9]:
# Evaluation
print("Running Validation...")

model.eval()
predictions, true_labels = [], []

for batch in validation_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.extend(np.argmax(logits, axis=1).flatten())
    true_labels.extend(label_ids.flatten())

print("Classification Report:")
print(classification_report(true_labels, predictions, target_names=label_encoder.classes_))
print(f"Accuracy: {accuracy_score(true_labels, predictions)}")

Running Validation...
Classification Report:
                         precision    recall  f1-score   support

    Alzheimer's Disease       0.58      0.73      0.65        41
Frontotemporal Dementia       0.88      0.61      0.72        36
     Lewy Body Dementia       0.68      0.59      0.63        39
    Parkinson's Disease       0.71      0.71      0.71        49
      Vascular Dementia       0.70      0.80      0.75        35

               accuracy                           0.69       200
              macro avg       0.71      0.69      0.69       200
           weighted avg       0.71      0.69      0.69       200

Accuracy: 0.69


In [10]:
# Save the model and tokenizer
output_dir = './bert_classifier_model'
import os

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to ./bert_classifier_model


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('./bert_classifier_model\\tokenizer_config.json',
 './bert_classifier_model\\tokenizer.json')