In [1]:
#DOCUMENT CLASSIFICATION USING BERT AND TRANSFORMERS

#S.K.M.Sanjana (22BCE7474)

# Install required packages
!pip install transformers torch pandas scikit-learn tqdm

import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score, f1_score
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import pandas as pd

# For file upload in Colab
from google.colab import files

# Step 1: Upload your dataset
# files.upload() opens the Colab upload widget; the returned mapping is keyed
# by file name. Select a CSV file with 'text' and 'label' columns.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty result surfaces as an opaque IndexError.
    raise ValueError("No file was uploaded; please select a CSV file.")
file_name = next(iter(uploaded))  # first uploaded file name

# Load dataset
df = pd.read_csv(file_name)
print("\nDataset Preview:")
print(df.head())

# Configuration
TEXT_COLUMN = "text"
LABEL_COLUMN = "label"

# Fail early, with a readable message, when the CSV lacks an expected column.
missing_columns = [col for col in (TEXT_COLUMN, LABEL_COLUMN) if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {missing_columns}")

# Rows without a text or a label are unusable: a missing label would be
# encoded below as category code -1, which is not a valid class index.
df = df.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN]).reset_index(drop=True)

# Label preprocessing: build the categorical once, keep the id -> name mapping
# for reporting, and replace the label column by its integer codes.
label_categories = df[LABEL_COLUMN].astype("category")
label_map = dict(enumerate(label_categories.cat.categories))
df[LABEL_COLUMN] = label_categories.cat.codes

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Fixed Dataset Class
class DocumentDataset(Dataset):
    """Map-style dataset that tokenizes one document per requested item.

    Args:
        texts: Iterable of documents (pandas Series, list, tuple, ...). Each
            value is converted with ``str()`` before tokenization.
        labels: Iterable of integer class ids, one per document.
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")`` and
            returning a mapping with ``"input_ids"`` and ``"attention_mask"``
            tensors that carry a leading batch dimension of size 1.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise as plain lists so that positional indexing works for any
        # input container, independent of a pandas index.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of documents."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize document ``idx`` and return its tensors and label.

        Returns:
            dict with keys ``"input_ids"`` and ``"attention_mask"`` (tokenizer
            output with the leading batch dimension removed) and ``"label"``
            (scalar ``torch.long`` tensor).
        """
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # squeeze(0) drops the batch dimension added by return_tensors="pt"
            # so the DataLoader can stack items into a batch itself.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Create dataset
dataset = DocumentDataset(df[TEXT_COLUMN], df[LABEL_COLUMN], tokenizer)

# Train/Validation split (80% / 20%).
# A seeded generator makes the split — and therefore the reported validation
# metrics — reproducible from one run to the next.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# DataLoader setup: only the training data is shuffled.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

# Choose the compute device: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model initialization: load the pretrained checkpoint with one output per
# entry in label_map, then move it to the chosen device.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(label_map)
).to(device)

# Training setup: cross-entropy over the logits, optimized with AdamW.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Training loop: EPOCHS full passes over train_loader, printing the mean
# per-batch loss after each pass.
EPOCHS = 3
for epoch in range(EPOCHS):
    model.train()
    batch_losses = []
    progress = tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{EPOCHS}")
    for batch in progress:
        # Move the batch tensors to the same device as the model.
        token_ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        targets = batch["label"].to(device)

        # Standard step: clear gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        logits = model(token_ids, attention_mask=mask).logits
        step_loss = loss_fn(logits, targets)
        step_loss.backward()
        optimizer.step()
        batch_losses.append(step_loss.item())

    avg_loss = sum(batch_losses) / len(train_loader)
    print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")

# Evaluation function
def evaluate_model(model, dataloader, device, label_map):
    """Run the model over ``dataloader`` and compute classification metrics.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``; switched to eval mode here.
        dataloader: Iterable of batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` tensors.
        device: Device the input tensors are moved to before the forward pass.
        label_map: Mapping from integer class id to class name.

    Returns:
        Tuple ``(accuracy, weighted_f1, report)`` where ``report`` is the text
        produced by ``classification_report``.
    """
    model.eval()
    preds, true_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            predictions = torch.argmax(logits, dim=-1)

            preds.extend(predictions.cpu().tolist())
            # Labels are only compared on the host, so they never need to be
            # moved to the device.
            true_labels.extend(batch["label"].tolist())

    accuracy = accuracy_score(true_labels, preds)
    f1 = f1_score(true_labels, preds, average="weighted")

    # Pass the class ids explicitly: with target_names alone the report raises
    # when a class is absent from both the true and predicted labels, because
    # the number of names no longer matches the classes it finds.
    class_ids = list(label_map.keys())
    class_names = [str(label_map[class_id]) for class_id in class_ids]
    report = classification_report(
        true_labels,
        preds,
        labels=class_ids,
        target_names=class_names,
    )

    return accuracy, f1, report

# Validation evaluation: score the trained model on the held-out loader and
# print the metrics one per line.
val_accuracy, val_f1, val_report = evaluate_model(
    model, val_loader, device, label_map
)
for line in (
    "\nValidation Results:",
    f"Accuracy: {val_accuracy:.4f}",
    f"F1 Score: {val_f1:.4f}",
    "Classification Report:",
    val_report,
):
    print(line)

# Prediction function
def predict_text(model, tokenizer, text, device, label_map, max_length=128):
    """Classify a single text and return its label with class probabilities.

    Args:
        model: Callable as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``; switched to eval mode here.
        tokenizer: Callable returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors for the given text.
        text: The document to classify.
        device: Device the encoded tensors are moved to.
        label_map: Mapping from predicted class index to class name.
        max_length: Padding/truncation length forwarded to the tokenizer.
            Defaults to 128; pass the value used when building the training
            dataset if it differs.

    Returns:
        Tuple ``(label, probabilities)``: the ``label_map`` entry for the
        highest-probability class and the softmax probabilities of the single
        input as a NumPy array.
    """
    model.eval()
    encoding = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        probs = F.softmax(logits, dim=-1)
        prediction = torch.argmax(probs, dim=-1).item()

    # Index 0 selects the only row of the (1, num_classes) probability matrix.
    return label_map[prediction], probs.cpu().numpy()[0]

# Test predictions: run a few hand-written example texts through the model
# and print the predicted label and class probabilities for each.
test_samples = [
    "Invoice number 12345 for services rendered",
    "Monthly financial report Q3 2023",
    "Purchase order request for 100 units",
    "Shipping confirmation for order #7890",
]

print("\nTest Predictions:")
for sample in test_samples:
    pred_label, probabilities = predict_text(
        model, tokenizer, sample, device, label_map
    )
    print(
        f"\nText: {sample}",
        f"Predicted: {pred_label}",
        f"Probabilities: {probabilities}",
        sep="\n",
    )


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

Saving company-document-text.csv to company-document-text.csv

Dataset Preview:
                                                text          label  \
0  order id  10718 shipping details  ship name  k...  ShippingOrder   
1  invoice order id  10707 customer id  arout ord...        invoice   
2  order id  10448 shipping details  ship name  r...  ShippingOrder   
3  invoice order id  11068 customer id  queen ord...        invoice   
4  order id  10656 shipping details  ship name  g...  ShippingOrder   

   word_count  
0         120  
1          66  
2          96  
3          68  
4         109  


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1/3: 100%|██████████| 67/67 [46:52<00:00, 41.98s/it]


Epoch 1 completed. Average Loss: 0.3814


Training Epoch 2/3: 100%|██████████| 67/67 [46:41<00:00, 41.81s/it]


Epoch 2 completed. Average Loss: 0.0344


Training Epoch 3/3: 100%|██████████| 67/67 [46:41<00:00, 41.82s/it]


Epoch 3 completed. Average Loss: 0.0108

Validation Results:
Accuracy: 1.0000
F1 Score: 1.0000
Classification Report:
                precision    recall  f1-score   support

 ShippingOrder       1.00      1.00      1.00       173
       invoice       1.00      1.00      1.00       163
purchase Order       1.00      1.00      1.00       162
        report       1.00      1.00      1.00        38

      accuracy                           1.00       536
     macro avg       1.00      1.00      1.00       536
  weighted avg       1.00      1.00      1.00       536


Test Predictions:

Text: Invoice number 12345 for services rendered
Predicted: purchase Order
Probabilities: [0.09937808 0.1583935  0.5840553  0.15817314]

Text: Monthly financial report Q3 2023
Predicted: purchase Order
Probabilities: [0.0199279  0.11074626 0.7156536  0.15367226]

Text: Purchase order request for 100 units
Predicted: purchase Order
Probabilities: [0.01484638 0.01010433 0.9497017  0.02534752]

Text: Shipping c