In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Path to the dataset
data_path = "bbc"

data = []
labels = []

# Load data from each folder
for label in os.listdir(data_path):
    folder_path = os.path.join(data_path, label)
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                data.append(text)
                labels.append(label)

# Create a DataFrame
df = pd.DataFrame({'text': data, 'label': labels})

# Split into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42)

In [5]:
df['label'].unique()

array(['entertainment', 'business', 'sport', 'politics', 'tech'],
      dtype=object)

In [6]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text data
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512, return_tensors='pt')
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=512, return_tensors='pt')

In [7]:
import torch

# Convert labels to numeric format
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_labels_encoded = torch.tensor(label_encoder.fit_transform(train_labels))
test_labels_encoded = torch.tensor(label_encoder.transform(test_labels))

# Create PyTorch datasets
class BBCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.labels[idx]

train_dataset = BBCDataset(train_encodings, train_labels_encoded)
test_dataset = BBCDataset(test_encodings, test_labels_encoded)

In [8]:
from transformers import BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader

# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Define DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Set optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW
from torch.nn.functional import softmax

# Define training function
def train(model, train_loader, optimizer, epochs=4):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            inputs, labels = batch
            optimizer.zero_grad()
            outputs = model(**inputs)
            loss = CrossEntropyLoss()(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader)}")

train(model, train_loader, optimizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.labels[idx]


Epoch 1 Loss: 0.3323753818902852
Epoch 2 Loss: 0.07535161263409204
Epoch 3 Loss: 0.06179204572358036
Epoch 4 Loss: 0.052049135712357905


In [11]:
from sklearn.metrics import accuracy_score

def evaluate(model, test_loader):
    model.eval()  # Set model to evaluation mode
    all_predictions = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation
        for batch in test_loader:
            inputs, labels = batch
            outputs = model(**inputs)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())  # Collect predictions
            all_labels.extend(labels.cpu().numpy())  # Collect true labels

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_predictions)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    return accuracy

# Evaluate the model
evaluate(model, test_loader)

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.labels[idx]


Test Accuracy: 97.30%


0.9730337078651685

In [12]:
model.save_pretrained("./bert-classification-model")
tokenizer.save_pretrained("./bert-classification-model")

('./bert-classification-model/tokenizer_config.json',
 './bert-classification-model/special_tokens_map.json',
 './bert-classification-model/vocab.txt',
 './bert-classification-model/added_tokens.json')

In [13]:
from transformers import BertForSequenceClassification
import torch

# Path to the saved model
saved_model_path = "./bert-classification-model"

# Load the saved model
model = BertForSequenceClassification.from_pretrained(saved_model_path)

# Ensure the model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [14]:
from sklearn.metrics import accuracy_score

def evaluate(model, test_loader):
    model.eval()  # Set model to evaluation mode
    all_predictions = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation
        for batch in test_loader:
            inputs, labels = batch

            # Move inputs and labels to the correct device
            inputs = {key: val.to(device) for key, val in inputs.items()}
            labels = labels.to(device)

            # Get outputs and predictions
            outputs = model(**inputs)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            
            # Collect predictions and true labels
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate accuracy
    accuracy = accuracy_score(all_labels, all_predictions)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    return accuracy

# Call the evaluate function
evaluate(model, test_loader)

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.labels[idx]


Test Accuracy: 97.30%


0.9730337078651685

In [15]:
def print_example_with_prediction(model, test_loader):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        for batch in test_loader:
            inputs, labels = batch
            
            # Move inputs and labels to the correct device
            inputs = {key: val.to(device) for key, val in inputs.items()}
            labels = labels.to(device)

            # Get outputs and predictions
            outputs = model(**inputs)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            # Convert tensor to readable format
            example_index = 0  # Change this to see other examples
            input_ids = inputs['input_ids'][example_index]
            attention_mask = inputs['attention_mask'][example_index]
            true_label = labels[example_index].item()
            predicted_label = predictions[example_index].item()

            # Decode input_ids to get the original text
            decoded_text = tokenizer.decode(input_ids, skip_special_tokens=True)

            # Print details
            print(f"Input Text: {decoded_text}")
            print(f"True Label: {true_label}")
            print(f"Predicted Label: {predicted_label}")

            # Stop after one example
            break

# Call the function
print_example_with_prediction(model, test_loader)

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}, self.labels[idx]


Input Text: china now top trader with japan china overtook the us to become japan's biggest trading partner in 2004, according to numbers released by japan's finance ministry on wednesday. china accounted for 20. 1 % of japan's trade in 2004, compared with 18. 6 % for the us. in 2003, the us was ahead with 20. 5 % and china came second with 19. 2 %. the change highlights china's growing importance as an economic powerhouse. in 2004, japan's imports from and exports to china ( and hong kong ) added up to 22, 201bn yen ( $ 214. 6bn ; £114. 5bn ). this is the highest figure for japanese trade with china since records began in 1947. it compares with 20, 479. 5bn yen in trade with the us. trade with the us during 2004 was hurt by one - off factors, including a 13 - month ban on us beef imports following the discovery of a cow infected with mad cow disease ( bse ) in the us. however, economists predict china will become an even more important japanese trading partner in the coming years. on 