In [None]:
import pandas as pd
import os

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm.auto import tqdm

# Hugging Face (Transformers) BERT Sequence Classification
This notebook utilizes the Hugging Face Transformers library to perform sequence classification using a pretrained bidirectional transformer on unlabeled data. One such model we use is called BERT. BERT is employed as both a tokenizer and a sequence classification model. This notebook also makes heavy use of `AutoTokenizer` and `AutoModel` to easily integrate the BERT model for this classification task.

## Detailed information

### BERT
In this notebook the `bert-bases-uncases` with 110M parameters from google is used. [MORE INFROMATION IS COMMING] 
- 12 transformer block layers
- Hidden size of 768
- linear layer and softmax

![Alt text for the image](https://www.researchgate.net/publication/374608193/figure/fig2/AS:11431281210596149@1702055674618/BERT-base-uncased-model-architecture-which-comprises-12-transformer-block-layers-each.tif)

Documentation
- https://huggingface.co/google-bert/bert-base-uncased 
- https://huggingface.co/docs/transformers/en/model_doc/bert

### AutoTokenizer and Automodel
[MORE INFROMATION IS COMMING] 

Documentation
- https://huggingface.co/transformers/v3.0.2/model_doc/auto.html

## Load and Format Data
This dataset is from the AG-news dataset, which contains `Descriptions` and classifications for 5 different types of news called `Class Index` column. The dataset is being used for early testing until the project's main dataset is ready. We going to drop the `Title` column, because we are not using it.

In [9]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train_df = train_df.drop(['Title'], axis=1)
test_df = test_df.drop(['Title'], axis=1)
train_df.head()

Unnamed: 0,Class Index,Description
0,3,"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Reuters - Private investment firm Carlyle Grou...
2,3,Reuters - Soaring crude prices plus worries\ab...
3,3,Reuters - Authorities have halted oil export\f...
4,3,"AFP - Tearaway world oil prices, toppling reco..."


In [None]:
dataset = pd.read_csv('../data/filtered_events_class.csv')
dataset = dataset[['class', 'clean_notes']]

# Remove all rows with an class name NoN
dataset = dataset[dataset['class'] != 'NoN']

# print classes types
print(dataset['class'].value_counts())

class
labor rights                 28817
education                    19424
health care                   8508
palestine-israel conflict     7197
environment                   6515
climate                       5379
democracy                     3339
anti right wing               3329
police violence               2026
women rights                  1492
eviction                      1049
infrastructure                 927
immigration                    845
tourism                        506
skip                           145
anti crime                     119
youth violence                   9
Name: count, dtype: int64


In [36]:
Index_to_class = {
    1: 'labor rights',
    2: 'education',
    3: 'health care',
    4: 'palestine-israel conflict',
    5: 'environment',
    6: 'climate',
    7: 'democracy',
    8: 'anti right wing',
    9: 'police violence',
    10: 'women rights',
    11: 'eviction',
    12: 'infrastructure',
    13: 'immigration',
    14: 'skip',
    15: 'anti crime',
    16: 'youth violence',
    17: 'tourism',
}

class_to_index = {v: k for k, v in Index_to_class.items()}

# Convert the class names to indices
dataset['class'] = dataset['class'].map(class_to_index)
# Ensure the 'class' column is of type int
dataset['class'] = dataset['class'].astype(int)

# Split the dataset into train and test sets
train_df = dataset.sample(frac=0.8, random_state=42)
test_df = dataset.drop(train_df.index)

# count the number oof diffrent classes
class_counts = train_df['class'].value_counts()


len(dataset), len(train_df) , len(test_df), class_counts


(89626,
 71701,
 17925,
 class
 1     23032
 2     15453
 3      6844
 4      5797
 5      5214
 6      4297
 7      2713
 8      2659
 9      1641
 10     1188
 11      833
 12      723
 13      678
 17      411
 14      118
 15       93
 16        7
 Name: count, dtype: int64)

### Tokenizers

In [38]:
tokenizer_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

MAX_LEN_HF = 128

def tokenize_function(texts, tokenizer, max_len):
    return tokenizer(texts,
                     padding='max_length',
                     truncation=True,
                     max_length=max_len,
                     return_tensors='pt')


train_encodings = tokenize_function(train_df['clean_notes'].tolist(), tokenizer, MAX_LEN_HF)
test_encodings = tokenize_function(test_df['clean_notes'].tolist(), tokenizer, MAX_LEN_HF)


y_train_hf = torch.tensor((train_df['class'] - 1).values, dtype=torch.long)
y_test_hf = torch.tensor((test_df['class'] - 1).values, dtype=torch.long)

print("\nHugging Face Tokenizer - First training input_ids:", train_encodings['input_ids'][0])
print("Hugging Face Tokenizer - First training attention_mask:", train_encodings['attention_mask'][0])
print("Corresponding training label:", y_train_hf[0])
print(f"Max sequence length (HF): {MAX_LEN_HF}")
print(f"Vocabulary size (HF): {tokenizer.vocab_size}")


Hugging Face Tokenizer - First training input_ids: tensor([  101,  2655,  2310,  2099,  4487,  2105,  8148,  7309,  9813,  6186,
        19817,  4887, 15493, 21350,  3020, 11897,  2488,  2551,  4650, 22417,
        14558,  2066,  2769,  3071, 11421,  2724,  2112,  7289,  6186,  2929,
         3433,  2231, 10886,  3749,  3988, 19905,  2461,  2254,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            

### Creating Data loaders

In [39]:
class AGNewsDatasetHF(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

train_dataset_hf = AGNewsDatasetHF(train_encodings, y_train_hf)
test_dataset_hf = AGNewsDatasetHF(test_encodings, y_test_hf)

print("\nHugging Face Dataset - First item:", train_dataset_hf[0])


Hugging Face Dataset - First item: {'input_ids': tensor([  101,  2655,  2310,  2099,  4487,  2105,  8148,  7309,  9813,  6186,
        19817,  4887, 15493, 21350,  3020, 11897,  2488,  2551,  4650, 22417,
        14558,  2066,  2769,  3071, 11421,  2724,  2112,  7289,  6186,  2929,
         3433,  2231, 10886,  3749,  3988, 19905,  2461,  2254,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,

In [40]:
batch_size = 16

train_loader_hf = DataLoader(train_dataset_hf, batch_size=batch_size, shuffle=True)
test_loader_hf = DataLoader(test_dataset_hf, batch_size=batch_size, shuffle=False)

print(f"\nHugging Face DataLoader - Number of batches in train_loader: {len(train_loader_hf)}")
for batch in train_loader_hf:
    print("Hugging Face DataLoader - Keys in batch:", batch.keys())
    print("Hugging Face DataLoader - Shape of input_ids:", batch['input_ids'].shape) # (batch_size, MAX_LEN_HF)
    print("Hugging Face DataLoader - Shape of labels:", batch['labels'].shape) # (batch_size,)
    break


Hugging Face DataLoader - Number of batches in train_loader: 4482
Hugging Face DataLoader - Keys in batch: dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
Hugging Face DataLoader - Shape of input_ids: torch.Size([16, 128])
Hugging Face DataLoader - Shape of labels: torch.Size([16])


## Model

In [41]:
model_hf = AutoModelForSequenceClassification.from_pretrained(tokenizer_name, num_labels=16)
print("\nHugging Face Transformer Model:\n", model_hf)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Hugging Face Transformer Model:
 BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

### Training

In [42]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model_hf.to(device)

learning_rate_hf = 5e-5
num_epochs_hf = 3

optimizer_hf = AdamW(model_hf.parameters(), lr=learning_rate_hf)

print("\nStarting training for Hugging Face Transformer Model...")
for epoch in range(num_epochs_hf):
    model_hf.train()
    total_loss = 0
    for batch in tqdm(train_loader_hf, desc=f"Training Epoch {epoch+1}"):
        optimizer_hf.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_hf(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer_hf.step()

    avg_train_loss = total_loss / len(train_loader_hf)
    print(f"Epoch {epoch+1}: Average Training Loss: {avg_train_loss:.4f}")

print("Hugging Face Transformer Training finished.")


Starting training for Hugging Face Transformer Model...


Training Epoch 1: 100%|██████████| 4482/4482 [44:44<00:00,  1.67it/s]


Epoch 1: Average Training Loss: 0.1066


Training Epoch 2: 100%|██████████| 4482/4482 [1:19:34<00:00,  1.07s/it]   


Epoch 2: Average Training Loss: 0.0399


Training Epoch 3: 100%|██████████| 4482/4482 [7:42:01<00:00,  6.19s/it]     

Epoch 3: Average Training Loss: 0.0310
Hugging Face Transformer Training finished.





### Evaluation

In [43]:
model_hf.eval()
correct_predictions_hf = 0
total_predictions_hf = 0
with torch.no_grad():
    for batch in tqdm(test_loader_hf, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model_hf(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        _, predicted = torch.max(logits, 1)
        total_predictions_hf += labels.size(0)
        correct_predictions_hf += (predicted == labels).sum().item()

accuracy_hf = correct_predictions_hf / total_predictions_hf
print(f"Hugging Face Transformer Test Accuracy: {accuracy_hf:.4f}")

Evaluating: 100%|██████████| 1121/1121 [43:19<00:00,  2.32s/it]   

Hugging Face Transformer Test Accuracy: 0.9778





## Manage the Model

In [44]:
output_dir = "./ag_news_model_saved"

### Saving the Model

In [45]:
import os

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print(f"Saving model to {output_dir}")
model_hf.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("Model and tokenizer saved successfully!")



Saving model to ./ag_news_model_saved
Model and tokenizer saved successfully!


### Loading the Model

In [47]:
print(f"\n--- Loading the model from {output_dir} ---")
loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)
loaded_model = AutoModelForSequenceClassification.from_pretrained(output_dir)

if torch.backends.mps.is_available():
    load_device = torch.device("mps")
else:
    load_device = torch.device("cpu")

loaded_model.to(load_device)

print("Model and tokenizer loaded successfully!")
print("Model architecture:", loaded_model)


--- Loading the model from ./ag_news_model_saved ---
Model and tokenizer loaded successfully!
Model architecture: BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
            

In [46]:
dataset = pd.read_csv('../data/filtered_events_class.csv')
dataset = dataset[['class', 'clean_notes']]

# Remove all rows with an class name NoN
dataset = dataset[dataset['class'] == 'NoN']

In [49]:
# predict the class of the first 10 rows
for i in range(200):
    text = dataset['clean_notes'].iloc[i]
    inputs = loaded_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LEN_HF)
    inputs = {key: val.to(load_device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = loaded_model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item() + 1  # Adding 1 to match the original class index

    print(f"Text: {text}\nPredicted Class Index: {predicted_class}, Class Name: {Index_to_class[predicted_class]}\n")

Text: call nation corsican association patriot patriotti around 60 people gathered outside court justice bastia corse express solidarity 10 corsican nationalist activist arrested previous day paris suspicion carrying terrorist attack corsica 2021 2025 gathering began peacefully escalated clash police demonstrator set fire trash can lit flare participant condemned arrest politically motivated opposed corsican autonomy denounced described police brutality operation
Predicted Class Index: 2, Class Name: education

Text: call nation corsican association patriot patriotti around 60 people gathered outside court justice bastia corse express solidarity 10 corsican nationalist activist arrested previous day paris suspicion carrying terrorist attack corsica 2021 2025 gathering began peacefully escalated clash police demonstrator set fire trash can lit flare participant condemned arrest politically motivated opposed corsican autonomy denounced described police brutality operation
Predicted Class