In [1]:
import pandas as pd
import os

import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Hugging Face (Transformers) BERT Sequence Classification
This notebook uses the Hugging Face Transformers library to perform sequence classification with a pretrained bidirectional transformer on unlabeled data. The model we use is BERT, which serves as both the tokenizer and the sequence classification model. The notebook relies heavily on `AutoTokenizer` and `AutoModelForSequenceClassification` to integrate BERT into this classification task.

## Detailed information

### BERT
In this notebook, the `bert-base-uncased` model from Google, with 110M parameters, is used. [MORE INFORMATION IS COMING]
- 12 transformer block layers
- Hidden size of 768
- linear layer and softmax

![BERT-base-uncased model architecture with 12 transformer block layers](https://www.researchgate.net/publication/374608193/figure/fig2/AS:11431281210596149@1702055674618/BERT-base-uncased-model-architecture-which-comprises-12-transformer-block-layers-each.tif)

Documentation
- https://huggingface.co/google-bert/bert-base-uncased 
- https://huggingface.co/docs/transformers/en/model_doc/bert

### AutoTokenizer and AutoModel
[MORE INFORMATION IS COMING]

Documentation
- https://huggingface.co/transformers/v3.0.2/model_doc/auto.html

## Load and Format Data
This dataset is from the AG News dataset, which contains a `Description` column and a `Class Index` column that classifies each item into one of 5 different types of news. The dataset is being used for early testing until the project's main dataset is ready. We are going to drop the `Title` column because we do not use it.

In [None]:
#train_df = pd.read_csv('train.csv')
#test_df = pd.read_csv('test.csv')

#train_df = train_df.drop(['Title'], axis=1)
#test_df = test_df.drop(['Title'], axis=1)
#train_df.head()

Unnamed: 0,Class Index,Description
0,3,"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Reuters - Private investment firm Carlyle Grou...
2,3,Reuters - Soaring crude prices plus worries\ab...
3,3,Reuters - Authorities have halted oil export\f...
4,3,"AFP - Tearaway world oil prices, toppling reco..."


In [50]:
# Load the class-labelled events table.
# (Optional filters that used to live here -- keeping only the `class` /
# `clean_notes` columns and dropping rows whose class is 'NoN' -- are disabled.)
dataset = pd.read_csv('../data/filtered_events_class.csv')

# Show how many rows belong to each class.
rows_per_class = dataset['class'].value_counts()
print(rows_per_class)

class
environment                             6000
Culture                                 6000
Education                               6000
Palestine-Israel Conflict               6000
Labor Rights                            6000
Public Services & Social Welfare        6000
Justice & Civil Rights                  6000
Climate Action & Animal Welfare         6000
Political & Democratic Governance       5911
Ukraine-Russia War                      5587
Infrastructure                          5427
Climate Action & Resource Management    4404
Name: count, dtype: int64


In [None]:
# Mapping between the 1-based integer label ids used for training and the
# class names stored in the `class` column of the dataset.
Index_to_class = {
    1: 'environment',
    2: 'Culture',
    3: 'Education',
    4: 'Palestine-Israel Conflict',
    5: 'Labor Rights',
    6: 'Public Services & Social Welfare',
    7: 'Justice & Civil Rights',
    8: 'Climate Action & Animal Welfare',
    9: 'Political & Democratic Governance',
    10: 'Ukraine-Russia War',
    11: 'Infrastructure',
    12: 'Climate Action & Resource Management',
}

# Inverse lookup: class name -> integer label id.
class_to_index = {v: k for k, v in Index_to_class.items()}

# Encode the class names as integer ids.
# The dtype guard keeps this cell idempotent: re-running it on an already
# encoded column would otherwise map every id to NaN and fail in the cast.
if not pd.api.types.is_integer_dtype(dataset['class']):
    class_ids = dataset['class'].map(class_to_index)
    # Fail early with the offending names instead of an opaque NaN-cast error.
    unmapped = dataset.loc[class_ids.isna(), 'class'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Classes missing from Index_to_class: {list(unmapped)}")
    dataset['class'] = class_ids.astype(int)

# Split the dataset into train and test sets (80 % / 20 %, fixed seed).
# The test set is every row whose index label was not sampled for training.
train_df = dataset.sample(frac=0.8, random_state=42)
test_df = dataset.drop(train_df.index)
class_counts = train_df['class'].value_counts()


len(dataset), len(train_df), len(test_df), class_counts


(69329,
 55463,
 13866,
 class
 6     4830
 5     4815
 1     4807
 7     4803
 8     4795
 4     4791
 2     4775
 3     4759
 9     4699
 10    4513
 11    4323
 12    3553
 Name: count, dtype: int64)

### Tokenizers

In [52]:
# Fixed sequence length handed to the tokenizer as `max_length`.
MAX_LEN_HF = 128

# Checkpoint identifier used to load the tokenizer.
tokenizer_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

def tokenize_function(texts, tokenizer, max_len):
    """Run `tokenizer` on `texts` with fixed-length padding and truncation.

    Args:
        texts: Input handed to the tokenizer as its first positional argument.
        tokenizer: Callable tokenizer accepting the keyword options below.
        max_len: Value forwarded as `max_length`.

    Returns:
        Whatever `tokenizer` returns when called with `padding='max_length'`,
        `truncation=True`, `max_length=max_len` and `return_tensors='pt'`.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_len,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **encode_options)


# Tokenize the cleaned notes of both splits.
train_texts = train_df['clean_notes'].tolist()
test_texts = test_df['clean_notes'].tolist()
train_encodings = tokenize_function(train_texts, tokenizer, MAX_LEN_HF)
test_encodings = tokenize_function(test_texts, tokenizer, MAX_LEN_HF)

# Shift the stored class ids down by one before building the label tensors.
y_train_hf = torch.tensor(train_df['class'].sub(1).to_numpy(), dtype=torch.long)
y_test_hf = torch.tensor(test_df['class'].sub(1).to_numpy(), dtype=torch.long)

# Sanity-check the first encoded training example and the tokenizer settings.
first_input_ids = train_encodings['input_ids'][0]
first_attention_mask = train_encodings['attention_mask'][0]
print("\nHugging Face Tokenizer - First training input_ids:", first_input_ids)
print("Hugging Face Tokenizer - First training attention_mask:", first_attention_mask)
print("Corresponding training label:", y_train_hf[0])
print(f"Max sequence length (HF): {MAX_LEN_HF}")
print(f"Vocabulary size (HF): {tokenizer.vocab_size}")


Hugging Face Tokenizer - First training input_ids: tensor([  101,  5958,  2925,  7423,  7645,  4957, 17686,  9808,  3334,  3995,
        19270,  4009,  3086,  4483,  3277,  5157,  2895,  4785,  2689,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            

### Creating Data loaders

In [54]:
class AGNewsDatasetHF(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to an indexable collection and
    `labels` is an indexable collection; item `idx` combines the `idx`-th
    entry of every field with the `idx`-th label.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the labels without copying them."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at `idx` plus a 'labels' entry."""
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Wrap the encodings and label tensors of each split in a Dataset.
train_dataset_hf = AGNewsDatasetHF(encodings=train_encodings, labels=y_train_hf)
test_dataset_hf = AGNewsDatasetHF(encodings=test_encodings, labels=y_test_hf)

# Inspect one item to confirm the fields it carries.
first_item = train_dataset_hf[0]
print("\nHugging Face Dataset - First item:", first_item)


Hugging Face Dataset - First item: {'input_ids': tensor([  101,  5958,  2925,  7423,  7645,  4957, 17686,  9808,  3334,  3995,
        19270,  4009,  3086,  4483,  3277,  5157,  2895,  4785,  2689,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,

In [None]:
batch_size = 16

# Shuffle only the training split; the test split keeps its order.
train_loader_hf = DataLoader(dataset=train_dataset_hf, batch_size=batch_size, shuffle=True)
test_loader_hf = DataLoader(dataset=test_dataset_hf, batch_size=batch_size, shuffle=False)

print(f"\nHugging Face DataLoader - Number of batches in train_loader: {len(train_loader_hf)}")

# Pull a single batch (if there is one) to confirm its keys and tensor shapes.
batch = next(iter(train_loader_hf), None)
if batch is not None:
    print("Hugging Face DataLoader - Keys in batch:", batch.keys())
    print("Hugging Face DataLoader - Shape of input_ids:", batch['input_ids'].shape)
    print("Hugging Face DataLoader - Shape of labels:", batch['labels'].shape)


Hugging Face DataLoader - Number of batches in train_loader: 3467
Hugging Face DataLoader - Keys in batch: dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
Hugging Face DataLoader - Shape of input_ids: torch.Size([16, 128])
Hugging Face DataLoader - Shape of labels: torch.Size([16])


## Model

In [56]:
# Size the classification head from the label mapping so that the number of
# output logits matches the number of classes the labels were encoded with.
# (It used to be hard-coded to 16, which is more than the mapping contains and
# leaves output units that have no class name.)
num_labels_hf = len(Index_to_class)

# Load the pretrained encoder with a freshly initialised classification head.
model_hf = AutoModelForSequenceClassification.from_pretrained(tokenizer_name, num_labels=num_labels_hf)
print("\nHugging Face Transformer Model:\n", model_hf)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Hugging Face Transformer Model:
 BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (La

### Training

In [57]:
# Pick the compute device: Apple MPS first, then CUDA, otherwise the CPU.
if torch.backends.mps.is_available():
    device_name = "mps"
elif torch.cuda.is_available():
    device_name = "cuda"
else:
    device_name = "cpu"
device = torch.device(device_name)

model_hf.to(device)

# Optimisation hyper-parameters.
learning_rate_hf = 5e-5
num_epochs_hf = 3

optimizer_hf = AdamW(model_hf.parameters(), lr=learning_rate_hf)

print("\nStarting training for Hugging Face Transformer Model...")
for epoch in range(num_epochs_hf):
    model_hf.train()
    total_loss = 0
    progress = tqdm(train_loader_hf, desc=f"Training Epoch {epoch+1}")
    for batch in progress:
        # Move exactly the fields the forward pass consumes onto the device.
        model_inputs = {
            field: batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer_hf.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model_hf(**model_inputs)
        loss = outputs.loss
        loss.backward()
        optimizer_hf.step()
        total_loss += loss.item()

    # Mean per-batch loss over the epoch.
    avg_train_loss = total_loss / len(train_loader_hf)
    print(f"Epoch {epoch+1}: Average Training Loss: {avg_train_loss:.4f}")

print("Hugging Face Transformer Training finished.")


Starting training for Hugging Face Transformer Model...


Training Epoch 1: 100%|██████████| 3467/3467 [3:18:08<00:00,  3.43s/it]    


Epoch 1: Average Training Loss: 0.1855


Training Epoch 2: 100%|██████████| 3467/3467 [5:29:33<00:00,  5.70s/it]     


Epoch 2: Average Training Loss: 0.0742


Training Epoch 3: 100%|██████████| 3467/3467 [35:24<00:00,  1.63it/s]

Epoch 3: Average Training Loss: 0.0598
Hugging Face Transformer Training finished.





### Evaluation

In [58]:
# Switch to evaluation mode and count correct predictions over the test loader.
model_hf.eval()
correct_predictions_hf = 0
total_predictions_hf = 0
with torch.no_grad():
    for batch in tqdm(test_loader_hf, desc="Evaluating"):
        labels = batch['labels'].to(device)
        outputs = model_hf(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        # The predicted class is the index of the largest logit in each row.
        predicted = outputs.logits.argmax(dim=1)
        correct_predictions_hf += torch.eq(predicted, labels).sum().item()
        total_predictions_hf += labels.size(0)

# Accuracy = fraction of test examples whose predicted class equals the label.
accuracy_hf = correct_predictions_hf / total_predictions_hf
print(f"Hugging Face Transformer Test Accuracy: {accuracy_hf:.4f}")

Evaluating: 100%|██████████| 867/867 [02:33<00:00,  5.63it/s]

Hugging Face Transformer Test Accuracy: 0.9830





## Manage the Model

In [4]:
# Directory the model and tokenizer are saved to and later reloaded from.
# NOTE(review): the path contains "t5", while the model trained above is created
# from 'bert-base-uncased' -- confirm this is the intended target before saving.
output_dir = "../models/t5_E5_24_06/hf_transformer_model"

### Saving the Model

In [60]:
# `os` is already imported in the notebook's first cell, so it is not
# re-imported here.
# Create the target directory (including parents) if it is missing;
# `exist_ok=True` replaces the check-then-create pattern and makes the cell
# safe to re-run.
os.makedirs(output_dir, exist_ok=True)

# Persist the model and the tokenizer side by side in the same directory.
print(f"Saving model to {output_dir}")
model_hf.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("Model and tokenizer saved successfully!")



Saving model to ./ag_news_model_saved
Model and tokenizer saved successfully!


### Loading the Model

In [5]:
# Reload the tokenizer and the classification model from `output_dir`.
print(f"\n--- Loading the model from {output_dir} ---")
loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)
loaded_model = AutoModelForSequenceClassification.from_pretrained(output_dir)

# Use the same device preference as the training cell: MPS, then CUDA, then
# CPU. (The CUDA branch was missing here, forcing CPU inference on CUDA hosts.)
if torch.backends.mps.is_available():
    load_device = torch.device("mps")
elif torch.cuda.is_available():
    load_device = torch.device("cuda")
else:
    load_device = torch.device("cpu")

loaded_model.to(load_device)

print("Model and tokenizer loaded successfully!")
print("Model architecture:", loaded_model)


--- Loading the model from ../models/t5_E5_24_06/hf_transformer_model ---
Model and tokenizer loaded successfully!
Model architecture: T5ForSequenceClassification(
  (transformer): T5Model(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDen

In [12]:
# Load the labelled events and keep only the rows whose class is 'unknown'.
dataset = pd.read_csv('../../data/labeled.csv')
is_unknown = dataset['class'].eq('unknown')
dataset = dataset.loc[is_unknown]

# Number of 'unknown' rows.
len(dataset)

  dataset = pd.read_csv('../../data/labeled.csv')


72984

In [14]:
# Class names in label order; ids are assigned 1-based from the position in
# this list.
_CLASS_NAMES = [
    'animal welfare',
    'blm',
    'climate',
    'culture',
    'discrimination',
    'education',
    'environment',
    'farmers',
    'health care',
    'housing',
    'immigration',
    'labor rights',
    'lgbtq',
    'palestine-israel conflict',
    'pandemic',
    'policies & politics',
    'public services',
    'ukraine-russia war',
    'unjust law enforcement',
    'women rights',
]

# id -> class name
Index_to_class = dict(enumerate(_CLASS_NAMES, start=1))

# class name -> id
class_to_index = {name: label_id for label_id, name in Index_to_class.items()}

In [15]:
import random

# Rows collected by the prediction loop: [raw notes, predicted class, original class].
unkown_data = []

# Draw a reproducible random sample of row positions from the dataset.
# A dedicated seeded generator makes the sample identical on every run, and
# the size is clamped so a dataset with fewer rows than requested no longer
# raises ValueError.
SAMPLE_SIZE = 700
SAMPLE_SEED = 42
random_indexes = random.Random(SAMPLE_SEED).sample(
    range(len(dataset)), min(SAMPLE_SIZE, len(dataset))
)

In [16]:
MAX_LEN_HF = 128

# Start from an empty list so that re-running this cell does not append the
# same predictions a second time.
unkown_data = []

# Look the columns up once instead of on every iteration.
clean_notes = dataset['clean_notes']
raw_notes = dataset['notes']
original_classes = dataset['class']

# Make sure the model is in evaluation mode before predicting.
loaded_model.eval()

for i in random_indexes:
    # `i` is a row position, hence `.iloc`.
    text = clean_notes.iloc[i]
    onfiltert_text = raw_notes.iloc[i]
    inputs = loaded_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LEN_HF)
    inputs = {key: val.to(load_device) for key, val in inputs.items()}

    with torch.no_grad():
        logits = loaded_model(**inputs).logits

    # The argmax index is 0-based, the keys of Index_to_class are 1-based.
    predicted_class = torch.argmax(logits, dim=1).item() + 1

    unkown_data.append([onfiltert_text, Index_to_class[predicted_class], original_classes.iloc[i]])

# Persist the raw notes with the predicted and the original class.
new_df = pd.DataFrame(unkown_data, columns=['notes', 'class', 'orginal_class'])
new_df.to_csv('../../data/unknows_labeled.csv', index=False)

In [14]:
# Reload the class-labelled events and append an empty `predicted_class`
# column for the predictions to be written into.
dataset = pd.read_csv('../../data/filtered_events_class.csv').assign(predicted_class="")

  dataset = pd.read_csv('../../data/filtered_events_class.csv')


In [15]:
# Preview the first rows of the reloaded dataset.
dataset.head()

Unnamed: 0,event_id_cnty,event_date,year,time_precision,disorder_type,event_type,sub_event_type,actor1,assoc_actor_1,inter1,...,source_scale,notes,fatalities,tags,timestamp,population_best,clean_notes,country_code,class,predicted_class
0,BEL4179,23 May 2025,2025,1,Demonstrations,Protests,Peaceful protest,Protesters (Belgium),Government of Belgium (2025-); Green,6,...,Subnational,"On 23 May 2025, in the afternoon, at the call ...",0,crowd size=several,1748377529,2753.0,afternoon call green west flanders groen west ...,BEL,environment,
1,BEL4183,23 May 2025,2025,1,Demonstrations,Protests,Peaceful protest,Protesters (Belgium),XR: Extinction Rebellion,6,...,Other-National,"On 23 May 2025, XR protested at the Farmers Un...",0,crowd size=no report,1748377529,10978.0,xr protested farmer union bb office leuven vla...,BEL,farmers,
2,BGR4378,23 May 2025,2025,1,Demonstrations,Protests,Peaceful protest,Protesters (Bulgaria),Labor Group (Bulgaria),6,...,National,"On 23 May 2025, librarians and representatives...",0,crowd size=no report,1748377530,194.0,librarian representative cultural institution ...,BGR,labor rights,
3,BGR4379,23 May 2025,2025,1,Demonstrations,Protests,Peaceful protest,Protesters (Bulgaria),Labor Group (Bulgaria),6,...,National,"On 23 May 2025, librarians and representatives...",0,crowd size=no report,1748377530,7324.0,librarian representative cultural institution ...,BGR,labor rights,
4,BGR4380,23 May 2025,2025,1,Demonstrations,Protests,Peaceful protest,Protesters (Bulgaria),Labor Group (Bulgaria),6,...,National,"On 23 May 2025, librarians and representatives...",0,crowd size=no report,1748377530,1502.0,librarian representative cultural institution ...,BGR,labor rights,


In [18]:
# Predict a class name for every row of the dataset.
# Predictions are collected in row order and assigned as a whole column
# afterwards. The earlier version read rows by position (`.iloc[i]`) but wrote
# them by index label (`.at[i, ...]`), which is only correct when the frame has
# a default 0..n-1 index.
predicted_labels = []

# Make sure the model is in evaluation mode before predicting.
loaded_model.eval()

for text in tqdm(dataset['clean_notes'], desc="Predicting"):
    inputs = loaded_tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=MAX_LEN_HF)
    inputs = {key: val.to(load_device) for key, val in inputs.items()}

    with torch.no_grad():
        logits = loaded_model(**inputs).logits

    # The argmax index is 0-based, the keys of Index_to_class are 1-based.
    predicted_class = torch.argmax(logits, dim=1).item() + 1
    predicted_labels.append(Index_to_class[predicted_class])

# One positional assignment for the whole column, then persist the result.
dataset['predicted_class'] = predicted_labels
dataset.to_csv('../../data/filtered_events_class_with_predicted.csv', index=False)
    