<a href="https://colab.research.google.com/github/Umesh94kr/Speech-Emotion-Recognition/blob/main/FineTuning_Bert_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers



In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

In [None]:
df = pd.read_csv('ner_dataset.csv', encoding='latin1')

In [None]:
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [None]:
## Checking the columns which consist any NA values
df.isna().sum()

Sentence #    69406
Word              0
POS               0
Tag               0
dtype: int64

In [None]:
df = df.fillna(method='ffill')
df.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
72696,Sentence: 3299,well,RB,O
72697,Sentence: 3299,as,IN,O
72698,Sentence: 3299,biometric,JJ,O
72699,Sentence: 3299,voting,NN,O
72700,Sentence: 3299,cards,NNS,O
72701,Sentence: 3299,to,TO,O
72702,Sentence: 3299,prevent,VB,O
72703,Sentence: 3299,fraud,NN,O
72704,Sentence: 3299,.,.,O
72705,Sentence: 3300,In,IN,O


In [None]:
df.isna().sum()

Sentence #    0
Word          0
POS           0
Tag           0
dtype: int64

In [None]:
class SentenceGetter(object):

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [None]:
sentence = SentenceGetter(df)

In [None]:
sentence.sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [None]:
sentences = [[word[0] for word in sentence] for sentence in sentence.sentences]
sentences[0]

['Thousands',
 'of',
 'demonstrators',
 'have',
 'marched',
 'through',
 'London',
 'to',
 'protest',
 'the',
 'war',
 'in',
 'Iraq',
 'and',
 'demand',
 'the',
 'withdrawal',
 'of',
 'British',
 'troops',
 'from',
 'that',
 'country',
 '.']

In [None]:
labels = [[s[2] for s in sentence] for sentence in sentence.sentences]
labels[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-geo',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-gpe',
 'O',
 'O',
 'O',
 'O',
 'O']

In [None]:
tag_values = list(set(df["Tag"].values))
tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}
print(tag2idx)

{'I-org': 0, 'B-tim': 1, 'B-geo': 2, 'I-geo': 3, 'B-gpe': 4, 'I-per': 5, 'I-eve': 6, 'B-eve': 7, 'I-art': 8, 'B-org': 9, 'I-tim': 10, 'B-nat': 11, 'O': 12, 'B-per': 13, 'I-gpe': 14, 'I-nat': 15, 'B-art': 16, 'PAD': 17}


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

torch.__version__

'2.1.0+cu121'

In [None]:
MAX_LEN = 75
batch_size = 32

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []
    for word, label in zip(sentence, text_labels):

        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)

        labels.extend([label] * n_subwords)
    return tokenized_sentence, labels

In [None]:
tokenized_text_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences, labels)
]

In [None]:
tokenized_text = [token[0] for token in tokenized_text_and_labels]
tokenized_labels = [token[1] for token in tokenized_text_and_labels]

In [None]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_text],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")


tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [None]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [None]:
input_ids

array([[26159,  1104,  8568, ...,     0,     0,     0],
       [ 7239,  3878,  1474, ...,     0,     0,     0],
       [ 1124,  8031,  4184, ...,     0,     0,     0],
       ...,
       [ 1109,  2095,  1108, ...,     0,     0,     0],
       [26159,  1104, 10271, ...,     0,     0,     0],
       [25793,  1104,  8943, ...,     0,     0,     0]])

In [None]:
tags

array([[12, 12, 12, ..., 17, 17, 17],
       [ 4, 12, 12, ..., 17, 17, 17],
       [12, 12,  1, ..., 17, 17, 17],
       ...,
       [12, 12, 12, ..., 17, 17, 17],
       [12, 12,  4, ..., 17, 17, 17],
       [12, 12, 12, ..., 17, 17, 17]])

In [None]:
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags,
                                                            random_state=2018, test_size=0.1)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [None]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [None]:
## DataLoader
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

## Loading BERT ##

In [None]:
import transformers
from transformers import BertForTokenClassification, AdamW
transformers.__version__

'4.35.2'

In [None]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
!pip install peft
from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA Config
lora_config = LoraConfig(
 r=16,
 lora_alpha=32,
 target_modules=["query", "value"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_CLS, # this is necessary
 inference_mode=True
)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters() # see % trainable parameters

Collecting peft
  Downloading peft-0.7.1-py3-none-any.whl (168 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/168.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m163.8/168.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate, peft
Successfully installed accelerate-0.25.0 peft-0.7.1
trainable params: 13,842 || all params: 108,337,188 || trainable%: 0.012776776151878706


In [None]:
import torch.nn as nn

In [None]:
def calculate_rouge_1(pred_tokens, ref_tokens):
    # Flatten the lists of lists to lists
    pred_flat = [item for sublist in pred_tokens for item in sublist]
    ref_flat = [item for sublist in ref_tokens for item in sublist]

    # Convert the flattened lists to sets
    pred_set = set(pred_flat)
    ref_set = set(ref_flat)

    # Calculate ROUGE-1 precision
    if len(pred_set) > 0:
        precision = len(pred_set.intersection(ref_set)) / len(pred_set)
    else:
        precision = 0.0

    # Calculate ROUGE-1 recall
    if len(ref_set) > 0:
        recall = len(pred_set.intersection(ref_set)) / len(ref_set)
    else:
        recall = 0.0

    # Calculate ROUGE-1 F1-score
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return precision, recall, f1_score


In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertForTokenClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

# Define your BERT model for token classification
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(tag2idx))

# Define optimizer and learning rate scheduler
num_epochs=3
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs)

# Specify your device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import torch.nn.functional as F

def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, num_epochs, device):
    model.to(device)
    best_f1_score = 0.0

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} - Training"):
            input_ids, attention_masks, labels = batch
            input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_masks)

            loss = F.cross_entropy(outputs.logits.view(-1, len(tag2idx)), labels.view(-1))
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
            optimizer.step()
            scheduler.step()

        average_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs} - Average Training Loss: {average_loss}")

        # Validation
        model.eval()
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} - Validation"):
                input_ids, attention_masks, labels = batch
                input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

                outputs = model(input_ids, attention_mask=attention_masks)
                logits = outputs.logits
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy().tolist())
                all_labels.extend(labels.cpu().numpy().tolist())

        # Calculate precision, recall, and F1-score
        precision, recall, f1 = calculate_rouge_1(all_labels, all_preds)

        print(f"Epoch {epoch + 1}/{num_epochs} - Micro F1 Score: {f1}")

        # Save the model if F1 score improves
        if f1 > best_f1_score:
            best_f1_score = f1
            torch.save(model.state_dict(), "best_model.pth")

# Train the model
train_model(model, train_dataloader, valid_dataloader, optimizer, scheduler, num_epochs, device)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3 - Training: 100%|██████████| 93/93 [00:35<00:00,  2.65it/s]


Epoch 1/3 - Average Training Loss: 0.5393580449845201


Epoch 1/3 - Validation: 100%|██████████| 11/11 [00:01<00:00,  8.78it/s]


Epoch 1/3 - Micro F1 Score: 0.3870967741935484


Epoch 2/3 - Training: 100%|██████████| 93/93 [00:34<00:00,  2.68it/s]


Epoch 2/3 - Average Training Loss: 0.2200297296688121


Epoch 2/3 - Validation: 100%|██████████| 11/11 [00:01<00:00,  8.35it/s]


Epoch 2/3 - Micro F1 Score: 0.3870967741935484


Epoch 3/3 - Training: 100%|██████████| 93/93 [00:35<00:00,  2.60it/s]


Epoch 3/3 - Average Training Loss: 0.18417506512775217


Epoch 3/3 - Validation: 100%|██████████| 11/11 [00:01<00:00,  8.09it/s]

Epoch 3/3 - Micro F1 Score: 0.3870967741935484





## Wihtout Fine-tuning ##

In [None]:
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertForTokenClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

# Define your BERT model for token classification
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(tag2idx))

# Define optimizer and learning rate scheduler
num_epochs=3
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * num_epochs)

# Specify your device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import torch.nn.functional as F

def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, num_epochs, device):
    model.to(device)
    best_f1_score = 0.0

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} - Training"):
            input_ids, attention_masks, labels = batch
            input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_masks)

            loss = F.cross_entropy(outputs.logits.view(-1, len(tag2idx)), labels.view(-1))
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
            optimizer.step()
            scheduler.step()

        average_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs} - Average Training Loss: {average_loss}")

        # Validation
        model.eval()
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs} - Validation"):
                input_ids, attention_masks, labels = batch
                input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

                outputs = model(input_ids, attention_mask=attention_masks)
                logits = outputs.logits
                preds = torch.argmax(logits, dim=1)
                all_preds.extend(preds.cpu().numpy().tolist())
                all_labels.extend(labels.cpu().numpy().tolist())

        # Calculate precision, recall, and F1-score
        precision, recall, f1 = calculate_rouge_1(all_labels, all_preds)

        print(f"Epoch {epoch + 1}/{num_epochs} - Micro F1 Score: {f1}")

        # Save the model if F1 score improves
        if f1 > best_f1_score:
            best_f1_score = f1
            torch.save(model.state_dict(), "best_model.pth")

# Train the model
train_model(model, train_dataloader, valid_dataloader, optimizer, scheduler, num_epochs, device)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3 - Training: 100%|██████████| 93/93 [00:37<00:00,  2.51it/s]


Epoch 1/3 - Average Training Loss: 0.4878773668440439


Epoch 1/3 - Validation: 100%|██████████| 11/11 [00:01<00:00,  7.76it/s]


Epoch 1/3 - Micro F1 Score: 0.3870967741935484


Epoch 2/3 - Training: 100%|██████████| 93/93 [00:37<00:00,  2.48it/s]


Epoch 2/3 - Average Training Loss: 0.21360964191857204


Epoch 2/3 - Validation: 100%|██████████| 11/11 [00:01<00:00,  7.68it/s]


Epoch 2/3 - Micro F1 Score: 0.3870967741935484


Epoch 3/3 - Training: 100%|██████████| 93/93 [00:38<00:00,  2.43it/s]


Epoch 3/3 - Average Training Loss: 0.1828173573619576


Epoch 3/3 - Validation: 100%|██████████| 11/11 [00:01<00:00,  7.57it/s]

Epoch 3/3 - Micro F1 Score: 0.3870967741935484





In [None]:
import torch.nn as nn

In [None]:
test_sentence = """
Mr. Trump’s tweets began just moments after a Fox News report by Mike Tobin, a
reporter for the network, about protests in Minnesota and elsewhere.
"""

In [None]:
tokenized_sentence = tokenizer.encode(test_sentence)
input_ids = torch.tensor([tokenized_sentence])

In [None]:
input_ids = input_ids.to('cuda')
with torch.no_grad():
    output = model(input_ids)
label_indices = np.argmax(output[0].to('cpu').numpy(), axis=2)

In [None]:
# join bpe split tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    if token.startswith("##"):
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        new_labels.append(tag_values[label_idx])
        new_tokens.append(token)

In [None]:
for token, label in zip(new_tokens, new_labels):
    print("{}\t{}".format(label, token))


O	[CLS]
I-per	Mr
O	.
O	Trump
O	’
O	s
O	tweets
O	began
O	just
O	moments
O	after
O	a
O	Fox
O	News
O	report
O	by
O	Mike
O	Tobin
O	,
O	a
O	reporter
O	for
O	the
O	network
O	,
O	about
O	protests
O	in
O	Minnesota
O	and
O	elsewhere
PAD	.
PAD	[SEP]
