In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import numpy as np
import re
from bs4 import BeautifulSoup
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
from transformers import DataCollatorForTokenClassification
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report




In [3]:
# !pip install seqeval
# !pip install datasets
# !pip install transformers[torch]
# !pip install accelerate -U

In [4]:

# Mount Google Drive inside the Colab runtime; the dataset is read from it in the next cell.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Load the token-level NER dataset (one row per token) from the mounted Drive.
DATA_PATH = 'drive/My Drive/ner_dataset.csv'
df = pd.read_csv(DATA_PATH, encoding='latin1')

# Forward-fill missing cells with the value from the closest populated row above.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
# Presumably 'Sentence #' is only populated on the first token of each sentence and
# this is what propagates it to the remaining tokens -- TODO confirm.
# NOTE(review): the fill applies to every column, so a missing Word/Tag silently
# inherits the previous row's value -- confirm that this is intended.
df = df.ffill()

# Collect the words and the tags of each sentence into parallel lists.
# Both series come from the same groupby, so their sentence order is identical.
sentence_groups = df.groupby('Sentence #')
words = sentence_groups['Word'].apply(list).values
ner_tags = sentence_groups['Tag'].apply(list).values

# Make every token a string and trim leading/trailing whitespace
# (str() is a no-op for values that are already strings).
words = [[str(word).strip() for word in sentence] for sentence in words]




In [6]:
# Whitespace clean-up applied to every token of every sentence, in this order:
#   1. drop whitespace that sits directly before one of ? . ! "
#   2. collapse every remaining run of whitespace (spaces, tabs, ...) into a single space
def _tidy_token(token):
    """Return `token` after the two whitespace substitutions described above."""
    without_gap = re.sub(r'\s+([?.!"])', r'\1', token)
    return re.sub(r'\s+', ' ', without_gap)


words = [
    [_tidy_token(token) for token in sentence_tokens]
    for sentence_tokens in words
]

In [7]:
# One row per sentence: its token list next to the matching tag list.
sentence_columns = {'sentence': words, 'ner_tags': ner_tags}
train_df = pd.DataFrame(sentence_columns)
train_df

Unnamed: 0,sentence,ner_tags
0,"[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
1,"[Iranian, officials, say, they, expect, to, ge...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
2,"[Helicopter, gunships, Saturday, pounded, mili...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O..."
3,"[They, left, after, a, tense, hour-long, stand...","[O, O, O, O, O, O, O, O, O, O, O]"
4,"[U.N., relief, coordinator, Jan, Egeland, said...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo..."
...,...,...
47954,"[Opposition, leader, Mir, Hossein, Mousavi, ha...","[O, O, O, B-per, I-per, O, O, O, O, O, O, O, O..."
47955,"[On, Thursday, ,, Iranian, state, media, publi...","[O, B-tim, O, B-gpe, O, O, O, O, O, O, O, O, B..."
47956,"[Following, Iran, 's, disputed, June, 12, elec...","[O, B-geo, O, O, B-tim, I-tim, O, O, O, O, O, ..."
47957,"[Since, then, ,, authorities, have, held, publ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [8]:



# Build the label mapping, encode the tags, and split into train / validation / test.
#
# The cell starts again from `words` / `ner_tags` instead of consuming `train_df`,
# so it can be re-run safely: previously a second run re-split the already-split
# training frame and then failed with a KeyError on the already-encoded tags.

# Fixed seed so the 80 / 10 / 10 split is reproducible across kernel restarts.
SPLIT_SEED = 42

# Define label mapping (ids follow the order in which tags first appear in `df`).
unique_tags = df['Tag'].unique()
label2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2label = {idx: tag for tag, idx in label2id.items()}

# One row per sentence, with tags already converted to integer ids.
encoded_df = pd.DataFrame({
    'sentence': words,
    'ner_tags': [[label2id[tag] for tag in sentence_tags] for sentence_tags in ner_tags],
})

# 80 % train, then the remaining 20 % is halved into validation and test.
train_df, temp_df = train_test_split(encoded_df, test_size=0.2, random_state=SPLIT_SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SPLIT_SEED)

# Create the Hugging Face Dataset objects once, after encoding.
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra column in the datasets.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)


In [9]:
# Tokenizer that belongs to the checkpoint being fine-tuned; the fast implementation
# is requested explicitly.
TOKENIZER_CHECKPOINT = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT, use_fast=True)

def tokenize_and_align_labels(examples, tokenizer_override=None):
    """Tokenize pre-split sentences and align word-level label ids to sub-word tokens.

    Args:
        examples: Batch mapping with "sentence" (a list of word lists) and
            "ner_tags" (a list of per-word label-id lists), in the form passed by
            `Dataset.map(..., batched=True)`.
        tokenizer_override: Optional tokenizer to use instead of the notebook-level
            `tokenizer` (can be supplied through `Dataset.map(fn_kwargs=...)`).
            Its output must provide `word_ids(batch_index=...)`.

    Returns:
        The tokenizer output with an added "labels" entry holding one label id per
        produced token. Special tokens and every sub-token after the first one of a
        word are set to -100, the value `compute_metrics` filters out.
    """
    active_tokenizer = tokenizer if tokenizer_override is None else tokenizer_override

    # No padding here: sequences stay at their natural length and are padded per
    # batch by the data collator, instead of all being padded to the model maximum.
    tokenized_inputs = active_tokenizer(
        examples["sentence"],
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        label_ids = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_idx):
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation piece of the current word.
                # Repeating a B-/I- tag on continuation pieces would turn one
                # entity into several when scored, so only the first piece is labelled.
                label_ids.append(-100)
            else:
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize each split in batches and attach the aligned "labels" column.
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Sanity check for tokenized examples
def validate_data(dataset):
    """Assert that every example has labels and as many labels as input ids.

    Iterates over `dataset`, raising AssertionError on the first example that lacks
    a 'labels' entry or whose 'input_ids' and 'labels' differ in length. A progress
    line is printed for example 0 and for every 1000th example after it.
    """
    for position, record in enumerate(dataset):
        assert 'labels' in record, f"Labels missing in example {position}"
        n_inputs = len(record['input_ids'])
        n_labels = len(record['labels'])
        assert n_inputs == n_labels, f"Input and label length mismatch in example {position}"
        if position % 1000:
            continue
        print(f"Validated {position} examples")

# Run the sanity check on all three tokenized splits (train, validation, test).
for tokenized_split in (train_dataset, val_dataset, test_dataset):
    validate_data(tokenized_split)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/38367 [00:00<?, ? examples/s]

Map:   0%|          | 0/4796 [00:00<?, ? examples/s]

Map:   0%|          | 0/4796 [00:00<?, ? examples/s]

Validated 0 examples
Validated 1000 examples
Validated 2000 examples
Validated 3000 examples
Validated 4000 examples
Validated 5000 examples
Validated 6000 examples
Validated 7000 examples
Validated 8000 examples
Validated 9000 examples
Validated 10000 examples
Validated 11000 examples
Validated 12000 examples
Validated 13000 examples
Validated 14000 examples
Validated 15000 examples
Validated 16000 examples
Validated 17000 examples
Validated 18000 examples
Validated 19000 examples
Validated 20000 examples
Validated 21000 examples
Validated 22000 examples
Validated 23000 examples
Validated 24000 examples
Validated 25000 examples
Validated 26000 examples
Validated 27000 examples
Validated 28000 examples
Validated 29000 examples
Validated 30000 examples
Validated 31000 examples
Validated 32000 examples
Validated 33000 examples
Validated 34000 examples
Validated 35000 examples
Validated 36000 examples
Validated 37000 examples
Validated 38000 examples
Validated 0 examples
Validated 1000 ex

In [10]:
# Start from the CoNLL-03 fine-tuned checkpoint, but size the classification head
# for this dataset's tag set. ignore_mismatched_sizes=True lets the checkpoint load
# even though its classifier weights have a different shape; those weights are then
# newly initialised (see the warning printed below this cell).
model_options = dict(
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)
model = AutoModelForTokenClassification.from_pretrained(
    "dbmdz/bert-large-cased-finetuned-conll03-english",
    **model_options,
)



# Hyper-parameters for the fine-tuning run, collected in one place.
training_config = {
    "output_dir": "./results",          # checkpoints are written here
    "evaluation_strategy": "epoch",     # evaluate at the end of every epoch
    # NOTE(review): newer transformers releases spell this `eval_strategy` --
    # confirm against the installed version.
    "learning_rate": 3e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 2,
    "weight_decay": 0.01,
    "logging_dir": './logs',
    "logging_steps": 20,                # log the training loss every 20 steps
    "save_total_limit": 1,              # cap on the number of checkpoints kept on disk
    "save_steps": 1000,                 # write a checkpoint every 1000 steps
    # NOTE(review): eval_steps is documented for step-based evaluation; with the
    # "epoch" strategy above it presumably has no effect -- confirm.
    "eval_steps": 500,
}
training_args = TrainingArguments(**training_config)

# Collator that batches the tokenized examples (and their labels) for the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer)


# Metric callback for the Trainer: entity-level precision / recall / F1 via seqeval.
def compute_metrics(pred):
    """Compute seqeval precision, recall and F1 for one evaluation pass.

    Args:
        pred: Evaluation output exposing `predictions` (per-token logits) and
            `label_ids` (gold label ids, with -100 on positions to ignore).

    Returns:
        Dict with the numeric entries 'precision', 'recall' and 'f1'. The per-class
        classification report is printed rather than returned, because the Trainer
        logs every returned value as a metric and the report is a multi-line string.
    """
    gold_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)

    # Drop ignored positions (-100) and map the remaining ids back to tag strings.
    true_labels = []
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [(int(p), int(g)) for p, g in zip(predicted_row, gold_row) if g != -100]
        true_labels.append([id2label[g] for _, g in kept])
        true_predictions.append([id2label[p] for p, _ in kept])

    print(classification_report(true_labels, true_predictions))

    return {
        'precision': precision_score(true_labels, true_predictions),
        'recall': recall_score(true_labels, true_predictions),
        'f1': f1_score(true_labels, true_predictions),
    }

# Wire the model, arguments, data splits, collator and metric callback together.
# The validation split is the one evaluated during training.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_options)

# Run the fine-tuning loop
trainer.train()


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([17]) in the model instanti

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate the model on the validation split (the Trainer's default eval_dataset).
results = trainer.evaluate()
print(results)

# Also score the held-out test split, which is otherwise prepared but never used.
# metric_key_prefix keeps these metrics distinguishable from the validation ones.
test_results = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
print(test_results)


In [None]:
# Function to predict NER tags using the BERT model
def predict_ner_bert(sentence):
    """Predict one NER tag per tokenizer token for a single sentence.

    Args:
        sentence: Either a list of words or a plain string; a string is split on
            whitespace first, because the tokenizer is called with
            is_split_into_words=True.

    Returns:
        List of (token, label) pairs, one per token produced by the tokenizer
        (special tokens and sub-word pieces included).
    """
    word_list = sentence.split() if isinstance(sentence, str) else list(sentence)

    inputs = tokenizer(word_list, return_tensors="pt", truncation=True, is_split_into_words=True)
    # The Trainer may have moved the model to an accelerator; the inputs must live
    # on the same device as the model.
    inputs = inputs.to(model.device)

    # Inference only: switch off dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = outputs.logits.argmax(dim=-1).tolist()[0]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].tolist()[0])
    predicted_labels = [id2label[pred] for pred in predictions]
    return list(zip(tokens, predicted_labels))

# Predict NER tags using BERT
# The example sentence is defined here (already split into words) so the cell does
# not depend on a `sentence` variable left over from some earlier kernel state.
sentence = ["Barack", "Obama", "visited", "Paris", "on", "Monday", "."]
bert_predictions = predict_ner_bert(sentence)
print("BERT Predictions:")
for token, label in bert_predictions:
    print(f"Token: {token}, Label: {label}")