In [2]:
import pandas as pd
import torch

In [3]:
def keep_first_row(group):
    return group.iloc[0]

In [4]:
df = pd.read_csv("dataset/ner_dataset.csv",encoding="unicode_escape")

df['Sentence #'].ffill(inplace=True)

class_labels = {value:key for key,value in enumerate(list(df['Tag'].unique()))}

In [5]:
class_labels

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'B-art': 8,
 'I-art': 9,
 'I-per': 10,
 'I-gpe': 11,
 'I-tim': 12,
 'B-nat': 13,
 'B-eve': 14,
 'I-eve': 15,
 'I-nat': 16}

In [6]:
df['Tag'] = df.Tag.map(class_labels)

In [7]:
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,0
1,Sentence: 1,of,IN,0
2,Sentence: 1,demonstrators,NNS,0
3,Sentence: 1,have,VBP,0
4,Sentence: 1,marched,VBN,0
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,0
1048571,Sentence: 47959,responded,VBD,0
1048572,Sentence: 47959,to,TO,0
1048573,Sentence: 47959,the,DT,0


In [8]:
# df['sentence'] = df.groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
df['sentence'] = df.groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(str(word) for word in x))
df['tokens'] = df['sentence'].apply(lambda x: x.split())
df.drop("sentence",axis=1,inplace=True)

df['tags'] = df.groupby(['Sentence #'])['Tag'].transform(lambda x: ' '.join(str(tag) for tag in x))
df['ner_tags'] = df['tags'].apply(lambda x: x.split())
df.drop("tags",axis=1,inplace=True)

df.drop(["Word","POS","Tag"],axis=1,inplace=True)
df['Sentence #'] = df['Sentence #'].apply(lambda x: x.split(" ")[-1])
df.rename(columns={"Sentence #":"id"},inplace=True)

df = df.groupby('id').apply(keep_first_row)

In [9]:
df.drop('id',axis=1,inplace=True)
df.reset_index(inplace=True)

In [10]:
from datasets import Dataset,DatasetDict,ClassLabel
from sklearn.model_selection import train_test_split

In [11]:
train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)

# Split the train set into train and validation sets
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)

train_dataset = Dataset.from_pandas(train_df,preserve_index=False)
val_dataset = Dataset.from_pandas(val_df,preserve_index=False)
test_dataset = Dataset.from_pandas(test_df,preserve_index=False)

dataset = DatasetDict()
dataset['train'] = train_dataset
dataset['validation'] = val_dataset
dataset['test'] = test_dataset

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 26976
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 8993
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 11990
    })
})

In [13]:
dataset['train'][0]

{'id': '41557',
 'tokens': ['The',
  'researchers',
  'say',
  'western',
  'antarctica',
  'lost',
  '132',
  'billion',
  'tons',
  'of',
  'ice',
  'in',
  '2006',
  ',',
  'enough',
  'to',
  'raise',
  'worldwide',
  'sea',
  'levels',
  'by',
  '0.5',
  'millimeter',
  '.'],
 'ner_tags': ['0',
  '0',
  '0',
  '0',
  '1',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '7',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0',
  '0']}

In [14]:
ner_labels_map = {class_labels[key]:key for key in class_labels}
ner_labels_map

{0: 'O',
 1: 'B-geo',
 2: 'B-gpe',
 3: 'B-per',
 4: 'I-geo',
 5: 'B-org',
 6: 'I-org',
 7: 'B-tim',
 8: 'B-art',
 9: 'I-art',
 10: 'I-per',
 11: 'I-gpe',
 12: 'I-tim',
 13: 'B-nat',
 14: 'B-eve',
 15: 'I-eve',
 16: 'I-nat'}

In [15]:
def get_labels(indexes,ner_labels_map=None):
    return [ner_labels_map[int(i)] for i in indexes]

In [16]:
get_labels(indexes=dataset['train']['ner_tags'][3],ner_labels_map=ner_labels_map)

['B-per',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-gpe',
 'O',
 'O',
 'O',
 'B-org',
 'I-org',
 'O',
 'O',
 'B-org',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-per',
 'I-per',
 'O',
 'B-geo',
 'O']

# Tokenizer

In [17]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [18]:
tokenizer.is_fast

True

In [19]:
dataset["train"][0]["tokens"]

['The',
 'researchers',
 'say',
 'western',
 'antarctica',
 'lost',
 '132',
 'billion',
 'tons',
 'of',
 'ice',
 'in',
 '2006',
 ',',
 'enough',
 'to',
 'raise',
 'worldwide',
 'sea',
 'levels',
 'by',
 '0.5',
 'millimeter',
 '.']

In [20]:
inputs = tokenizer(dataset["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'the',
 'researchers',
 'say',
 'western',
 'antarctica',
 'lost',
 '132',
 'billion',
 'tons',
 'of',
 'ice',
 'in',
 '2006',
 ',',
 'enough',
 'to',
 'raise',
 'worldwide',
 'sea',
 'levels',
 'by',
 '0',
 '.',
 '5',
 'mill',
 '##imeter',
 '.',
 '[SEP]']

In [21]:
inputs.word_ids()

[None,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 21,
 21,
 22,
 22,
 23,
 None]

In [22]:
def align_labels_with_tokens(labels, word_ids):
    # word_ids = [int(x) for x in word_ids]
    new_labels = []
    current_word = None
    # [None, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 17, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, None]
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            try:
                label = -100 if word_id is None else labels[word_id]
            except:
                print(f"Value of labels: {labels} and value of word_id: {word_id}, word_ids:{word_ids}")
                break
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = int(labels[word_id])
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [23]:
labels = dataset["train"][1]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(word_ids)
print(align_labels_with_tokens(labels, word_ids))

['0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '2', '0', '0', '0', '0', '0', '0']
[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 21, 21, 22, 22, 23, None]
[-100, '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '2', '0', 0, 0, '0', 0, '0', -100]


In [24]:
len(labels),len(word_ids),len(align_labels_with_tokens(labels, word_ids))

(27, 29, 29)

In [25]:
def tokenize_and_align_lables(examples):
    tokenized_inputs = tokenizer(examples['tokens'],truncation=True,is_split_into_words=True)
    all_labels = examples['ner_tags']
    new_labels = []
    for i,labels in enumerate(all_labels):
        labels = [int(x) for x in labels]
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(word_ids=word_ids,labels=labels))

    tokenized_inputs['labels'] = new_labels
    return tokenized_inputs
        

In [26]:
tokenized_datasets = dataset.map(tokenize_and_align_lables,batched=True,remove_columns=dataset['train'].column_names)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/26976 [00:00<?, ? examples/s]

Value of labels: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] and value of word_id: 29, word_ids:[None, 0, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 17, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, None]


Map:   0%|          | 0/8993 [00:00<?, ? examples/s]

Map:   0%|          | 0/11990 [00:00<?, ? examples/s]

In [27]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 26976
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8993
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11990
    })
})

In [28]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [29]:
data_collator([tokenized_datasets['train'][i] for i in range(2)])

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  101,  1996,  6950,  2360,  2530, 12615,  2439, 14078,  4551,  6197,
          1997,  3256,  1999,  2294,  1010,  2438,  2000,  5333,  4969,  2712,
          3798,  2011,  1014,  1012,  1019,  4971, 19198,  1012,   102,     0,
             0,     0,     0],
        [  101,  4584,  1999,  6520, 12322,  5833,  2072,  2360,  2048,  1997,
          2037,  3548,  2020,  2730,  1998,  2012,  2560,  2538,  2500,  5303,
          1999,  1037,  2645,  2007, 26040,  2078,  2749,  2006,  2037, 11621,
          3675,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [30]:
labels

['0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '2',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0']

In [31]:
label_names = list(class_labels.keys())
label_names

['O',
 'B-geo',
 'B-gpe',
 'B-per',
 'I-geo',
 'B-org',
 'I-org',
 'B-tim',
 'B-art',
 'I-art',
 'I-per',
 'I-gpe',
 'I-tim',
 'B-nat',
 'B-eve',
 'I-eve',
 'I-nat']

In [32]:
import evaluate
import numpy as np

metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[class_labels[int(l)] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [33]:
ner_labels_map

{0: 'O',
 1: 'B-geo',
 2: 'B-gpe',
 3: 'B-per',
 4: 'I-geo',
 5: 'B-org',
 6: 'I-org',
 7: 'B-tim',
 8: 'B-art',
 9: 'I-art',
 10: 'I-per',
 11: 'I-gpe',
 12: 'I-tim',
 13: 'B-nat',
 14: 'B-eve',
 15: 'I-eve',
 16: 'I-nat'}

In [34]:
class_labels

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'B-art': 8,
 'I-art': 9,
 'I-per': 10,
 'I-gpe': 11,
 'I-tim': 12,
 'B-nat': 13,
 'B-eve': 14,
 'I-eve': 15,
 'I-nat': 16}

In [49]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=ner_labels_map,
    label2id=class_labels,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [51]:
model = model.to(device)

In [52]:
model.config.num_labels

17

In [53]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-finetuned",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

In [47]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"][:100],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [58]:
tokenized_datasets["train"][0]

{'input_ids': [101,
  1996,
  6950,
  2360,
  2530,
  12615,
  2439,
  14078,
  4551,
  6197,
  1997,
  3256,
  1999,
  2294,
  1010,
  2438,
  2000,
  5333,
  4969,
  2712,
  3798,
  2011,
  1014,
  1012,
  1019,
  4971,
  19198,
  1012,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'labels': [-100,
  0,
  0,
  0,
  0,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  7,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  -100]}

In [48]:
trainer.train()

  0%|          | 0/3 [00:00<?, ?it/s]

KeyError: 0