In [1]:
import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import random
import json


### Read the data

In [2]:
# Load the raw NER table (cp1252-encoded); on_bad_lines='warn' reports malformed rows
# as warnings and skips them instead of aborting the read.
data = pd.read_csv("../data/ner.csv", encoding='cp1252', on_bad_lines='warn')
data

Skipping line 281837: expected 25 fields, saw 34



Unnamed: 0.1,Unnamed: 0,lemma,next-lemma,next-next-lemma,next-next-pos,next-next-shape,next-next-word,next-pos,next-shape,next-word,...,prev-prev-lemma,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
0,0,thousand,of,demonstr,NNS,lowercase,demonstrators,IN,lowercase,of,...,__start2__,__START2__,wildcard,__START2__,wildcard,__START1__,1.0,capitalized,Thousands,O
1,1,of,demonstr,have,VBP,lowercase,have,NNS,lowercase,demonstrators,...,__start1__,__START1__,wildcard,__START1__,capitalized,Thousands,1.0,lowercase,of,O
2,2,demonstr,have,march,VBN,lowercase,marched,VBP,lowercase,have,...,thousand,NNS,capitalized,Thousands,lowercase,of,1.0,lowercase,demonstrators,O
3,3,have,march,through,IN,lowercase,through,VBN,lowercase,marched,...,of,IN,lowercase,of,lowercase,demonstrators,1.0,lowercase,have,O
4,4,march,through,london,NNP,capitalized,London,IN,lowercase,through,...,demonstr,NNS,lowercase,demonstrators,lowercase,have,1.0,lowercase,marched,O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050790,1048570,they,respond,to,TO,lowercase,to,VBD,lowercase,responded,...,forc,NNS,lowercase,forces,lowercase,said,47959.0,lowercase,they,O
1050791,1048571,respond,to,the,DT,lowercase,the,TO,lowercase,to,...,said,VBD,lowercase,said,lowercase,they,47959.0,lowercase,responded,O
1050792,1048572,to,the,attack,NN,lowercase,attack,DT,lowercase,the,...,they,PRP,lowercase,they,lowercase,responded,47959.0,lowercase,to,O
1050793,1048573,the,attack,with,IN,lowercase,with,NN,lowercase,attack,...,respond,VBD,lowercase,responded,lowercase,to,47959.0,lowercase,the,O


In [3]:
# Number of rows (tokens) recorded for each sentence id, largest first.
data['sentence_idx'].value_counts()

2549.0     140
11994.0    132
608.0      124
5805.0     122
6344.0     120
          ... 
37093.0      2
8412.0       2
39874.0      2
40249.0      2
38917.0      1
Name: sentence_idx, Length: 35177, dtype: int64

In [4]:
# value_counts() orders ids by descending row count, so this keeps the
# 20000 sentence ids that have the most rows.
rows_per_sentence = data['sentence_idx'].value_counts()
sentences = rows_per_sentence.index[:20000]
len(sentences)

20000

In [5]:
# Keep only the rows that belong to one of the selected sentence ids.
in_selected_sentence = data['sentence_idx'].isin(sentences)
new_data = data[in_selected_sentence]
new_data.head()

Unnamed: 0.1,Unnamed: 0,lemma,next-lemma,next-next-lemma,next-next-pos,next-next-shape,next-next-word,next-pos,next-shape,next-word,...,prev-prev-lemma,prev-prev-pos,prev-prev-shape,prev-prev-word,prev-shape,prev-word,sentence_idx,shape,word,tag
0,0,thousand,of,demonstr,NNS,lowercase,demonstrators,IN,lowercase,of,...,__start2__,__START2__,wildcard,__START2__,wildcard,__START1__,1.0,capitalized,Thousands,O
1,1,of,demonstr,have,VBP,lowercase,have,NNS,lowercase,demonstrators,...,__start1__,__START1__,wildcard,__START1__,capitalized,Thousands,1.0,lowercase,of,O
2,2,demonstr,have,march,VBN,lowercase,marched,VBP,lowercase,have,...,thousand,NNS,capitalized,Thousands,lowercase,of,1.0,lowercase,demonstrators,O
3,3,have,march,through,IN,lowercase,through,VBN,lowercase,marched,...,of,IN,lowercase,of,lowercase,demonstrators,1.0,lowercase,have,O
4,4,march,through,london,NNP,capitalized,London,IN,lowercase,through,...,demonstr,NNS,lowercase,demonstrators,lowercase,have,1.0,lowercase,marched,O


In [6]:
# Frequency of every NER tag in the selected subset.
new_data['tag'].value_counts()

O        668694
B-geo     28499
B-org     15345
B-tim     14932
I-per     13417
I-org     13218
B-per     12708
B-gpe     12464
I-geo      5904
I-tim      4765
B-art       348
B-eve       280
I-eve       234
I-art       226
I-gpe       188
B-nat       188
I-nat        65
Name: tag, dtype: int64

In [7]:
# Distinct tags, ordered from most to least frequent.
tag_frequencies = new_data['tag'].value_counts()
labels = tag_frequencies.index
labels

Index(['O', 'B-geo', 'B-org', 'B-tim', 'I-per', 'I-org', 'B-per', 'B-gpe',
       'I-geo', 'I-tim', 'B-art', 'B-eve', 'I-eve', 'I-art', 'I-gpe', 'B-nat',
       'I-nat'],
      dtype='object')

In [10]:
# Give every tag an integer id that follows its position in `labels`.
label_maps = {tag: position for position, tag in enumerate(labels)}

print(label_maps)

{'O': 0, 'B-geo': 1, 'B-org': 2, 'B-tim': 3, 'I-per': 4, 'I-org': 5, 'B-per': 6, 'B-gpe': 7, 'I-geo': 8, 'I-tim': 9, 'B-art': 10, 'B-eve': 11, 'I-eve': 12, 'I-art': 13, 'I-gpe': 14, 'B-nat': 15, 'I-nat': 16}


In [9]:
# save label_map
# Serialise straight into the file so `label_maps` stays a dict; later cells
# index it by tag, which fails if the name is rebound to the JSON string.
with open("label_maps.json", 'w') as f:
    json.dump(label_maps, f)

In [13]:
# read back label_maps

In [11]:
# parsing data
# Build one record per selected sentence: {'id', 'ner_tags', 'tokens'}.
# The frame is grouped once rather than re-filtered for every sentence, and the
# per-sentence dict gets its own name so the `data` DataFrame is not overwritten.
rows_by_sentence = new_data.groupby('sentence_idx', sort=False)
data_json = []
for ind, sentence_id in enumerate(sentences):
    sub_data = rows_by_sentence.get_group(sentence_id)
    record = {
        'id': ind,
        # integer id of each token's tag, in row order
        'ner_tags': [label_maps[tag] for tag in sub_data['tag']],
        'tokens': sub_data['word'].tolist(),
    }
    data_json.append(record)

In [12]:
# Spot-check one parsed record (id, integer tags, tokens).
print(data_json[5])

{'id': 5, 'ner_tags': [0, 0, 0, 0, 11, 12, 0, 3, 9, 9, 0, 0, 0, 0, 1, 8, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 8, 0, 1, 0, 0, 0, 0, 0, 6, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 12, 0, 3, 9, 9, 0, 0, 0, 0, 1, 8, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 8, 0, 1, 0, 0, 0, 0, 0, 6, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'tokens': ['After', 'failing', 'in', 'the', 'Korean', 'War', '(', '1950', '-', '53', ')', 'to', 'conquer', 'the', 'US-backed', 'Republic', 'of', 'Korea', '(', 'ROK', ')', 'in', 'the', 'southern', 'portion', 'by', 'force', ',', 'North', 'Korea', '(', 'DPRK', ')', ',', 'under', 'its', 'founder', 'President', 'KIM', 'Il', 'Sung', ',', 'adopted', 'a', 'policy', 'of', 'ostensible', 'diplomatic', 'and', 'economic', '"', 'self-reliance', '"', 'as', 'a', 'check', 'against', 'outside', 'influence', '.', 'After', 'failing', 'in', 'the', 'Korean', 'War', '(', '1950', '-', '53', ')', 'to', 'conquer', 'the', 'US-backe

### Train Val Test splitting

In [13]:
def train_val_test_split(data, split_size=0.1, seed=None):
    """Randomly partition ``data`` into train, validation and test lists.

    The validation and test splits each receive ``int(len(data) * split_size)``
    items; everything else goes to the training split.

    Args:
        data: Sequence of examples. It is copied, so the caller's object is
            never reordered.
        split_size: Fraction of ``data`` given to each of the validation and
            test splits.
        seed: Optional seed for a private random generator, making the split
            reproducible. With ``None`` the module-level ``random`` state is
            used.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple of lists.

    Raises:
        ValueError: If the computed split size is negative or the validation
            and test splits together would exceed ``len(data)``.
    """
    rng = random if seed is None else random.Random(seed)

    # Shuffle a copy: after a uniform shuffle, contiguous slices are random,
    # disjoint samples, so no per-item membership tests are needed.
    shuffled = list(data)
    rng.shuffle(shuffled)

    total_size = len(shuffled)
    test_size = val_size = int(total_size * split_size)
    if test_size < 0 or test_size + val_size > total_size:
        raise ValueError(
            f"split_size={split_size} does not fit validation and test splits "
            f"into {total_size} items"
        )

    holdout_end = test_size + val_size
    test_data = shuffled[:test_size]
    val_data = shuffled[test_size:holdout_end]
    train_data = shuffled[holdout_end:]
    return train_data, val_data, test_data

In [14]:
# Split the parsed records; split_size=0.1 is passed for both the validation and test share.
train, val, test = train_val_test_split(data_json, split_size=0.1)
print(train[:3])

[{'id': 18077, 'ner_tags': [0, 0, 0, 0, 0, 6, 4, 4, 4, 0, 0, 0, 1, 0, 1, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'tokens': ['Unofficial', 'results', 'show', 'ruling', 'party', 'Prime', 'Minister', 'Jose', 'Ramos-Horta', 'and', 'parliament', 'chief', 'Francisco', '"', 'Lu', 'Olo', '"', 'Guterres', 'will', 'be', 'competing', 'in', 'a', 'runoff', 'election', '.']}, {'id': 16743, 'ner_tags': [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 4, 0, 0, 0, 0, 0, 0], 'tokens': ['The', 'attacks', 'came', 'as', 'negotiators', 'from', "Shi'ite", 'and', 'Kurdish', 'factions', 'that', 'dominate', 'parliament', 'continued', 'talks', 'aimed', 'at', 'winning', 'last-minute', 'Sunni', 'Arab', 'backing', 'for', 'the', 'draft', 'constitution', '.']}, {'id': 13280, 'ner_tags': [6, 4, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'tokens': ['President', 'Mesa', 'submitted', 'his', 'resignation', 'Monday', ',', 'saying', 'he', 'could', 'no', 'longer', 'lead', 'the'

In [15]:
# json_dump
# Each split name is rebound from a list of records to its JSON text.
train, val, test = (json.dumps(records) for records in (train, val, test))

In [16]:
# Write every serialised split to ../data/huggingface/<split>.json.
serialised_splits = {'train': train, 'val': val, 'test': test}
for file, payload in serialised_splits.items():
    with open(f"../data/huggingface/{file}.json", 'w') as f:
        f.write(payload)

In [17]:
# read back data
def _read_split(path):
    """Parse one JSON split file back into Python objects."""
    with open(path, 'r') as f:
        return json.load(f)


train = _read_split("../data/huggingface/train.json")
val = _read_split("../data/huggingface/val.json")
test = _read_split("../data/huggingface/test.json")

In [18]:
# Confirm a record survives the JSON round trip.
print(train[2])

{'id': 13280, 'ner_tags': [6, 4, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'tokens': ['President', 'Mesa', 'submitted', 'his', 'resignation', 'Monday', ',', 'saying', 'he', 'could', 'no', 'longer', 'lead', 'the', 'poor', 'Andean', 'nation', 'in', 'the', 'face', 'of', 'continuing', 'large', 'demonstrations', 'over', 'his', 'government', "'s", 'policies', '.']}


### Inspecting the required dataset format by downloading and loading a dummy dataset

In [19]:
from datasets import load_dataset

# wnut_17 is loaded only as a reference for the record layout a token-classification dataset uses.
wnut = load_dataset("wnut_17")
wnut

Found cached dataset wnut_17 (C:/Users/ritap/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3394
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1009
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 1287
    })
})

In [20]:
# Features and row count of the reference training split.
print(wnut["train"])

Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 3394
})


In [21]:
# One reference record: 'id', 'tokens' and integer 'ner_tags'.
print(wnut["train"][2])

{'id': '2', 'tokens': ['Pxleyes', 'Top', '50', 'Photography', 'Contest', 'Pictures', 'of', 'August', '2010', '...', 'http://bit.ly/bgCyZ0', '#photography'], 'ner_tags': [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


### Our dataset is a list of dictionaries, so we use `Dataset.from_list(data)` to convert it into a `Dataset`

In [22]:
from datasets import Dataset

In [23]:
# List the attributes of Dataset to locate a suitable constructor (from_list is used below).
print(dir(Dataset))

['_TF_DATASET_REFS', '__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getitems__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_build_local_temp_path', '_check_index_is_initialized', '_estimate_nbytes', '_generate_examples_from_shards', '_get_cache_file_path', '_get_output_signature', '_getitem', '_map_single', '_new_dataset_with_indices', '_push_parquet_shards_to_hub', '_save_to_disk_single', '_select_contiguous', '_select_with_indices_mapping', 'add_column', 'add_elasticsearch_index', 'add_faiss_index', 'add_faiss_index_from_external_arrays', 'add_item', 'align_labels_with_mapping', 'builder_name', 'cache_files', 'cast', 'cast_column', 'citation', 'class_encode

In [25]:
# Wrap the train and test lists of dicts as datasets.Dataset objects (the names are rebound).
# NOTE(review): `val` is not converted in this cell — confirm whether that is intended.
train = Dataset.from_list(train)
test = Dataset.from_list(test)
print(train)

Dataset({
    features: ['id', 'ner_tags', 'tokens'],
    num_rows: 16000
})


In [26]:
# Indexing the Dataset returns the same record dict as the original list did.
print(train[2])

{'id': 13280, 'ner_tags': [6, 4, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'tokens': ['President', 'Mesa', 'submitted', 'his', 'resignation', 'Monday', ',', 'saying', 'he', 'could', 'no', 'longer', 'lead', 'the', 'poor', 'Andean', 'nation', 'in', 'the', 'face', 'of', 'continuing', 'large', 'demonstrations', 'over', 'his', 'government', "'s", 'policies', '.']}


### Tokenization

In [27]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") # tokenizer of the distilbert-base-uncased checkpoint; the model should be loaded from the same checkpoint

In [31]:
example = train[2] # fixed example (index 2) used for inspection
# is_split_into_words=True: the input is already a list of words, not a raw string
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)

In [32]:
# Map the input ids back to token strings to see the special tokens and subword pieces.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', 'president', 'mesa', 'submitted', 'his', 'resignation', 'monday', ',', 'saying', 'he', 'could', 'no', 'longer', 'lead', 'the', 'poor', 'and', '##ean', 'nation', 'in', 'the', 'face', 'of', 'continuing', 'large', 'demonstrations', 'over', 'his', 'government', "'", 's', 'policies', '.', '[SEP]']


### The Hugging Face documentation explains the next step; it is quoted verbatim below:
##### However, this adds some special tokens [CLS] and [SEP] and the subword tokenization creates a mismatch between the input and labels. A single word corresponding to a single label may now be split into two subwords. You’ll need to realign the tokens and labels by:

##### Mapping all tokens to their corresponding word with the word_ids method.
##### Assigning the label -100 to the special tokens [CLS] and [SEP] so they’re ignored by the PyTorch loss function.
##### Only labeling the first token of a given word. Assign -100 to other subtokens from the same word.
##### Here is how you can create a function to realign the tokens and labels, and truncate sequences to be no longer than DistilBERT’s maximum input length:

In [33]:
# this function does the above
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    Args:
        examples: Batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of integer labels per sentence).

    Returns:
        The tokenizer output for the batch with an extra "labels" entry: one
        list per sentence holding the word's label on the first token of each
        word and -100 on special tokens and on the remaining tokens of a word.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        # word index of every token; None marks a special token
        token_word_ids = encoded.word_ids(batch_index=batch_idx)
        sentence_labels = []
        last_word_id = None
        for word_id in token_word_ids:
            starts_new_word = word_id is not None and word_id != last_word_id
            if starts_new_word:
                sentence_labels.append(word_labels[word_id])
            else:
                # special token, or a continuation piece of the current word
                sentence_labels.append(-100)
            last_word_id = word_id
        aligned_labels.append(sentence_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [34]:
# mapping
# batched=True hands tokenize_and_align_labels a batch of examples per call.
train_tokenized, test_tokenized = (
    split.map(tokenize_and_align_labels, batched=True) for split in (train, test)
)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [35]:
# The mapped dataset gains the 'input_ids', 'attention_mask' and 'labels' columns.
print(train_tokenized)

Dataset({
    features: ['id', 'ner_tags', 'tokens', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 16000
})


In [36]:
# Inspect one tokenized record, including the aligned 'labels' with -100 placeholders.
print(train_tokenized[2])

{'id': 13280, 'ner_tags': [6, 4, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'tokens': ['President', 'Mesa', 'submitted', 'his', 'resignation', 'Monday', ',', 'saying', 'he', 'could', 'no', 'longer', 'lead', 'the', 'poor', 'Andean', 'nation', 'in', 'the', 'face', 'of', 'continuing', 'large', 'demonstrations', 'over', 'his', 'government', "'s", 'policies', '.'], 'input_ids': [101, 2343, 15797, 7864, 2010, 8172, 6928, 1010, 3038, 2002, 2071, 2053, 2936, 2599, 1996, 3532, 1998, 11219, 3842, 1999, 1996, 2227, 1997, 5719, 2312, 13616, 2058, 2010, 2231, 1005, 1055, 6043, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 6, 4, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100, 0, 0, -100]}


In [37]:
# save as pickle file
import pickle

# (destination path, tokenized dataset) pairs, written in this order
pickle_targets = (
    ('../data/huggingface/train_tokenized.pkl', train_tokenized),
    ('../data/huggingface/test_tokenized.pkl', test_tokenized),
)
for pickle_path, tokenized_split in pickle_targets:
    with open(pickle_path, 'wb') as file:
        pickle.dump(tokenized_split, file)

## Rough work

In [109]:
import inspect as i
import sys
# Show the library source of word_ids(). `tokenized_inputs` exists only as a local
# inside tokenize_and_align_labels, so the notebook-level `tokenized_input` is used.
sys.stdout.write(i.getsource(tokenized_input.word_ids))

    def word_ids(self, batch_index: int = 0) -> List[Optional[int]]:
        """
        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.

        Args:
            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.

        Returns:
            `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
            tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
            (several tokens will be mapped to the same word index if they are parts of that word).
        """
        if not self._encodings:
            raise ValueError(
                "word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
                " class)."
            )
        return self._encodings[batch_index].word_ids


930

In [156]:
# Show the library source of word_to_tokens(). `tokenized_inputs` exists only as a local
# inside tokenize_and_align_labels, so the notebook-level `tokenized_input` is used.
sys.stdout.write(i.getsource(tokenized_input.word_to_tokens))

    def word_to_tokens(
        self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0
    ) -> Optional[TokenSpan]:
        """
        Get the encoded token span corresponding to a word in a sequence of the batch.

        Token spans are returned as a [`~tokenization_utils_base.TokenSpan`] with:

        - **start** -- Index of the first token.
        - **end** -- Index of the token following the last token.

        Can be called as:

        - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1
        - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to
          1

        This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
        are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
        words.

        Args:
            batch_or_word_index (`i

2619

In [121]:
# Repeat the tokenization inspection on a wnut_17 record.
# Note: this rebinds `example`, `tokenized_input` and `tokens` from the earlier cells.
example = wnut["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]']


In [129]:
# List the attributes of the tokenizer output (word_ids, word_to_tokens, ... appear here).
print(dir(tokenized_input))

['_MutableMapping__marker', '__abstractmethods__', '__class__', '__contains__', '__copy__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__reversed__', '__setattr__', '__setitem__', '__setstate__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_encodings', '_n_sequences', 'char_to_token', 'char_to_word', 'clear', 'convert_to_tensors', 'copy', 'data', 'encodings', 'fromkeys', 'get', 'is_fast', 'items', 'keys', 'n_sequences', 'pop', 'popitem', 'sequence_ids', 'setdefault', 'to', 'token_to_chars', 'token_to_sequence', 'token_to_word', 'tokens', 'update', 'values', 'word_ids', 'word_to_chars', 'word_to_tokens', 'words']
