# NERTag Docs

A minimal walkthrough of how to interact with the NERTag auto-labeling package.

In [1]:
import sys

# Make the parent directory importable (presumably where the `nertag` package
# lives — adjust if the notebook is moved).
# Guarded so that re-running this cell does not append duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

In [2]:
import string
from collections import Counter

import pandas as pd
from nertag import ner, preprocess, tagging, tool, utils

## 1. Setup

---

> Note: Data used in this example comes from [collinsdictionary](https://www.collinsdictionary.com/us/word-lists).

In [3]:
# --- Data
path = "../data/data.txt"

# Read the corpus as one text per line.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default.
# NOTE: split("\n") keeps a final empty string if the file ends with a newline.
with open(path, "r", encoding="utf-8") as f:
    texts = f.read().split("\n")

In [4]:
# NOTE: All input text must contain at most a single whitespace between words.
# Also punctuation characters must always be connected to a word, else offsets won't align.
# i.e. (Hello . Need my account info) will fail...


def custom_filter(text: str, filters: set = set(string.punctuation)):
    """Remove whitespace-separated tokens that are members of ``filters``.

    The text is split on whitespace, every token that is itself contained in
    ``filters`` (by default each single ``string.punctuation`` character) is
    dropped, and the remaining tokens are joined back with single spaces.
    Punctuation attached to a word (e.g. ``"info."``) is left untouched.

    Args:
        text: The raw input string.
        filters: Tokens to discard.

    Returns:
        The filtered text with exactly one space between the kept tokens.
    """
    kept_tokens = (token for token in text.split() if token not in filters)
    return " ".join(kept_tokens)

In [5]:
# Run every text through the standalone-punctuation filter (default filters).
texts = list(map(custom_filter, texts))

In [6]:
# Use the second text (index 1) as the running example; the bare name displays it.
text = texts[1]
text

'a geometric figure having all its sides of equal length'

In [7]:
# --- Labels
path = "../data/taxonomy.csv"

# Load the taxonomy CSV and preprocess it; in the frame displayed below, each
# example appears as a tuple of tokens under its entity column.
df = utils.preprocess_df(
    pd.read_csv(path),
    stemmer=utils.lemmatizer,
    filters=utils.stop_words,
    tokenizer=utils.tokenizer,
)
# Lookup structure passed to the Tagger later in the notebook.
# NOTE(review): presumably maps each example's token set to its entity — confirm in utils.setup_dict.
dct = utils.setup_dict(df)

In [8]:
# Display the preprocessed taxonomy (one column per entity).
df

Unnamed: 0,MATH,PHYSICS,BIOLOGY
0,"(angle, acute)",(acceleration),(aerobic)
1,(addition),"(alternating, current)",(agglutination)
2,"(algorithm, or, algorism)",(ampere),(albino)
3,(angle),(amplifier),"(allelomorph, or, allele)"
4,(arc),(angstrom),(anaerobic)
...,...,...,...
171,"(fraction, vulgar)",,
172,(x-axis),,
173,(y-axis),,
174,(z-axis),,


### A. Preprocessor

Define text preprocessing function.

The inputs in order are:

1. preprocessing function
2. stemming function
3. stopwords list
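4. largest n_gram (`start`)
5. smallest n_gram - 1 (`stop`, exclusive)
6. next n_gram size (`step`)

> Note: items 4, 5, and 6 act like the `start`, `stop`, and `step` arguments of `range`; e.g. `start=4, stop=0, step=-1` produces 4-, 3-, 2-, and 1-grams.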

In [9]:
# Build the text preprocessor. start/stop/step choose the n-gram sizes: with
# 4, 0, -1 the output of the next cell is grouped as 4-grams, 3-grams,
# 2-grams, then single words.
preprocessor = ner.Preprocessor(
    preprocess.preprocess,
    stemmer=utils.lemmatizer,
    stop_words=utils.stop_words,
    start=4,
    stop=0,
    step=-1,
)

In [10]:
"""
Input: a string of text
Output: a dict of preprocessed n_grams and word indices
"""

# For the sample text the result has two parallel keys, 'words' and 'offsets':
# every n-gram in 'words' has a matching string of word indices in 'offsets'.
preprocessor(text)

{'words': [['geometric figure side equal', 'figure side equal length'],
  ['geometric figure side', 'figure side equal', 'side equal length'],
  ['geometric figure', 'figure side', 'side equal', 'equal length'],
  ['geometric', 'figure', 'side', 'equal', 'length']],
 'offsets': [['1 2 6 8', '2 6 8 9'],
  ['1 2 6', '2 6 8', '6 8 9'],
  ['1 2', '2 6', '6 8', '8 9'],
  ['1', '2', '6', '8', '9']]}

### B. BaseLabeler

The inputs in order are:

1. base labeling function
2. tokenization (word-splitting) function

Define initial labeling behavior function.

In [11]:
# Pair the base labeling function with the tokenizer; the resulting object is
# called directly on raw text in the next cell.
baselabeler = ner.BaseLabeler(utils.base_label, utils.tokenizer)

In [12]:
"""
Input: a string of text
Output: a list of base labels for each word
"""

# For the sample text every word gets the default entity 'O' together with its
# word index and start/end character offsets (see the output below).
baselabeler(text)

[{'entity': 'O', 'word': 'a', 'index': 0, 'start': 0, 'end': 1},
 {'entity': 'O', 'word': 'geometric', 'index': 1, 'start': 2, 'end': 11},
 {'entity': 'O', 'word': 'figure', 'index': 2, 'start': 12, 'end': 18},
 {'entity': 'O', 'word': 'having', 'index': 3, 'start': 19, 'end': 25},
 {'entity': 'O', 'word': 'all', 'index': 4, 'start': 26, 'end': 29},
 {'entity': 'O', 'word': 'its', 'index': 5, 'start': 30, 'end': 33},
 {'entity': 'O', 'word': 'sides', 'index': 6, 'start': 34, 'end': 39},
 {'entity': 'O', 'word': 'of', 'index': 7, 'start': 40, 'end': 42},
 {'entity': 'O', 'word': 'equal', 'index': 8, 'start': 43, 'end': 48},
 {'entity': 'O', 'word': 'length', 'index': 9, 'start': 49, 'end': 55}]

### C. Tagger

The inputs in order are:

1. ner tagging function
2. taxonomy describing each entity

Define named-entity tagging logic function.

In [13]:
# Combine the tagging function with the taxonomy dictionary built during setup.
tagger = ner.Tagger(tagging.ner_tagging, dct)

In [14]:
"""
Input: a dict of n_grams and word indices
Output: a list of labels for each word defined in the taxonomy
"""

# The tagger consumes the preprocessor's output, not the raw text. For the
# sample text only the matched words ('geometric', 'equal') are returned.
tagger(preprocessor(text))

[{'entity': 'B-MATH', 'word': 'geometric', 'index': 1},
 {'entity': 'B-MATH', 'word': 'equal', 'index': 8}]

### D. Pipeline

Create the NER labeling pipeline.

In [15]:
# Assemble the three components into a single labeling pipeline.
pipeline = ner.NER(preprocessor, baselabeler, tagger)

In [16]:
# Label the single sample text and tabulate the per-word records; in the output
# the tagged words carry their entity while all other words keep 'O'.
pd.DataFrame(pipeline.label(text))

Unnamed: 0,entity,word,index,start,end
0,O,a,0,0,1
1,B-MATH,geometric,1,2,11
2,O,figure,2,12,18
3,O,having,3,19,25
4,O,all,4,26,29
5,O,its,5,30,33
6,O,sides,6,34,39
7,O,of,7,40,42
8,B-MATH,equal,8,43,48
9,O,length,9,49,55


## 2. Label

---

### A. Sequential

Sequential labeling is a great choice for testing or for smaller datasets.

In [17]:
%%time

# Label the whole corpus sequentially; `results` is tabulated in the next cell.
results = pipeline.sequential_labeling(texts)


  0%|                                                                                                 | 0/256 [00:00<?, ?it/s][A
 94%|████████████████████████████████████████████████████████████████████████████████     | 241/256 [00:00<00:00, 2408.41it/s][A
                                                                                                                              [A

CPU times: user 111 ms, sys: 22.6 ms, total: 134 ms
Wall time: 143 ms


In [18]:
# Tabulate the labels of all texts in one DataFrame.
utils.to_dataframe(results)

Unnamed: 0,entity,word,index,start,end
0,O,a,0,0,1
1,B-MATH,mathematical,1,2,14
2,O,statement,2,15,24
3,O,that,3,25,29
4,O,two,4,30,33
...,...,...,...,...,...
7,O,an,7,37,39
8,B-BIOLOGY,ovum,8,40,44
9,O,and,9,45,48
10,O,a,10,49,50


### B. Parallel

Parallel labeling is better with larger datasets.

It also takes additional optional arguments:

* `chunksize`: defaults to 1
* `max_workers`: defaults to `os.cpu_count()`.

In [19]:
%%time

# Label the whole corpus in parallel with the default chunksize / max_workers.
# NOTE: on this 256-text corpus the recorded run was slower than the sequential
# one (7.43 s vs 143 ms wall time), so this only pays off on larger datasets.
results = pipeline.parallel_labeling(texts)

  0%|          | 0/256 [00:00<?, ?it/s]

CPU times: user 202 ms, sys: 185 ms, total: 387 ms
Wall time: 7.43 s


In [20]:
# Tabulate the parallel results; the displayed rows match the sequential run.
utils.to_dataframe(results)

Unnamed: 0,entity,word,index,start,end
0,O,a,0,0,1
1,B-MATH,mathematical,1,2,14
2,O,statement,2,15,24
3,O,that,3,25,29
4,O,two,4,30,33
...,...,...,...,...,...
7,O,an,7,37,39
8,B-BIOLOGY,ovum,8,40,44
9,O,and,9,45,48
10,O,a,10,49,50


## 3. Save

---

During the labeling phase, it is possible that some labels were assigned to examples not defined within the taxonomy. An updated taxonomy with these labels can be created as follows:

In [21]:
# Derive an updated taxonomy from the labeled results and display it.
# Punctuation attached to words in the corpus shows up in the entries
# (e.g. "asexual," or "volume:" in the output below).
taxonomyv2 = utils.to_taxonomy(results)
taxonomyv2

Unnamed: 0,MATH,BIOLOGY,PHYSICS
0,(mathematical,asexual,acceleration
1,Cartesian coordinate,"asexual,",alternating current
2,addition,"bacteria,",ampere
3,"addition,",cell,atom
4,angle,"cell,",atoms
...,...,...,...
191,vector,,
192,volume:,,
193,zero,,
194,"zero,",,


In [22]:
# Persist the updated taxonomy without the row index.
# `path` is rebound here and read back by the revision section below.
path = "../data/taxonomy2.csv"
taxonomyv2.to_csv(path, index=False)

> This can be used to:
> * grow the taxonomy for free
> * verify the quality of the labeling process.

## 4. Revisions

After making any changes to the taxonomy, whether it be adding new terms or removing old ones, it is important to ensure that the examples under each entity are consistent. A built-in tool using the same preprocessing and word set methodology used in the **NERTag** auto-labeler is used for revising conflicts between entities in the taxonomy.

> Note: Although this is listed as "step 4", the reviser should be run any time the taxonomy is about to be used for labeling — including before step 1 of this notebook.

In [23]:
# Reload the saved v2 taxonomy. `path` still points to the file written in the
# Save section, and `df` is rebound from the preprocessed frame to this raw one.
df = pd.read_csv(path)

In [24]:
# --- Example conflict
# Copy the first MATH example into the PHYSICS column of row 195 so that the
# same example sits under two entities.
# A single .iloc[row, col] call writes into `df` itself. The chained form
# df.iloc[195]["PHYSICS"] = ... assigns to a temporary row object, is not
# guaranteed to reach `df`, and never does under pandas Copy-on-Write.
df.iloc[195, df.columns.get_loc("PHYSICS")] = df.iloc[0, df.columns.get_loc("MATH")]
df

Unnamed: 0,MATH,BIOLOGY,PHYSICS
0,(mathematical,asexual,acceleration
1,Cartesian coordinate,"asexual,",alternating current
2,addition,"bacteria,",ampere
3,"addition,",cell,atom
4,angle,"cell,",atoms
...,...,...,...
191,vector,,
192,volume:,,
193,zero,,
194,"zero,",,


In [25]:
# --- Create reviser
reviser = tool.TaxonomyReviser(df)

# Preprocess with the same function and arguments that were used to build the
# taxonomy dictionary during setup.
# NOTE(review): presumably this is what lets the reviser compare examples across entities — confirm in tool.TaxonomyReviser.
reviser.preprocess(
    utils.preprocess_df,
    df,
    stemmer=utils.lemmatizer,
    filters=utils.stop_words,
    tokenizer=utils.tokenizer,
)

In [26]:
# Start the revision. This is interactive: each conflict prompts for the entity
# the example should belong to (see the prompt in the output below).
# NOTE(review): save_name looks like an output path without extension — confirm in tool.TaxonomyReviser.revise.
history = reviser.revise(save_name="../data/taxonomy3")

2022-04-26 14:12:42 INFO     PROGRESS: 0 / 1
Choose an entity that the example should belong to:
- Labels: ['PHYSICS', 'MATH']
- Examples: ['(mathematical']
- Choices:



Enter an index (int) from [0, 1], -1 meaning neither:  1


2022-04-26 14:12:48 INFO     > Selected: MATH
2022-04-26 14:12:48 INFO     CHANGES:
2022-04-26 14:12:48 INFO     - MOVED `(mathematical` FROM [PHYSICS] TO [MATH]
2022-04-26 14:12:48 INFO     - MOVED `(mathematical` FROM [MATH] TO [MATH]
2022-04-26 14:12:48 INFO     
+---------------------------------------------------------------------------+



In [27]:
# NOTE: df != df_raw — the frame held by the reviser is displayed here; its rows
# differ from the `df` shown earlier (compare row 0 of MATH in both outputs).
reviser.df_raw

Unnamed: 0,MATH,BIOLOGY,PHYSICS
0,Cartesian coordinate,asexual,acceleration
1,addition,"asexual,",alternating current
2,"addition,","bacteria,",ampere
3,angle,cell,atom
4,angle of which is a right angle,"cell,",atoms
...,...,...,...
191,volume:,,
192,zero,,
193,"zero,",,
194,zero.,,


## 5. Extra

Not part of the library, but we can also check the most popular words in the corpus that are defined in the taxonomy.

In [28]:
# Re-run the setup preprocessing on the v2 taxonomy frame (`df`) and build its
# lookup dictionary; the keys of dct2 are token sets mapped to entity names
# (see the frozenset keys in the outputs further down).
df2 = utils.preprocess_df(
    df, stemmer=utils.lemmatizer, filters=utils.stop_words, tokenizer=utils.tokenizer,
)
dct2 = utils.setup_dict(df2)

In [29]:
# Display the preprocessed v2 taxonomy.
df2

Unnamed: 0,MATH,BIOLOGY,PHYSICS
0,((mathematical),(asexual),(acceleration)
1,"(Cartesian, coordinate)","(asexual,)","(alternating, current)"
2,(addition),"(bacteria,)",(ampere)
3,"(addition,)",(cell),(atom)
4,(angle),"(cell,)",(atom)
...,...,...,...
191,(vector),,
192,(volume:),,
193,(zero),,
194,"(zero,)",,


In [30]:
# Group the taxonomy keys (token sets) by the entity they are mapped to.
entities = {
    label: [key for key, mapped in dct2.items() if mapped == label]
    for label in set(dct2.values())
}
# Tally every token over all taxonomy keys.
words = Counter(token for key in dct2 for token in key)

In [31]:
# Show the top_k most frequent taxonomy tokens.
# most_common(top_k) returns the same leading entries, in the same order, as
# slicing the full ranking, without sorting the whole counter first.
top_k = 10
words.most_common(top_k)

[('number', 8),
 ('a', 5),
 ('angle', 4),
 ('is', 3),
 ('the', 3),
 ('right', 2),
 ('which', 2),
 ('an', 2),
 ('denominator', 2),
 ('complex', 2)]

In [32]:
# Map every taxonomy key that contains the token "number" to its entity.
{
    key: label
    for label, keys in entities.items()
    for key in keys
    if "number" in key
}

{frozenset({'complex', 'number'}): 'MATH',
 frozenset({'irrational', 'number'}): 'MATH',
 frozenset({'number'}): 'MATH',
 frozenset({'a', 'complex', 'is', 'number', 'number;'}): 'MATH',
 frozenset({'a', 'number', 'number,', 'or', 'real'}): 'MATH',
 frozenset({'an', 'irrational', 'number', 'number;', 'or'}): 'MATH',
 frozenset({'number', 'rational'}): 'MATH',
 frozenset({'number', 'real'}): 'MATH'}