## Explore CoNLL-2003 NER Tags

In [1]:
import pandas as pd

In [2]:
# Locations of the three CoNLL-2003 English NER split files that are read below
# (train / validation / test).
# NOTE(review): these are absolute paths under one user's home directory —
# adjust them before running the notebook on another machine.
TRAIN_FILE_PATH = '/Users/tmorrill002/Documents/datasets/conll/raw/ner/eng.train'
VAL_FILE_PATH = '/Users/tmorrill002/Documents/datasets/conll/raw/ner/eng.testa'
TEST_FILE_PATH = '/Users/tmorrill002/Documents/datasets/conll/raw/ner/eng.testb'

In [3]:
def read_data(file_path):
    """Return the full contents of the text file at ``file_path`` as one string."""
    with open(file_path, 'r') as handle:
        return handle.read()

In [4]:
# load the raw text of every split in one go (train, validation, test)
train_data, val_data, test_data = map(
    read_data, (TRAIN_FILE_PATH, VAL_FILE_PATH, TEST_FILE_PATH)
)

In [5]:
def split_documents(data):
    """Cut raw CoNLL text into one string per document.

    The text is split at every ``-DOCSTART-`` marker line (together with the
    blank line that follows it). Anything before the first marker is dropped,
    so text without a marker yields an empty list.
    """
    doc_marker = '-DOCSTART- -X- -X- O\n\n'
    _preamble, *documents = data.split(doc_marker)
    return documents

In [6]:
# raw text -> list of document strings, for each split
train_data, val_data, test_data = map(
    split_documents, (train_data, val_data, test_data)
)

In [7]:
def split_sentences(data):
    """Split every document string into sentence strings.

    Sentences are separated by a blank line (``'\\n\\n'``). Empty pieces, such
    as the one after a trailing blank line, are kept as ``''``.
    """
    return [doc.split('\n\n') for doc in data]

In [8]:
# document strings -> lists of sentence strings, for each split
train_data, val_data, test_data = map(
    split_sentences, (train_data, val_data, test_data)
)

In [9]:
def split_tokens(data):
    """Split every sentence string into its token lines.

    Each sentence is split on newlines and blank lines are dropped, so an
    empty sentence string becomes an empty list (it is not removed).
    """
    return [
        [
            [line for line in sentence.split('\n') if line != '']
            for sentence in doc
        ]
        for doc in data
    ]

In [10]:
# sentence strings -> lists of token lines, for each split
train_data, val_data, test_data = map(
    split_tokens, (train_data, val_data, test_data)
)

In [11]:
def split_tags(data):
    """Split every token line into its space-separated fields.

    The nesting (documents -> sentences -> tokens) is preserved; each token
    line is replaced by the list returned from ``line.split(' ')``.
    """
    return [
        [[example.split(' ') for example in sentence] for sentence in doc]
        for doc in data
    ]

In [12]:
# token lines -> [word, POS tag, chunk tag, NER tag] field lists, for each split
train_data, val_data, test_data = map(
    split_tags, (train_data, val_data, test_data)
)

### Data Checks

In [13]:
# refer to original paper for data checks
# https://www.aclweb.org/anthology/W03-0419.pdf

![Data Checks](data_checks.png)

In [14]:
# expected number of articles (documents) in each split
TRAIN_ARTICLES = 946
VAL_ARTICLES = 216
TEST_ARTICLES = 231
# expected value on the left, as in the other data-check cells
assert TRAIN_ARTICLES == len(train_data)
assert VAL_ARTICLES == len(val_data)
assert TEST_ARTICLES == len(test_data)

In [15]:
# expected number of sentences in each split, summed over all documents
TRAIN_SENTENCES = 14_987
VAL_SENTENCES = 3_466
TEST_SENTENCES = 3_684
assert TRAIN_SENTENCES == sum(len(doc) for doc in train_data)
assert VAL_SENTENCES == sum(len(doc) for doc in val_data)
assert TEST_SENTENCES == sum(len(doc) for doc in test_data)

In [16]:
def token_count(data):
    """Return the total number of tokens across all sentences of all documents."""
    return sum(len(sentence) for doc in data for sentence in doc)

In [17]:
# expected total number of tokens in each split; see the paper linked in the
# "Data Checks" section for the reference statistics
TRAIN_TOKENS = 203_621
VAL_TOKENS = 51_362
TEST_TOKENS = 46_435
# token_count sums the sentence lengths over every document of a split
assert TRAIN_TOKENS == token_count(train_data)
assert VAL_TOKENS == token_count(val_data)
assert TEST_TOKENS == token_count(test_data)

In [18]:
def count_tags(data):
    """Count named-entity spans per entity type.

    ``data`` is a list of documents, each a list of sentences, each a list of
    4-item tokens ``[word, pos_tag, chunk_tag, ner_tag]``.

    A new span is counted when a token's tag is ``B-<TYPE>``, or is
    ``I-<TYPE>`` while the previous token in the same sentence did not belong
    to an entity of that same type. Consecutive ``I-<TYPE>`` tokens therefore
    count once, and spans never cross a sentence boundary. Only the types
    LOC, MISC, ORG and PER are counted; every other tag is treated as
    "outside".

    Returns a dict mapping ``'LOC'``, ``'MISC'``, ``'ORG'`` and ``'PER'`` to
    their span counts.

    Raises ValueError if a token does not have exactly four fields.
    """
    tag_counts = {'LOC': 0, 'MISC': 0, 'ORG': 0, 'PER': 0}
    for doc in data:
        for sentence in doc:
            # entity type of the previous token, or None when that token was
            # not part of a counted entity (also the state at sentence start)
            previous_type = None
            for _word, _pos_tag, _chunk_tag, ner_tag in sentence:
                prefix, _, entity_type = ner_tag.partition('-')
                if prefix not in ('B', 'I') or entity_type not in tag_counts:
                    previous_type = None
                    continue
                # 'B-' always opens a new span; 'I-' opens one only when it
                # does not continue a span of the same type
                if prefix == 'B' or entity_type != previous_type:
                    tag_counts[entity_type] += 1
                previous_type = entity_type
    return tag_counts

In [19]:
# expected number of named entities per type in each split; see the paper
# linked in the "Data Checks" section for the reference statistics
TRAIN_TAGS = {'LOC': 7140, 'MISC': 3438, 'ORG': 6321, 'PER': 6600}
VAL_TAGS = {'LOC': 1837, 'MISC': 922, 'ORG': 1341, 'PER': 1842}
TEST_TAGS = {'LOC': 1668, 'MISC': 702, 'ORG': 1661, 'PER': 1617}
# count_tags returns a dict with the same four keys, so the dicts compare directly
assert TRAIN_TAGS == count_tags(train_data)
assert VAL_TAGS == count_tags(val_data)
assert TEST_TAGS == count_tags(test_data)

In [20]:
def id_tags(data):
    """Append a running tag ID to every token, in place.

    ``data`` is a list of documents, each a list of sentences, each a list of
    4-item tokens ``[word, pos_tag, chunk_tag, ner_tag]``. Every token is
    replaced by ``[word, pos_tag, chunk_tag, ner_tag, tag_id]``.

    IDs start at 0 and increase through the whole of ``data``. All tokens of
    one entity span share an ID: a token continues the current span when its
    tag is ``I-<TYPE>`` and the previous token in the same sentence belongs
    to an entity of that same type (LOC, MISC, ORG or PER). Every other
    token, including each non-entity token, gets the next fresh ID. Spans
    never cross a sentence boundary.

    Returns the same ``data`` object that was passed in.

    Raises ValueError if a token does not have exactly four fields, which
    also happens if the function is applied twice to the same data.
    """
    entity_types = ('LOC', 'MISC', 'ORG', 'PER')
    next_tag_id = 0
    tag_id = 0
    for doc in data:
        for sentence in doc:
            # entity type of the previous token, or None when that token was
            # not part of a recognised entity (also the state at sentence start)
            open_type = None
            for i, (word, pos_tag, chunk_tag, ner_tag) in enumerate(sentence):
                prefix, _, entity_type = ner_tag.partition('-')
                if prefix not in ('B', 'I') or entity_type not in entity_types:
                    entity_type = None
                continues_span = (
                    entity_type is not None
                    and prefix == 'I'
                    and entity_type == open_type
                )
                if not continues_span:
                    tag_id = next_tag_id
                    next_tag_id += 1
                # replacing the element (not resizing the list) is safe while
                # enumerating it
                sentence[i] = [word, pos_tag, chunk_tag, ner_tag, tag_id]
                open_type = entity_type
    return data

In [21]:
# append a running tag ID to every token of each split
train_data, val_data, test_data = map(
    id_tags, (train_data, val_data, test_data)
)

### Convert to Pandas and store

In [22]:
# normalize data and retain document, sentence, and token IDs

In [23]:
# train data: one flat row per token, prefixed with its document, sentence
# and token position
normalized_list = [
    [doc_id, sentence_id, token_id, *example]
    for doc_id, doc in enumerate(train_data)
    for sentence_id, sentence in enumerate(doc)
    for token_id, example in enumerate(sentence)
]

In [24]:
# column names: the three position IDs, then the token fields and its tag ID
cols = [
    'Doc_ID',
    'Sentence_ID',
    'Token_ID',
    'Token',
    'POS_Tag',
    'Chunk_Tag',
    'NER_Tag',
    'NER_Tag_ID',
]
df = pd.DataFrame(data=normalized_list, columns=cols)

In [25]:
# preview the first 20 rows of the normalized training table
df.head(20)

Unnamed: 0,Doc_ID,Sentence_ID,Token_ID,Token,POS_Tag,Chunk_Tag,NER_Tag,NER_Tag_ID
0,0,0,0,EU,NNP,I-NP,I-ORG,0
1,0,0,1,rejects,VBZ,I-VP,O,1
2,0,0,2,German,JJ,I-NP,I-MISC,2
3,0,0,3,call,NN,I-NP,O,3
4,0,0,4,to,TO,I-VP,O,4
5,0,0,5,boycott,VB,I-VP,O,5
6,0,0,6,British,JJ,I-NP,I-MISC,6
7,0,0,7,lamb,NN,I-NP,O,7
8,0,0,8,.,.,O,O,8
9,0,1,0,Peter,NNP,I-NP,I-PER,9


In [26]:
# do the tags ever start with B?
# select the rows whose NER_Tag string begins with 'B' (e.g. B-LOC, B-MISC)
df[df['NER_Tag'].str.startswith('B')]

Unnamed: 0,Doc_ID,Sentence_ID,Token_ID,Token,POS_Tag,Chunk_Tag,NER_Tag,NER_Tag_ID
1819,9,14,19,Israel,NNP,I-NP,B-LOC,1746
3239,15,16,21,MiG-19,NNP,I-NP,B-MISC,3109
6355,36,3,1,County,NNP,I-NP,B-MISC,6046
8169,46,2,1,Davis,NNP,I-NP,B-MISC,7656
16673,82,0,1,CNB-120,JJ,I-NP,B-MISC,15603
...,...,...,...,...,...,...,...,...
145768,670,2,4,Super,NNP,I-NP,B-MISC,137876
160096,736,4,6,High,NNP,I-NP,B-LOC,151492
162376,744,13,4,Urdu-speaking,NNP,I-VP,B-MISC,153680
169937,781,3,10,MI-17,JJ,I-NP,B-MISC,160944
