In [8]:
import re
import json
from codelin.models.const_tree import C_Tree
from codelin.encs.enc_const import *
from codelin.utils.constants import *
from nltk import Tree

def entities_to_tree(words, tags, entities):
    '''
    Converts a list of entities into a tree
    '''
    t = C_Tree("ROOT")
    for i in range(len(words)):
        w = words[i]
        p = tags[i]
        e = entities[i]

        terminal_tree = C_Tree(p, C_Tree(w))

        # No Entity
        if e == "NONE":
            t.add_child(terminal_tree)
        
        # Has entity
        else:
            if type(e) is list:
                # decend through the rightmost branch of the tree
                # until we find a branch with different entity name
                # or we reach the end of the tree
                cl = t
                idx = 0
                while cl.children and cl.children[-1].label == e[idx]:
                    cl = cl.children[-1]
                    idx += 1

                # add the rest of the entities as children
                t1 = C_Tree(e[idx])
                cl.add_child(t1)
                cl = t1
                for j in range(idx+1, len(e)):
                    t1 = C_Tree(e[j])
                    cl.add_child(t1)
                    cl = t1
                
                cl.add_child(terminal_tree)
            
            # Single Entity
            else:
                # if rightmost branch of the tree has the same entity
                # add it as a child
                if t.children and t.children[-1].label == e:
                    t.children[-1].add_child(terminal_tree)
                # otherwise add a new branch
                else:
                    t.add_child(C_Tree(e, terminal_tree))
    return t

### NNE

In NNE the sentences are formated as follows

$w_1, w_2, w_3,$ (...) $w_n$<br>
$p_1, p_2, p_3,$ (...) $p_n$<br>
$se_1,ee_1 ne_1|se_2,ee_2 ne_2|$ (...) $|se_n,ee_n ne_n$

where $w_i$ are the words of the sentence, $p_i$ are the postags of the sentence, $se_i$ is the start of the entity, $ee_i$ is the end of the entity and $ne_i$ is the name of the entity.

In [2]:
path = "/home/droca1/Treebanks/nested_ner/NNE/"
files = ['train.txt', 'dev.txt', 'test.txt']

def parse_entities(e, l):
    '''
    Parse entitties from NNE and returns 
    a list of entities for each word in the sentence 
    or NONE if there is no entity
    '''
    es = e.split("|")
    entities = ["NONE"]*l
    for e in es:
        idxs, name = e.split(" ")
        name = name.rstrip()
        start, end = idxs.split(",")
        entity_range = range(int(start), int(end)+1)
        
        for j in entity_range:
            # add the index to differentiate between entities with the same name
            e_r = (name, len(entity_range))
            if "NONE" in entities[j]:
                entities[j] = e_r
            else:
                entities[j] = [entities[j], e_r] if type(entities[j]) is not list else [*entities[j], e_r]

                # List should be sorted by taking first entities with 'greater' range
                entities[j] = sorted(entities[j], key=lambda x: x[1], reverse=True)
    
    # return only the entity name
    for i in range(len(entities)):
        if type(entities[i]) is list:
            entities[i] = [e[0] for e in entities[i]]
        elif type(entities[i]) is tuple:
            entities[i] = entities[i][0]

    return entities
        

# Load the data into a list of sentences
sentences = []
tags      = []
entities  = []
train = path+files[0]
dev = path+files[1]
test = path+files[2]

data = open(dev,'r').read().split('\n\n')[:-1]
words, postags, entities = [], [], []

for s in data:
    # some lines are empty
    if s[0] == '\n':
        s=s[1:]
    
    lines = s.split('\n')
    w, p, e = lines[:3] + [None]*(3-len(lines))
    w = w.split(' ')
    p = p.split(' ')
    
    words.append(w)
    postags.append(p)
    entities.append(parse_entities(e, len(w)) if e else ["NONE"]*len(w))


s_idx = 5
cs, ct, ce = words[s_idx], postags[s_idx], entities[s_idx]

print("\nEncoding NER data using Tree to labels...\n\n")
print(f"{'WORDS':<15} {'POSTAGS':<10} {str('ENTITIES'):<100}")
print("-"*70)
for i in range(len(cs)):
    # format string as 10 characters for cs[i], 10 characters for ct[i] and the rest for ce[i]
    print(f"{cs[i]:<15} {ct[i]:<10} {str(ce[i]):<100}")

# Convert entities to trees
t = entities_to_tree(cs, ct, ce)
print("\nEntity Tree:\n")
Tree.fromstring(str(t)).pretty_print()

# Encode tree using CoDeLin
encoder = C_NaiveAbsoluteEncoding(separator="_", unary_joiner="+", reverse=False, binary=False, binary_direction=None, binary_marker="'b")
lnt = encoder.encode(t)
print("Linearized Tree:\n")
print(f"{'WORDS':<15} {'POSTAGS':<10} {str('LABELS'):<100}")
print("-"*70)
for w,p,f,l in lnt.iterrows():
    print(f"{str(w):<15} {str(p):<10} {str(l):<10}")

# Decode tree using CoDeLin
dnt = encoder.decode(lnt)
dnt = dnt.postprocess_tree(conflict_strat=C_STRAT_MAX, clean_nulls=True,)
print("Decoded Tree:\n")
Tree.fromstring(str(dnt)).pretty_print()


Encoding NER data using Tree to labels...


WORDS           POSTAGS    ENTITIES                                                                                            
----------------------------------------------------------------------
BELL            NNP        ['ORGCORP', 'NAME']                                                                                 
INDUSTRIES      NNP        ORGCORP                                                                                             
Inc.            NNP        ['ORGCORP', 'CORPJARGON']                                                                           
increased       VBD        NONE                                                                                                
its             PRP$       NONE                                                                                                
quarterly       NN         PERIODIC                                                                                 

### GENIA

In GENIA dataset the sentences are formatted as a json file as follows:
```
{
    "tokens": ["Expression", "of", "c-fos", ",", "c-jun", "and", "jun", "B", "in", "peripheral", "blood", "lymphocytes", "from", "young", "and", "elderly", "adults", "."], 
    "doc_id": "MEDLINE:93061407", "sent_id": "MEDLINE:93061407-0", 
    "entity_mentions": [
        {"start": 2, "end": 3, "entity_type": "DNA", "text": "c-fos"}, 
        {"start": 4, "end": 5, "entity_type": "DNA", "text": "c-jun"}, 
        {"start": 6, "end": 8, "entity_type": "DNA", "text": "jun B"}]
}
```

In [11]:
path = "/home/droca1/Treebanks/nested_ner/GENIA/ner-json/"
files = ["train.jsonlines", "dev.jsonlines", "test.jsonlines"]

words   = []
entities    = []
postags     = []    
train       = path+files[0]
dev         = path+files[1]
test        = path+files[2]

for line in open(dev,'r').readlines():
    s = json.loads(line)
    words.append(s['tokens'])
    postags.append(["NONE"]*len(s['tokens']))
    
    ## ESTO ES UN MIERDON
    ## DEBERIAMOS REUTILIZAR EL ANTERIOR

    current_entities = ["NONE"]*len(s['tokens'])
    for entity in s['entity_mentions']:
        entity_range = range(entity['start'], entity['end']+1)
        for j in entity_range:
            e_r = (entity['entity_type'], len(entity_range))
            if "NONE" in current_entities[j]:
                current_entities[j] = e_r
            else:
                current_entities[j] = [current_entities[j], e_r] if type(current_entities[j]) is not list else [*current_entities[j], e_r]
                # List should be sorted by taking first entities with 'greater' range
                current_entities[j] = sorted(current_entities[j], key=lambda x: x[1], reverse=True)
    
        # return only the entity name
        for i in range(len(entities)):
            if type(entities[i]) is list:
                entities[i] = [e[0] for e in entities[i]]
            elif type(entities[i]) is tuple:
                entities[i] = entities[i][0]
    

    entities.append(current_entities)


# print sample of data
s_idx = 1
cs, ct, ce = words[s_idx], postags[s_idx], entities[s_idx]

print("\nEncoding NER data from",s_idx,"using Tree to labels...\n\n")
print(f"{'WORDS':<20} {'POSTAGS':<10} {str('ENTITIES'):<100}")
print("-"*70)
for i in range(len(cs)):
    print(f"{cs[i]:<20} {ct[i]:<10} {str(ce[i]):<100}")

# Convert entities to trees
t = entities_to_tree(cs, ct, ce)
print("\nEntity Tree (max_depth =",t.depth(),"):\n")
Tree.fromstring(str(t)).pretty_print()

# Encode tree using CoDeLin
encoder = C_NaiveAbsoluteEncoding(separator="_", unary_joiner="+", reverse=False, binary=False, binary_direction=None, binary_marker="'b")
lnt = encoder.encode(t)
print("Linearized Tree:\n")
print(f"{'WORDS':<20} {'POSTAGS':<10} {str('LABELS'):<100}")
print("-"*70)
for w,p,f,l in lnt.iterrows():
    print(f"{str(w):<20} {str(p):<10} {str(l):<10}")

# Decode tree using CoDeLin
dnt = encoder.decode(lnt)
dnt = dnt.postprocess_tree(conflict_strat=C_STRAT_MAX, clean_nulls=True,)
print("Decoded Tree:\n")
Tree.fromstring(str(dnt)).pretty_print()



# for s_idx in range(0,200):
#     cs, ct, ce = words[s_idx], postags[s_idx], entities[s_idx]

#     print("\nEncoding NER data from",s_idx,"using Tree to labels...\n\n")
#     print(f"{'WORDS':<20} {'POSTAGS':<10} {str('ENTITIES'):<100}")
#     print("-"*70)
#     for i in range(len(cs)):
#         print(f"{cs[i]:<20} {ct[i]:<10} {str(ce[i]):<100}")

#     # Convert entities to trees
#     t = entities_to_tree(cs, ct, ce)
#     print("\nEntity Tree (max_depth =",t.depth(),"):\n")
#     Tree.fromstring(str(t)).pretty_print()

#     # Encode tree using CoDeLin
#     encoder = C_NaiveAbsoluteEncoding(separator="_", unary_joiner="+", reverse=False, binary=False, binary_direction=None, binary_marker="'b")
#     lnt = encoder.encode(t)
#     print("Linearized Tree:\n")
#     print(f"{'WORDS':<20} {'POSTAGS':<10} {str('LABELS'):<100}")
#     print("-"*70)
#     for w,p,f,l in lnt.iterrows():
#         print(f"{str(w):<20} {str(p):<10} {str(l):<10}")

#     # Decode tree using CoDeLin
#     dnt = encoder.decode(lnt)
#     dnt = dnt.postprocess_tree(conflict_strat=C_STRAT_MAX, clean_nulls=True,)
#     print("Decoded Tree:\n")
#     Tree.fromstring(str(dnt)).pretty_print()

IndexError: list index out of range

###