In [1]:
import spacy
import json
from spacy import displacy


tokenizer = spacy.load('en',disable=['parser','ner'])
parser = spacy.load('en',disable=['tokenizer'])
all_pipeline = spacy.load('en')

In [3]:
import time

def json_to_spacy(json_,tokenizer=False,parser=False):
    global processing_time
    json_ = json.loads(json_)
    text = json_['text']
    
    if not tokenizer:
        tokenizer = spacy.load('en',disable=['parser','ner'])
        
    if not parser:
        parser = spacy.load('en',disable=['tagger'])
    
    if not 'denotations' in json_:
        doc = tokenizer(json_['text'])
        
        start = time.clock() 
        
        for name, proc in parser.pipeline:
            doc = proc(doc)
        elapsed = time.clock()
        elapsed = elapsed - start
        processing_time += elapsed
        return all_pipeline(json_['text'])
    denotations = []
    # start, end, length, id
    for denotation in json_['denotations']:
        id_ = denotation['id']
        begin = denotation['span']['begin']
        end = denotation['span']['end']
        length = end - begin
        obj = denotation['obj']
        denotations.append({'id':id_,'begin':begin,'end':end,'length':length, 'obj':obj})
    
    denotations_begins = {}
    for denotation in denotations:
        if denotation['begin'] not in denotations_begins:
            denotations_begins[denotation['begin']] = []
        denotations_begins[denotation['begin']].append(denotation)
        
    longest_denotations = []
    current = -1
    for key, denotations in denotations_begins.items():
        if denotations[0]['begin'] > current:
            longest_denotation = max(denotations, key = lambda i: i['length'])
            longest_denotations.append(longest_denotation)
            current = longest_denotation['end']
        
    tokens = []
    tokens_ws = []
    current_denotation = 0
    advancement = 0
    endgame = False
    for token in tokenizer(text):
        if token.idx < advancement:
            continue
        
        if token.idx + len(token.text_with_ws) <= longest_denotations[current_denotation]['begin'] or endgame:
            tokens.append(token.text)
            tokens_ws.append(len(token.text) != len(token.text_with_ws))
            
        else:
            begin = longest_denotations[current_denotation]['begin']
            end = longest_denotations[current_denotation]['end']
            token_text = text[begin:end]
            tokens.append(token_text)
            tokens_ws.append(text[end:end+1] == ' ')
            
            advancement = end
                            
            if current_denotation + 1 < len(longest_denotations):
                current_denotation += 1
            else:
                endgame = True
                
    doc = spacy.tokens.Doc(tokenizer.vocab, words=tokens,
          spaces=tokens_ws)
    
    start = time.clock() 
    
    start = time.clock()
    for name, proc in parser.pipeline:
        doc = proc(doc)
    elapsed = time.clock()
    elapsed = elapsed - start
    processing_time += elapsed
    
    return doc

def spacy_to_json(doc,text=False,annotations=False):
    """Given a spaCy doc object, produce PubAnnotate JSON, that can be read by TextAE
       If original text is provided, original positions will be computed.
       If other annotations are provided, they will be included."""
    
    pre_json = { "text" : text }
    pre_json["denotations"] = list()
    pre_json["relations"] = list()
    
    current_position = 0
    for token in doc:
        token_dict = dict()
        token_dict["id"] = "T{}".format(token.i)
        
        if text:
            position = text[current_position:].find(token.text)
            
            # token not found
            if position == -1:
                verbose("Token {} could not be found when realigning spaCy with original text.".format(token.text))
                verbose("The following text except was searched: {} (starting from position {})".format(text[current_position:],current_position))
                continue
                
            token_dict["span"] = { "begin" : current_position + position , "end" : current_position + position + len(token.text)}
            current_position += position + len(token.text)
        else:
            token_dict["span"] = { "begin" : token.idx , "end" : token.idx + len(token)}
            
        token_dict["obj"] = token.tag_
        pre_json["denotations"].append(token_dict)
    
        relation_dict = dict()
        relation_dict["id"] = "R{}".format(token.i)
        relation_dict["subj"] = "T{}".format(token.i)
        relation_dict["obj"] = "T{}".format(token.head.i)
        relation_dict["pred"] = token.dep_
        pre_json["relations"].append(relation_dict)
        
    if annotations:
        annos = json.loads(annotations)
        if 'denotations' in annos:
            deno_ids = set([deno['id'] for deno in pre_json['denotations']])
            new_deno_ids = set([deno['id'] for deno in annos['denotations']])
            if deno_ids.intersection(new_deno_ids): 
                for denotation in annos['denotations']:
                    denotation['id'] = denotation['id'] + '*'
                
                if 'relations' in annos:
                    for relation in annos['relations']:
                        try:
                            relation['id'] = relation['id'] + '*'
                            relation['obj'] = relation['obj'] + '*'
                            relation['subj'] = relation['obj'] + '*'
                        except Exception as e:
                            raise(e)
                        
            pre_json["denotations"].extend(annos['denotations'])
    
                
                
        if 'relations' in annos:
            rel_ids = set([rel['id'] for rel in pre_json['relations']])
            new_rel_ids = set([rel['id'] for rel in annos['relations']])
            if rel_ids.intersection(new_rel_ids): 
                for relation in annos['relations']:
                    relation['id'] = relation['id'] + '*'
            pre_json["relations"].extend(annos['relations'])
        if 'text' in annos and pre_json['text'] == False:
            pre_json['text'] = annos['text']
            
    return(json.dumps(pre_json,sort_keys=True))

def display_stuff():
        # sans ner
    #doc1 = json.loads(json_original)
    #doc1 = json.dumps({'text':doc1['text']})
    #doc1 = json_to_spacy(doc1,tokenizer=tokenizer,parser=parser)
    # doc1 = spacy_to_json(doc1)
    #displacy.render(doc1, style='dep',jupyter=True)
    #svg1 = displacy.render(doc1, style='dep',jupyter=False)
    #open('data/compare/' + fn + '_sans.svg','w', encoding='utf-8').write(svg1)

    # avec ner
    # doc2 = json_to_spacy(json_original,tokenizer=tokenizer,parser=parser)
    # doc2 = spacy_to_json(doc2,annotations=json_original)
    # displacy.render(doc2, style='dep',jupyter=True)
    #svg2 = displacy.render(doc2, style='dep',jupyter=False)
    #open('data/compare/' + fn + '_avec.svg','w', encoding='utf-8').write(svg2)


    # doc = all_pipeline('Complex formation of platelet membrane glycoproteins IIb and IIIa with the fibrinogen D domain.')
    # displacy.render(doc, style='dep',jupyter=True)
    # svg1 = displacy.render(doc, style='dep',jupyter=False)
    # displacy.render(doc2, style='dep',jupyter=True)
    # svg2 = displacy.render(doc2, style='dep',jupyter=False)
    # json_ = spacy_to_json(doc2,annotations=json_original)

    # open('parse_with_ner.svg','w', encoding='utf-8').write(svg2)
    i=0

counter = 0
processing_time = 0
import glob    
for fn in glob.glob('data/abstracts_oger_1000/*'):
    with open(fn) as f:
        try:
            d = f.read()
            d = json.loads(d)
            d = str(json.dumps({'text':d['text']}))
        
            d = json_to_spacy(d,tokenizer=tokenizer,parser=parser)
            counter += 1
        except:
            print(fn)
        
    if counter%100 == 0:
        print(counter)
print('sans: ' + str(processing_time))

processing_time = 0
for fn in glob.glob('data/abstracts_oger_1000/*'):
    with open(fn) as f:
        d = f.read()
        try:
            d = json_to_spacy(d,tokenizer=tokenizer,parser=parser)
            counter += 1
        except:
            print(fn)
        
    if counter%100 == 0:
        print(counter)
print('avec: ' + str(processing_time))
    






100
200
300
400
500
600
700
800
900
1000
sans: 93.09315700000172




1100
data/abstracts_oger_1000/24129038.json
1200
1300
data/abstracts_oger_1000/24471862.json
data/abstracts_oger_1000/24131785.json
1400
data/abstracts_oger_1000/24433237.json
1500
1600
1700
data/abstracts_oger_1000/23716200.json
data/abstracts_oger_1000/24534617.json
data/abstracts_oger_1000/24508241.json
1800
data/abstracts_oger_1000/24530402.json
data/abstracts_oger_1000/24406330.json
1900
data/abstracts_oger_1000/23582911.json
data/abstracts_oger_1000/23792325.json
avec: 101.75916700000039
