## Task description

For each of the .txt files under _genia_cord_19_, generate a protein annotation __.a1__ file in accordance with GENIA's format. Below is an example of how to process the MetaMap output. You should collect entities whose semantic type (STY) is __Amino Acid, Peptide, or Protein__ (note that this is one type, not three different types).

You may want to refer to the `MRCONSO.RRF` file in UMLS to get the mapping from concept ID (CUI) to STY if you cannot get the STY directly from the MetaMap output (note: in a standard UMLS release the CUI-to-semantic-type rows are in `MRSTY.RRF`).

You may also want to try `echo {sentence} | metamap --I` in the terminal to get a brief understanding of the MetaMap output.


In [None]:
import numpy as np
import pandas as pd
from glob import glob
import os
import re
import pickle
import itertools
from collections import defaultdict

In [None]:
def _iter_phrases(output):
    '''Yield every phrase dict of every utterance of every document.'''
    for document in output['AllDocuments']:
        for utterance in document['Document']['Utterances']:
            yield from utterance['Phrases']


def process_output(output):
    '''
    Collect the mapped concept candidates from a parsed MetaMap JSON result.

    Every phrase of every utterance is visited, so text that MetaMap splits
    into several utterances is not truncated to its first one. For each
    phrase only the first mapping is used; phrases without any mapping are
    skipped.

    args:
        output: dict, output from metamap
    returns:
        candidates: list of dict, one per mapping candidate, with keys
            'CUI' (the candidate's CUI), 'StartPos' (int, start position of
            the candidate's first ConceptPIs entry) and 'CandidatePreferred'
            (the candidate's preferred name)
    '''
    # stores all the mapped entities
    candidates = []
    for phrase in _iter_phrases(output):
        if not phrase['Mappings']:
            continue
        # get the first mapping
        mapping = phrase['Mappings'][0]

        for candidate in mapping['MappingCandidates']:
            candidates.append(
                {
                    'CUI': candidate['CandidateCUI'],
                    'StartPos': int(candidate['ConceptPIs'][0]['StartPos']),
                    'CandidatePreferred': candidate['CandidatePreferred'],
                }
            )

    return candidates


In [None]:
import subprocess
import json
def get_metamap_output(sentence):
    '''
    Run MetaMap on a sentence and return its mapped concept candidates.

    args:
        sentence: str, text to annotate
    returns:
        the result of process_output() applied to MetaMap's JSON output, or
        None when MetaMap could not be started, printed fewer than two
        lines, or its second output line could not be parsed/processed
    '''
    try:
        # Feed the text on stdin rather than interpolating it into a shell
        # command: parentheses, quotes, semicolons, etc. in the sentence would
        # otherwise be interpreted by the shell, and unquoted word splitting
        # would collapse whitespace and shift the reported character offsets.
        # The trailing newline mirrors what `echo` appended.
        completed = subprocess.run(
            ["metamap", "--JSONn", "--I"],
            input=f"{sentence}\n",
            stdout=subprocess.PIPE,
            encoding="utf-8",
        )
    except OSError:
        # metamap executable missing or not runnable: no result
        return None

    lines = completed.stdout.split('\n')

    # no mapped entities
    if len(lines) < 2:
        return None
    try:
        # the JSON document is expected on the second output line
        return process_output(json.loads(lines[1]))
    except (ValueError, KeyError, IndexError, TypeError):
        # malformed JSON (JSONDecodeError is a ValueError) or a result that
        # does not have the structure process_output() expects
        return None


    