In [None]:
import re
import os
from nltk.tokenize import word_tokenize

In [14]:
# Load the raw intervention snippets, one entry per line of the input file.
path = ''
file_name = ''

# Switch into the data directory so the file can be opened by its bare name.
os.chdir(path)
interventions = []
with open(file_name, 'r') as f:
    # Lines are stored exactly as read, i.e. including their trailing newline.
    interventions.extend(f.readlines())

### 1. Extract all text snippets that contain parentheses
- Assume the phrase inside the parentheses is the abbreviation.
- Split a snippet into separate phrases when it contains multiple abbreviations.

In [45]:
import re

# Keep only the snippets that contain at least one "( ... )" group; the text
# inside the parentheses is treated as a candidate abbreviation later on.
acronyms = [
    snippet
    for snippet in interventions
    if re.search(r'\(.*?\)', snippet) is not None
]

In [46]:
# separate if multiple abbreviations appeared in one text snippet
def separate(item):
    """Split a snippet into phrases, each ending at a closing parenthesis.

    The snippet is cut immediately after every ')' and each piece is stripped
    of surrounding whitespace. Any text after the last ')' is discarded, and a
    snippet without ')' yields an empty list.

    The resulting list is also printed before being returned.
    """
    phrases = []
    start = 0
    for pos, char in enumerate(item):
        if char != ')':
            continue
        # Slice from the end of the previous phrase up to and including ')'.
        phrases.append(item[start:pos + 1].strip())
        start = pos + 1
    print(phrases)
    return phrases
    
# Build `acronyms_sep`: snippets with a single "( ... )" group are kept as-is,
# snippets with several groups are replaced by the phrases from `separate`.
#
# The result is accumulated in new lists instead of removing entries from
# `acronyms` while iterating over it (which skips elements), and
# `acronyms_sep` is always defined, even when no snippet needs splitting.
single_phrases = []
split_phrases = []
for item in acronyms:
    if len(re.findall(r'\(.*?\)', item)) > 1:
        # `separate` cuts at every ')', so a piece may lack a "( ... )" group
        # (e.g. a leading "1)"); drop those because the following steps
        # expect every phrase to contain one.
        split_phrases.extend(
            phrase
            for phrase in separate(item)
            if re.search(r'\(.*?\)', phrase) is not None
        )
    else:
        single_phrases.append(item)

# Same ordering as the original expression: untouched snippets first, then
# the phrases obtained by splitting.
acronyms_sep = single_phrases + split_phrases

['hydroxychloroquine ( HCQ )', 'and lopinavir / ritonavir ( LPV / r )']


### 2. Remove unqualified phrases
- If the phrase in the parentheses contains punctuation (e.g. comma, semicolon, period), ignore it.
- If the phrase does not contain any upper-case characters, ignore it.

In [66]:
# Drop phrases whose parenthesised part cannot be an abbreviation.
#
# A filtered copy is built instead of calling `remove` while iterating, which
# skips the element after each removal and could call `remove` twice for a
# phrase failing both checks.
qualified = []
for item in acronyms_sep:
    found = re.search(r'\(.*?\)', item)
    if found is None:
        # No "( ... )" group at all: nothing to treat as an abbreviation.
        continue
    inner = found.group(0)
    # Punctuation inside the parentheses disqualifies the phrase. The class
    # covers period, comma, semicolon, and both ASCII and full-width colon.
    if re.search(r'[.,;:：]', inner) is not None:
        continue
    # An abbreviation must contain at least one upper-case ASCII letter.
    if re.search('[A-Z]', inner) is None:
        continue
    qualified.append(item)
acronyms_sep = qualified

### 3. Functions used to match the abbreviations to full names
- outer_inner(item): extract the text to the left of the parentheses and the text inside the parentheses.
- get_strings(outer, inner): return a list of word lists that are potential full names for the inner abbreviation. A word list is a candidate when the initial of its first word matches the first character of the abbreviation.
- match_1(word_list, inner): for abbreviations without lower-case characters. If the potential full name contains 2 words or fewer, the word initials must match the abbreviation; if it contains more than 2 words, at least 2 consecutive initials must be found in the abbreviation (because some initials may be omitted from the abbreviation).
- match_2(word_list, inner): for abbreviations with lower-case characters. The lower-case characters, together with the upper-case character immediately to their left, must match a word in the potential full name (initial n-character match).

In [101]:
from nltk.tokenize import word_tokenize
# detect the full name
# apply the rules defined from the paper

def outer_inner(item):
    """Split a phrase into the text before '(' and the parenthesised content.

    Returns a tuple ``(outer, inner)``: ``outer`` is everything before the
    first '(' (not stripped) and ``inner`` is the stripped content of the
    first "( ... )" group.

    Raises AttributeError when the phrase contains no "( ... )" group.
    """
    group = re.search(r'\((.*?)\)', item)
    inner = group.group(1).strip()
    # A match guarantees that '(' is present, so the head of the partition is
    # exactly the text preceding the first opening parenthesis.
    outer = item.partition('(')[0]
    return outer, inner

def get_strings(outer, inner):
    """Collect candidate full names for an abbreviation.

    ``outer`` is tokenised with ``word_tokenize``; for every token that starts
    (case-insensitively) with the first character of ``inner``, the list of
    tokens from that position to the end is a candidate.

    Returns a list of token lists, longest candidate first. Returns an empty
    list when ``inner`` is empty.
    """
    if not inner:
        # Nothing to anchor on; avoids IndexError on inner[0].
        return []

    # Escape the first character so that abbreviations starting with a regex
    # metacharacter (e.g. '(', '[', '+', '.') are matched literally instead of
    # raising re.error or matching every token.
    starts_with_target = re.compile(re.escape(inner[0]), re.I)

    word_list = word_tokenize(outer)
    return [
        word_list[index:]
        for index, word in enumerate(word_list)
        if starts_with_target.match(word)
    ]

def match_1(word_list, inner):
    """Check a candidate full name against an all-upper-case abbreviation.

    * With at most two words, the initial of each word must equal the
      character of ``inner`` at the same position (case-insensitive). A
      candidate with more words than ``inner`` has characters cannot match.
    * With more than two words, it is enough that the initials of some pair
      of adjacent words appear next to each other in ``inner``
      (case-insensitive).

    Returns True or False.
    """
    if len(word_list) <= 2:
        if len(word_list) > len(inner):
            # Previously raised IndexError on inner[index].
            return False
        return all(
            word[:1].lower() == letter.lower()
            for word, letter in zip(word_list, inner)
        )

    # Walk every adjacent pair, including the last one (the earlier
    # range(1, len - 1) stopped one pair short).
    for first, second in zip(word_list, word_list[1:]):
        initials = first[:1] + second[:1]
        if len(initials) < 2:
            continue
        # Escape the initials: tokens may start with regex metacharacters.
        if re.search(re.escape(initials), inner, re.I):
            return True
    return False

def match_2(word_list, inner):
    """Check a candidate full name against an abbreviation with lower case.

    The first run of one upper-case letter followed by one or more lower-case
    letters in ``inner`` is looked up (case-insensitively) in each word of
    ``word_list``.

    Returns True if any word contains that run, False otherwise, including
    when ``inner`` has no such run.
    """
    found = re.search(r'[A-Z][a-z]+', inner)
    if found is None:
        # Previously raised AttributeError on .group(0).
        return False
    target = found.group(0)
    # NOTE(review): this matches the run anywhere inside a word, whereas the
    # notebook text describes an "initial n character match"; confirm whether
    # a prefix match is intended before tightening it.
    return any(re.search(target, word, re.I) for word in word_list)

### 4. Map the abbreviations to full names

In [103]:
# Map every abbreviation to the first candidate full name that qualifies.
ab_mapper = {}
for item in acronyms_sep:
    # Candidate full names are the word lists in front of the parentheses
    # whose first word starts with the first character of the abbreviation.
    outer, inner = outer_inner(item)
    potential = get_strings(outer, inner)
    if not potential:
        continue

    # Abbreviations containing an upper-case letter followed by lower-case
    # letters are checked with match_2, all others with match_1.
    if re.search(r'[A-Z][a-z]+', inner) is None:
        matcher = match_1
    else:
        matcher = match_2

    for word_list in potential:
        # A single-word candidate is accepted without further checks.
        if len(word_list) == 1:
            ab_mapper[inner] = word_list[0]
            break
        if matcher(word_list, inner):
            ab_mapper[inner] = ' '.join(word_list)
            break

### 5. Todos:
- What if an abbreviation contains multiple words? (recursively identify each word...)