In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("./../data/AnonymizedClinicalAbbreviationsAndAcronymsDataSet.txt", 
                   encoding='cp1252', 
                   sep="|", 
                   header=None,
                   na_filter=False)

In [3]:
data.columns = ["abbrev", "sense", "represntaion", "start_pos", "end_pos", "section_info", "sample"]

In [4]:
data.head(10)

Unnamed: 0,abbrev,sense,represntaion,start_pos,end_pos,section_info,sample
0,AB,abortion,AB.,231,233,,_%#NAME#%_ _%#NAME#%_ is a 29-year-old gravida...
1,AB,abortion,AB.,249,251,,She is now bleeding quite heavily. Ultrasound ...
2,AB,abortion,AB,223,224,PAST OB HISTORY,ALLERGIES: Heparin and Imitrex. PAST OB HISTOR...
3,AB,abortion,AB.,194,196,HISTORY OF THE PRESENT ILLNESS,She had a pelvic ultrasound at Park Nicollet o...
4,AB,abortion,AB,114,115,PAST OB-GYN HISTORY,"On _%#MMDD2007#%_, normal anatomy with anterio..."
5,AB,ankle-brachial,AB,329,330,SIGNIFICANT FINDINGS,7. Laryngospasm. CONSULTANTS: 1. Nephrology. 2...
6,AB,abortion,AB,98,99,HISTORY OF PRESENT ILLNESS,HISTORY OF PRESENT ILLNESS: _%#NAME#%_ _%#NAME...
7,AB,blood group in ABO system,AB,292,293,PATIENT IDENTIFICATION,PATIENT IDENTIFICATION: _%#NAME#%_ _%#NAME#%_ ...
8,AB,abortion,AB,236,237,PAST MEDICAL HISTORY,PAST MEDICAL HISTORY: None except car accident...
9,AB,abortion,AB,65,66,,_%#NAME#%_ _%#NAME#%_ is a 25-year-old female ...


In [5]:
unique_abbrev = np.unique(data.abbrev)

In [6]:
empty_dict = dict.fromkeys(['abbrev','number_sense'])

In [7]:
abbrev_freq = pd.DataFrame(columns=["abbrev", "number_sense"])
#abbrev_freq_dict = dict.fromkeys(['abbrev','number_sense'])

count_list = []
for abbrev in unique_abbrev:
    piece = data.loc[data.abbrev == abbrev]
    count = len(np.unique(piece.sense))
    count_list.append(count)
    
abbrev_freq = pd.DataFrame({"abbrev" : unique_abbrev, "number_sense" : count_list})

In [8]:
abbrev_freq

Unnamed: 0,abbrev,number_sense
0,AB,12
1,AC,11
2,ALD,5
3,AMA,3
4,ASA,3
...,...,...
70,T3,6
71,T4,3
72,US,4
73,VAD,5


In [9]:
# feature extraction
# if select AC

In [10]:
from nltk.tokenize import word_tokenize, sent_tokenize
import string

In [11]:
def derive_features(abbrev, window_size):
    samples = data.loc[data.abbrev == abbrev, ]
    sample_num = 1
    
    original_features = pd.DataFrame(columns=["id", "features", "sense"])
    direction_features = pd.DataFrame(columns=["id", "features", "sense"])
    direction_num_features = pd.DataFrame(columns=["id", "features", "sense"])
    
    for i in range(samples.shape[0]):  # for each data point
        sentence_num = 1
        #target_word = samples.iloc[i, 2]  # should avoid using this, as word tokenization would split AC. to AC ., etc.
        target_word = abbrev
        
        text = samples.iloc[i, 6]
        id = i+1  # this is the row number of selected abbreviation
        sense = samples.iloc[i, 1]
        
        # sentence boundary
        # one sample can have multiple abbreviations in different sentences.
        sents = sent_tokenize(text)
        exclude = set(string.punctuation)
        #s = ''.join(ch for ch in s if ch not in exclude)
        for sent in sents:  # for each sentence
            #words = [token.lower() for token in word_tokenize(sent)] # word tokenization
            # remove punctuations from the words list
            words = [word for word in word_tokenize(sent) if word not in string.punctuation]
            #print(words)
            if target_word in words:
                #print(words)
                left_features = []
                right_features = []
                
                left_features_direction = []
                right_features_direction = []
                
                left_features_direction_num = []
                right_features_direction_num = []
                
                index = words.index(target_word)
                # find the targeted word
                # 1. See if the window-size exceeds the front and back limit
                    # If yes, start from the zero-th element, towards right till find the target (features on the left)
                            # start from the max-th element, towards left till find the target (features on the right)
                    # If no, start from the (index - 5)-th element, towards right till find the target
                           # start from the (index - 5)-th element, towards left till find the target
                        
                # extract features on the left
                if index - window_size < 0:
                    j = 0
                    starting = index
                    while words[j] != target_word:
                        
                        words[j] = ''.join(ch for ch in words[j] if ch not in exclude)
                        # remove punctuation from the word, meant to fix problem in word tokenization
                        # but may cause problem, e.g. p.r.n --> prn
                        # may remove later
                        
                        left_features.append(words[j].lower())
                        left_features_direction.append("L-" + words[j].lower())
                        left_features_direction_num.append("L" + str(starting) + "-" + words[j].lower())
                        j += 1
                        starting -= 1
                else:
                    j = index-window_size
                    starting = 0
                    for k in range(window_size):
                        words[j] = ''.join(ch for ch in words[j] if ch not in exclude)
                        left_features.append(words[j].lower())
                        left_features_direction.append("L-" + words[j].lower())
                        left_features_direction_num.append("L" + str(window_size-starting) + "-" + words[j].lower())
                        j += 1
                        starting += 1
                
                # extract feature on the right
                if index + window_size >= len(words):
                    #j = len(words)-1
                    j = index+1
                    starting = 1
                    while j != len(words):
                        words[j] = ''.join(ch for ch in words[j] if ch not in exclude)
                        right_features.append(words[j].lower())
                        right_features_direction.append("R-" + words[j].lower())
                        right_features_direction_num.append("R" + str(starting) + "-" + words[j].lower())
                        j += 1
                        starting += 1
                else: 
                    j = index+1
                    starting = 1
                    for k in range(window_size):
                        words[j] = ''.join(ch for ch in words[j] if ch not in exclude)
                        right_features.append(words[j].lower())
                        right_features_direction.append("R-" + words[j].lower())
                        right_features_direction_num.append("R" + str(starting) + "-" + words[j].lower())
                        j += 1
                        starting += 1
                   
                left_features_str = " ".join(left_features)
                right_features_str = " ".join(right_features)
                features = left_features_str + " " + right_features_str
                
                left_features_direction_str = " ".join(left_features_direction)
                right_features_direction_str = " ".join(right_features_direction)
                features_direction = left_features_direction_str + " " + right_features_direction_str
                
                left_features_direction_num_str = " ".join(left_features_direction_num)
                right_features_direction_num_str = " ".join(right_features_direction_num)
                features_direction_num = left_features_direction_num_str + " " + right_features_direction_num_str
                #print(features_direction_num)
                
                original_features = pd.concat([original_features, pd.DataFrame({"id" : [id],
                                                                               "features" : [features], 
                                                                               "sense" : [sense]})])
                
                direction_features = pd.concat([direction_features, pd.DataFrame({"id" : [id],
                                                                               "features" : [features_direction], 
                                                                               "sense" : [sense]})])
                
                direction_num_features = pd.concat([direction_num_features, pd.DataFrame({"id" : [id],
                                                                               "features" : [features_direction_num], 
                                                                               "sense" : [sense]})])
                
                
                #print(words)
                #print("--------------------")
                #print(left_features, right_features)
                #print(left_features_direction, right_features_direction)
                #print(left_features_direction_num, right_features_direction_num)
                #print("+++++++++++++++++++")
            sentence_num += 1
        sample_num += 1
    return([original_features, direction_features, direction_num_features])
                

In [12]:
a, b, c = derive_features('AC', 6)

In [13]:
b

Unnamed: 0,id,features,sense
0,1,L-required L-a L-fair L-bit L-of L-pepcid R-in...,(drug) AC
0,2,L-was L-degenerative L-changes L-seen L-in L-t...,acromioclavicular
0,3,L-patient L-will L-be L-a L-candidate L-for R-...,adriamycin cyclophosphamide
0,4,L-three L-years L-ago L-when L-she L-had,adriamycin cyclophosphamide
0,5,L-diskus L-5500 L-1 L-puff L-bid L-cheratussin...,(drug) AC
...,...,...,...
0,496,L-mastectomy L-the L-patient L-underwent L-che...,adriamycin cyclophosphamide
0,497,L-did L-report L-however L-receiving L-chemoth...,adriamycin cyclophosphamide
0,498,L-allergies L-renal L-dysfunction L-with L-use...,angiotensin-converting enzyme:ACE
0,499,L-bicep L-tendon L-no L-pain L-over L-the R-jo...,acromioclavicular
