In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
data = pd.read_csv("./../data/AnonymizedClinicalAbbreviationsAndAcronymsDataSet.txt", 
                   encoding='cp1252', 
                   sep="|", 
                   header=None,
                   na_filter=False)

In [3]:
data.columns = ["abbrev", "sense", "represntaion", "start_pos", "end_pos", "section_info", "sample"]

In [4]:
data.head(10)

Unnamed: 0,abbrev,sense,represntaion,start_pos,end_pos,section_info,sample
0,AB,abortion,AB.,231,233,,_%#NAME#%_ _%#NAME#%_ is a 29-year-old gravida...
1,AB,abortion,AB.,249,251,,She is now bleeding quite heavily. Ultrasound ...
2,AB,abortion,AB,223,224,PAST OB HISTORY,ALLERGIES: Heparin and Imitrex. PAST OB HISTOR...
3,AB,abortion,AB.,194,196,HISTORY OF THE PRESENT ILLNESS,She had a pelvic ultrasound at Park Nicollet o...
4,AB,abortion,AB,114,115,PAST OB-GYN HISTORY,"On _%#MMDD2007#%_, normal anatomy with anterio..."
5,AB,ankle-brachial,AB,329,330,SIGNIFICANT FINDINGS,7. Laryngospasm. CONSULTANTS: 1. Nephrology. 2...
6,AB,abortion,AB,98,99,HISTORY OF PRESENT ILLNESS,HISTORY OF PRESENT ILLNESS: _%#NAME#%_ _%#NAME...
7,AB,blood group in ABO system,AB,292,293,PATIENT IDENTIFICATION,PATIENT IDENTIFICATION: _%#NAME#%_ _%#NAME#%_ ...
8,AB,abortion,AB,236,237,PAST MEDICAL HISTORY,PAST MEDICAL HISTORY: None except car accident...
9,AB,abortion,AB,65,66,,_%#NAME#%_ _%#NAME#%_ is a 25-year-old female ...


In [5]:
unique_abbrev = np.unique(data.abbrev)

In [6]:
empty_dict = dict.fromkeys(['abbrev','number_sense'])

In [7]:
abbrev_freq = pd.DataFrame(columns=["abbrev", "number_sense", "sense", "freq", "percentage"])
#abbrev_freq_dict = dict.fromkeys(['abbrev','number_sense'])

count_list = []
for abbrev in unique_abbrev:
    piece = data.loc[data.abbrev == abbrev]
    senses = np.unique(piece.sense)
    count = len(np.unique(piece.sense))
    for sense in senses:
        count_sense = piece.loc[piece.sense == sense].shape[0]
        percentage = count_sense/piece.shape[0]
        new = pd.DataFrame({"abbrev" : [abbrev],
                            "number_sense" : [count],
                            "sense" : [sense],
                            "freq" : [count_sense],
                            "percentage" : [percentage]})
        
        abbrev_freq = pd.concat([abbrev_freq, new])
    #count_list.append(count)
    
#abbrev_freq = pd.DataFrame({"abbrev" : unique_abbrev, "number_sense" : count_list})

In [8]:
np.unique(abbrev_freq.abbrev)

array(['AB', 'AC', 'ALD', 'AMA', 'ASA', 'AV', 'AVR', 'BAL', 'BK', 'BM',
       'BMP', 'C&S', 'C3', 'C4', 'CA', 'CDI', 'CEA', 'CR', 'CTA', 'CVA',
       'CVP', 'CVS', 'DC', 'DIP', 'DM', 'DT', 'EC', 'ER', 'ES', 'ET',
       'FISH', 'FSH', 'GT', 'IA', 'IB', 'IM', 'IR', 'IT', 'ITP', 'IVF',
       'LA', 'LE', 'MOM', 'MP', 'MR', 'MS', 'MSSA', 'NA', 'NAD', 'NP',
       'OP', 'OR', 'OTC', 'PA', 'PAC', 'PCP', 'PD', 'PDA', 'PE', 'PM',
       'PR', 'PT', 'RA', 'RT', 'SA', 'SBP', 'SMA', 'SS', 'T1', 'T2', 'T3',
       'T4', 'US', 'VAD', 'VBG'], dtype=object)

In [68]:
abbrev_freq.loc[abbrev_freq.abbrev == "AMA"]

Unnamed: 0,abbrev,number_sense,sense,freq,percentage
0,AMA,3,advanced maternal age,31,0.062
0,AMA,3,against medical advice,444,0.888
0,AMA,3,antimitochondrial antibody,25,0.05


In [10]:
# feature extraction
# if select AC

In [11]:
from nltk.tokenize import word_tokenize, sent_tokenize
import string

In [24]:
def derive_features(abbrev, window_size, replace = True):  # replace = True, replace numbers with zero
    samples = data.loc[data.abbrev == abbrev, ]
    sample_num = 1
    
    original_features = pd.DataFrame(columns=["id", "features", "sense"])
    direction_features = pd.DataFrame(columns=["id", "features", "sense"])
    direction_num_features = pd.DataFrame(columns=["id", "features", "sense"])
    
    for i in range(samples.shape[0]):  # for each data point
        sentence_num = 1
        #target_word = samples.iloc[i, 2]  # should avoid using this, as word tokenization would split AC. to AC ., etc.
        #target_word = abbrev
        
        text = samples.iloc[i, 6]
        id = i+1  # this is the row number of selected abbreviation
        sense = samples.iloc[i, 1]
        
        target_word = samples.iloc[i, 2]
        sentence_length = 0
        start_pos = int(samples.iloc[i, 3])
        end_pos = int(samples.iloc[i, 4])
        
        target_word = text[start_pos : end_pos+1]
        detect = True
        #print(target_word)
        # sentence boundary
        # one sample can have multiple abbreviations in different sentences.
        # before sentence boundary, replace all the numbers with 0 
        #print(text)
        
        
        #if replace:
        #    text = re.sub("\d+", "0", text)
        #    text = re.sub("\d+\.\d+", "0", text)

        sents = sent_tokenize(text)
        exclude = set(string.punctuation)
        #s = ''.join(ch for ch in s if ch not in exclude)
        for sent in sents:  # for each sentence
            #words = [token.lower() for token in word_tokenize(sent)] # word tokenization
            # remove punctuations from the words list
            sentence_length += (len(sent)+1)
            words = [word for word in word_tokenize(sent) if word not in string.punctuation]
            
#             print(id, "____________________")
#             print(sent)
#             print(words)
#             print(target_word)
#             print(sentence_length, end_pos)
            if sentence_length >= end_pos and (abbrev in words or target_word in words) and detect:
                #print(words)
                #print(sent)
                detect = False
                left_features = []
                right_features = []
                
                left_features_direction = []
                right_features_direction = []
                
                left_features_direction_num = []
                right_features_direction_num = []
                
                index = words.index(abbrev)
                # find the targeted word
                # 1. See if the window-size exceeds the front and back limit
                    # If yes, start from the zero-th element, towards right till find the target (features on the left)
                            # start from the max-th element, towards left till find the target (features on the right)
                    # If no, start from the (index - 5)-th element, towards right till find the target
                           # start from the (index - 5)-th element, towards left till find the target
                        
                # extract features on the left
                if index - window_size < 0:
                    j = 0
                    starting = index
                    #while words[j] != target_word:
                    while words[j] != abbrev:
                        words[j] = ''.join(ch for ch in words[j] if ch not in exclude)
                        if replace:
                            words[j] = re.sub("\d+", "0", words[j])
                            words[j] = re.sub("\d+\.\d+", "0", words[j])
                        # remove punctuation from the word, meant to fix problem in word tokenization
                        # but may cause problem, e.g. p.r.n --> prn
                        # may remove later
                        
                        left_features.append(words[j].lower())
                        left_features_direction.append("L-" + words[j].lower())
                        left_features_direction_num.append("L" + str(starting) + "-" + words[j].lower())
                        j += 1
                        starting -= 1
                else:
                    j = index-window_size
                    starting = 0
                    for k in range(window_size):
                        words[j] = ''.join(ch for ch in words[j] if ch not in exclude)
                        if replace:
                            words[j] = re.sub("\d+", "0", words[j])
                            words[j] = re.sub("\d+\.\d+", "0", words[j])
                        left_features.append(words[j].lower())
                        left_features_direction.append("L-" + words[j].lower())
                        left_features_direction_num.append("L" + str(window_size-starting) + "-" + words[j].lower())
                        j += 1
                        starting += 1
                
                # extract feature on the right
                if index + window_size >= len(words):
                    #j = len(words)-1
                    j = index+1
                    starting = 1
                    while j != len(words):
                        words[j] = ''.join(ch for ch in words[j] if ch not in exclude)
                        if replace:
                            words[j] = re.sub("\d+", "0", words[j])
                            words[j] = re.sub("\d+\.\d+", "0", words[j])
                        right_features.append(words[j].lower())
                        right_features_direction.append("R-" + words[j].lower())
                        right_features_direction_num.append("R" + str(starting) + "-" + words[j].lower())
                        j += 1
                        starting += 1
                else: 
                    j = index+1
                    starting = 1
                    for k in range(window_size):
                        words[j] = ''.join(ch for ch in words[j] if ch not in exclude)
                        if replace:
                            words[j] = re.sub("\d+", "0", words[j])
                            words[j] = re.sub("\d+\.\d+", "0", words[j])
                        right_features.append(words[j].lower())
                        right_features_direction.append("R-" + words[j].lower())
                        right_features_direction_num.append("R" + str(starting) + "-" + words[j].lower())
                        j += 1
                        starting += 1
                   
                left_features_str = " ".join(left_features)
                right_features_str = " ".join(right_features)
                features = left_features_str + " " + right_features_str
                
#                 print(features)
                
                left_features_direction_str = " ".join(left_features_direction)
                right_features_direction_str = " ".join(right_features_direction)
                features_direction = left_features_direction_str + " " + right_features_direction_str
                
                left_features_direction_num_str = " ".join(left_features_direction_num)
                right_features_direction_num_str = " ".join(right_features_direction_num)
                features_direction_num = left_features_direction_num_str + " " + right_features_direction_num_str
                #print(features_direction_num)
                
#                 if replace:
#                     features = re.sub("\d+", "0", features)
#                     features = re.sub("\d+\.\d+", "0", features)
                    
#                     features_direction = re.sub("\d+", "0", features_direction)
#                     features_direction = re.sub("\d+\.\d+", "0", features_direction)
                    
#                     features_direction_num = features_direction_num+re.sub("\d+", "0", features_direction_num)
#                     features_direction_num = features_direction_num+re.sub("\d+\.\d+", "0", features_direction_num)
#                     print(features_direction_num)
                original_features = pd.concat([original_features, pd.DataFrame({"id" : [id],
                                                                               "features" : [features], 
                                                                               "sense" : [sense]})])
                
                direction_features = pd.concat([direction_features, pd.DataFrame({"id" : [id],
                                                                               "features" : [features_direction], 
                                                                               "sense" : [sense]})])
                
                direction_num_features = pd.concat([direction_num_features, pd.DataFrame({"id" : [id],
                                                                               "features" : [features_direction_num], 
                                                                               "sense" : [sense]})])
            sentence_num += 1
        sample_num += 1
    return([original_features, direction_features, direction_num_features])
                

In [73]:
ar1, br1, cr1 = derive_features('AMA', 5, replace=True)

# ONE-HOT

In [74]:
cr1

Unnamed: 0,id,features,sense
0,1,L5-he L4-was L3-threatening L2-to L1-leave,against medical advice
0,2,L3-the L2-patient L1-left R1-on R2-the R3-morn...,against medical advice
0,3,L4-plan L3-patient L2-has L1-left R1-therefore...,against medical advice
0,4,,antimitochondrial antibody
0,5,L5-discharge L4-medications L3-craig L2-nystro...,against medical advice
...,...,...,...
0,496,L5-door L4-and L3-left L2-the L1-hospital,against medical advice
0,497,L3-discharge L2-plan L1-discharged,against medical advice
0,498,L5-and L4-rather L3-than L2-being L1-discharge...,against medical advice
0,499,L5-upstairs L4-the L3-patient L2-had L1-left,against medical advice


In [54]:
def one_hot_features(abbrev, window_size, replace = True):
    ar1, br1, cr1 = derive_features(abbrev, window_size, replace = True)
    one_hot_vector=[]
    for k in cr1['features'].values:
        features_direction_num= [item for item in k.split(' ') if item !='']
        one_hot_vector=one_hot_vector+features_direction_num
    one_hot_vector=sorted(list(set(one_hot_vector)))
    char_to_int = dict((c, i) for i, c in enumerate(one_hot_vector))
    onehot_encoded = list()
    for k in cr1['features'].values:
        features_direction_num= [item for item in k.split(' ') if item !='']
        integer_encoded = [char_to_int[char] for char in features_direction_num]
        # one hot encode

        letter = [0 for _ in range(len(char_to_int))]
        for value in integer_encoded:

            letter[value] = 1
        onehot_encoded.append(letter)
    onehot_encoded=np.array(onehot_encoded)
    return onehot_encoded,one_hot_vector

In [69]:
fea_5,name_5=one_hot_features('AMA', 5, replace=True)#not use ald as they have number of sense=1
fea_4,name_4=one_hot_features('AMA', 4, replace=True)
fea_3,name_3=one_hot_features('AMA', 3, replace=True)
fea_2,name_2=one_hot_features('AMA', 2, replace=True)
fea_1,name_1=one_hot_features('AMA', 1, replace=True)

In [70]:
fea_5.shape

(500, 1391)

In [71]:
name_1

['L1-0',
 'L1-a',
 'L1-a0',
 'L1-abbott',
 'L1-advice',
 'L1-an',
 'L1-ana',
 'L1-and',
 'L1-as',
 'L1-asma',
 'L1-asthma',
 'L1-by',
 'L1-cbc',
 'L1-center',
 'L1-centers',
 'L1-checking',
 'L1-day',
 'L1-department',
 'L1-detox',
 'L1-discharge',
 'L1-discharged',
 'L1-either',
 'L1-evaluated',
 'L1-facility',
 'L1-for',
 'L1-frequent',
 'L1-from',
 'L1-gerd',
 'L1-go',
 'L1-going',
 'L1-her',
 'L1-his',
 'L1-home',
 'L1-hospital',
 'L1-hospitalization',
 'L1-hours',
 'L1-is',
 'L1-labor',
 'L1-last',
 'L1-leave',
 'L1-leaves',
 'L1-leaving',
 'L1-left',
 'L1-name',
 'L1-negative',
 'L1-of',
 'L1-or',
 'L1-out',
 'L1-patient',
 'L1-point',
 'L1-positive',
 'L1-postdates',
 'L1-pprom',
 'L1-pregnancy',
 'L1-room',
 'L1-section',
 'L1-sign',
 'L1-signed',
 'L1-signing',
 'L1-so',
 'L1-the',
 'L1-them',
 'L1-there',
 'L1-to',
 'L1-twice',
 'L1-unit',
 'L1-unity',
 'L1-up',
 'L1-was',
 'L1-went',
 'R1-',
 'R1-0',
 'R1-a',
 'R1-after',
 'R1-against',
 'R1-although',
 'R1-ana',
 'R1-and',


In [75]:
y=cr1['sense'].values

In [76]:
y.shape

(500,)

In [41]:
#one_hot_vector

# machine learning to decide window size

In [77]:
from sklearn.model_selection import train_test_split

train_X_5, test_X_5, train_y, test_y = train_test_split(fea_5, y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
train_X_4, test_X_4, train_y, test_y = train_test_split(fea_4, y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
train_X_3, test_X_3, train_y, test_y = train_test_split(fea_3, y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
train_X_2, test_X_2, train_y, test_y = train_test_split(fea_2, y, test_size=0.2, random_state=13, shuffle=True, stratify=y)
train_X_1, test_X_1, train_y, test_y = train_test_split(fea_1, y, test_size=0.2, random_state=13, shuffle=True, stratify=y)

In [87]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

def five_fold_CV(clf, params, dx, dy):
    cv_model = RandomizedSearchCV(clf, params, scoring='f1_weighted', n_jobs=-1, 
                                  cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=13), 
                                  verbose=1, iid=True, n_iter=50, refit=True)
    
    cv_model.fit(dx, dy)
    
    return cv_model.best_estimator_, cv_model.best_score_

In [80]:
# use svm as a test
from sklearn.svm import SVC
svm = SVC(random_state=13)

# define search space
tuned_parameters = {"C": [0.001, 0.01, 0.1, 2, 8, 32, 64, 128, 512, 1024, 2048],
                    'gamma':['scale', 'auto'],
                    'probability':[True], 
                    'tol': [0.1, 0.01, 0.001, 0.0001]}

#cv


In [109]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
def report_result(y_test,predicted_target):
#     print('precision',precision_score(y_test, predicted_target,average='macro'))
#     print('f1',f1_score(y_test, predicted_target, average='macro'))
#     print('recall',recall_score(y_test, predicted_target, average='macro'))
    print('recall',accuracy_score(y_test, predicted_target))#recall should be accuracy if in WSD problem
    
    

In [88]:
best_svm_model, best_cv_performance = five_fold_CV(svm, tuned_parameters, train_X_5, train_y)
best_svm_model, best_cv_performance

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   24.3s finished


(SVC(C=1024, gamma='auto', probability=True, random_state=13, tol=0.0001),
 0.8985020384110807)

In [89]:
best_svm_model, best_cv_performance = five_fold_CV(svm, tuned_parameters, train_X_4, train_y)
best_svm_model, best_cv_performance

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:   16.2s finished


(SVC(C=1024, gamma='auto', probability=True, random_state=13),
 0.9083878959888574)

In [90]:
best_svm_model, best_cv_performance = five_fold_CV(svm, tuned_parameters, train_X_3, train_y)
best_svm_model, best_cv_performance

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 216 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    9.2s finished


(SVC(C=512, gamma='auto', probability=True, random_state=13, tol=0.1),
 0.9260987095056243)

In [91]:
best_svm_model, best_cv_performance = five_fold_CV(svm, tuned_parameters, train_X_2, train_y)
best_svm_model, best_cv_performance

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 219 out of 250 | elapsed:    2.6s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    3.0s finished


(SVC(C=512, gamma='auto', probability=True, random_state=13, tol=0.0001),
 0.9161786650422198)

In [92]:
best_svm_model, best_cv_performance = five_fold_CV(svm, tuned_parameters, train_X_1, train_y)
best_svm_model, best_cv_performance

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    1.2s finished


(SVC(C=2, probability=True, random_state=13, tol=0.0001), 0.9279876023360611)

In [110]:
clf = SVC(C=1024, gamma='auto', probability=True, random_state=13, tol=0.0001)
clf.fit(train_X_5, train_y)
predicted_target = clf.predict(test_X_5)
report_result(test_y,predicted_target)

recall 0.94


In [111]:
clf = SVC(C=1024, gamma='auto', probability=True, random_state=13)
clf.fit(train_X_4, train_y)
predicted_target = clf.predict(test_X_4)
report_result(test_y,predicted_target)

recall 0.94


In [112]:
clf = SVC(C=512, gamma='auto', probability=True, random_state=13, tol=0.1)
clf.fit(train_X_3, train_y)
predicted_target = clf.predict(test_X_3)
report_result(test_y,predicted_target)

recall 0.95


In [113]:
clf = SVC(C=512, gamma='auto', probability=True, random_state=13, tol=0.0001)
clf.fit(train_X_2, train_y)
predicted_target = clf.predict(test_X_2)
report_result(test_y,predicted_target)

recall 0.95


In [114]:
clf = SVC(C=2, probability=True, random_state=13, tol=0.0001)
clf.fit(train_X_1, train_y)
predicted_target = clf.predict(test_X_1)
report_result(test_y,predicted_target)

recall 0.91


In [None]:
#can select 2 or 3 as window size

In [115]:
#predicted_target

In [14]:
set([i+1 for i in range(500)]) - set(ar1.id)

set()

In [59]:
ab = data.loc[data.abbrev == "AB"]

In [60]:
ab.iloc[483, 6]

'The patient was asked to be seen for an Internal Medicine consult per Dr. _%#NAME#%_ _%#NAME#%_. HISTORY OF PRESENT ILLNESS: Patient _%#NAME#%_. is a 46-year-old female admitted to station 10-North from Fairview Ridges ER. The parents state that the patient has been having increasing symptoms of anxiety, paranoia and agitation the past several weeks.'