In [1]:
from itertools import chain

#import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import pickle

import pandas as pd
import numpy as np
import re
import os
import eli5



In [2]:
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [110]:
# lab_names = pd.read_csv('/home/jovyan/work/NER-Test/data/labtest_dictionary_v01.csv')
# lab_names = lab_names['test'].tolist()

def _read_upper_column(path, column):
    """Read one column of a CSV file and return its values as upper-cased strings.

    Every cell is passed through ``str()`` first, so missing cells (NaN) end up
    as the literal string 'NAN', exactly as the inline comprehensions did.
    """
    return [str(cell).upper() for cell in pd.read_csv(path)[column].tolist()]


# Lab-test names are kept as read from the CSV; word2features upper-cases them
# on the fly when it compares tokens.
lab_names = pd.read_csv('data/labtest_dictionary_train_cleaned_v04.csv')['test name'].tolist()

# Bug fix: the original comprehension stored the bound method ``str(v).upper``
# (missing call parentheses), so VALUES held method objects and no token could
# ever be found in it.
VALUES = _read_upper_column('data/labtest_additional_values_v02.csv', 'values')

# De-duplicated; sorted so the list order is reproducible between kernel runs.
UNITS = sorted(set(_read_upper_column('data/labtest_units_v04.csv', 'units')))

ABNORMAL = _read_upper_column('data/labtest_abnormal.csv', 'abnormal flag')

class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (word, pos, tag) tuples.

    ``data`` must provide the columns "Sentence #", "Word", "POS" and "Tag".
    After construction:

    * ``grouped``   - result of grouping ``data`` by "Sentence #"; each entry is
                      the list of (word, pos, tag) tuples of one sentence.
    * ``sentences`` - the entries of ``grouped`` as a plain list.
    * ``n_sent``    - 1-based counter used by ``get_next``.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, pos, tag) tuple per row of the sentence group.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence labelled "Sentence: <n_sent>" and advance the counter.

        Returns None (without advancing) when no group carries that label.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except (KeyError, IndexError):
            # Only a failed lookup means "no more sentences"; the original bare
            # ``except:`` also hid unrelated errors and KeyboardInterrupt.
            return None
        self.n_sent += 1
        return s


def _token_flags(token):
    """Return the ordered boolean lexical flags for a single token.

    Reads the module-level ``lab_names``, ``VALUES``, ``UNITS`` and ``ABNORMAL``
    lists. Dictionary lookups are done on the upper-cased token because those
    lists are upper-cased when they are loaded.
    """
    token = str(token)
    upper = token.upper()
    return {
        # Token occurs as a substring of any space-separated part of a lab name.
        # any() stops at the first hit instead of materialising every match.
        'isTestName': any(upper in part
                          for name in lab_names
                          for part in str(name).upper().split(' ')),
        # NOTE(review): the '.' in r'\d*.?\d+' is unescaped, so it matches any
        # character (e.g. 'A1'). Left unchanged because the intent is unclear.
        'isValue': bool(re.match(r'\d*.?\d+|[0-9\.\-\s\>\<\=]+', token)) or upper in VALUES,
        'isUnit': upper in UNITS,
        # The original test (contains '.' AND str.isdigit()) could never be true;
        # a decimal is digits around exactly one dot.
        'isDecimal': '.' in token and token.replace('.', '', 1).isdigit(),
        'isDigit': token.isdigit(),
        # Parenthesised run of digits / range punctuation, e.g. "(12-16)".
        'isRange': bool(re.match(r'\([0-9\.\-\s\>\<\=]+\)', token)),
        'isAbnormal': upper in ABNORMAL,
    }


def _prefixed_features(prefix, token):
    """Build the '<prefix>word.<flag>()' entries for one token.

    The feature value is the token text when the flag holds, '' otherwise.
    """
    token = str(token)
    return {
        '{}word.{}()'.format(prefix, name): (token if flag else '')
        for name, flag in _token_flags(token).items()
    }


def word2features(sent, i):
    """Build the CRF feature dict for token ``i`` of ``sent``.

    ``sent`` is a sequence of (word, postag, label) tuples. The result holds the
    lexical flags of the current token ('word.*'), of the previous token
    ('-1word.*') and of the next token ('+1word.*'); 'BOS' / 'EOS' carry the
    current word when it is the first / last token of the sentence.

    Fixes relative to the earlier version:
    * the neighbour flags isValue/isDecimal tested the *current* word;
    * neighbour tokens are coerced to ``str`` (non-string tokens used to raise
      and silently drop all context features);
    * VALUES and test-name lookups are case-insensitive, like UNITS/ABNORMAL.

    POS-tag and datetime features stay disabled, so they are no longer computed.
    """
    word = str(sent[i][0])
    features = _prefixed_features('', word)

    try:
        if i > 0:
            features.update(_prefixed_features('-1', sent[i - 1][0]))
        else:
            features['BOS'] = word

        if i < len(sent) - 1:
            features.update(_prefixed_features('+1', sent[i + 1][0]))
        else:
            features['EOS'] = word
    except Exception:
        # Best effort, as before: report the offending sentence and return the
        # features collected so far (no longer traps KeyboardInterrupt/SystemExit).
        print(sent, word)

    return features


def sent2features(sent):
    """Return one word2features dict per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows


def sent2labels(sent):
    """Return the label (third element) of every (token, postag, label) triple."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels


def sent2tokens(sent):
    """Return the token (first element) of every (token, postag, label) triple."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens


In [111]:
def replace_dup(s):
    """Upper-case ``s`` and drop parenthesised groups that repeat an outside word.

    Each "( ... )" group made of letters, digits and whitespace is examined in
    turn. If its content (punctuation removed) is longer than one character and
    one of its whitespace-separated tokens also occurs as a space-separated
    word in the rest of the text, the group is deleted. The upper-cased text is
    returned.
    """
    text = str(s).upper()
    # The groups are collected once, before any removal takes place.
    for group in re.findall(r'\([s]*[A-Z0-9\s]+[\s]*\)', text):
        inner = re.sub(r'[^A-Z0-9\s]', '', group)
        if len(inner) <= 1:
            continue
        inner_tokens = re.split(r'[\s]+', str(inner).strip())
        remainder = text.replace(group, '')
        if any(outside in inner_tokens for outside in remainder.split(' ')):
            text = remainder
    return text

def clean_results(v):
    """Normalise a raw lab-result string into a comma-separated token string.

    Steps, in order: remove double quotes; upper-case and blank out every
    character other than A-Z, 0-9 and ``: / - . ( )``; close up whitespace
    around hyphens; drop duplicated parenthesised groups via ``replace_dup``;
    tighten "( X )" into "(X)"; join whitespace-separated tokens with commas;
    finally delete any "date,time" pair from the result.
    """
    unquoted = re.sub(r'\"', '', str(v).strip())

    allowed_only = re.sub(r'[^A-Z0-9\:\/\-\.\(\)]', ' ', unquoted.strip().upper())

    # "12 - 16" -> "12-16": keep both operands, normalise the separator.
    hyphen_joined = re.sub(r'([^\s]+)([\s]*-[\s]*)([^\s]+)', r'\1-\3',
                           allowed_only.strip().upper())

    deduplicated = replace_dup(hyphen_joined)

    tight_parens = re.sub(r'(\()[\s]+(\w+)[\s]*(\))', r'\1\2\3', deduplicated)

    comma_joined = ','.join(re.split(r'[\s]+', str(tight_parens)))

    # A date immediately followed by a time token is removed entirely.
    return re.sub(r'(\d{1,2}\/\d{1,2}\/\d{2,4})(,)(\d{1,2}:\d{1,2}[:\d+]*)', '', comma_joined)

def verify_tagged_set(df_sent):
    """Tag every token of each raw result row by matching it against the row's own fields.

    Columns are read by position:
    0 raw (comma-separated tokens), 1 lab name, 2 value, 3 unit, 4 reference,
    5 abnormal flag, 6 datetime.

    Each raw token is compared case-insensitively, in this order, with the
    value tokens (space-separated), unit tokens (comma-separated), reference
    tokens (hyphen-normalised, space-separated), abnormal flags
    (comma-separated, bare or wrapped in parentheses) and datetime tokens
    (comma-separated). The first match decides the tag (VALUE, UNIT, REF,
    ABNORM, DTIME). Unmatched tokens are tokenised and POS-tagged with NLTK
    and tagged 'O'.

    Returns a DataFrame with columns 'Sentence #', 'Word', 'POS', 'Tag', where
    'Sentence #' is the 0-based position of the source row.
    """
    df_sent = df_sent.fillna(value='')

    def upper_tokens(text, sep):
        # An empty field contributes no candidates, so an empty raw token can
        # never match it.
        if len(text) == 0:
            return set()
        return set(str(part).upper() for part in text.split(sep))

    tagged_list = []
    for sent_idx, (_, row) in enumerate(df_sent.iterrows()):
        # Strip quotes, pipes and square brackets from every field.
        values = [re.sub(r'[\"|\'\[\]]', '', str(w)) for w in row]

        # Candidate sets are built once per row instead of once per token.
        reference = re.sub(r'([^\s]+)([\s]*-[\s]*)([^\s]+)', r'\1-\3',
                           str(values[4]).strip().upper())
        abnormal = upper_tokens(values[5], ',')
        # Bug fix: the original wrote '(' + w + ')'.upper(), which upper-cases
        # only the closing parenthesis, so "(h)" never matched the flag "h".
        abnormal = abnormal | set('(' + flag + ')' for flag in abnormal)

        rules = [
            ('VALUE', 'CD', upper_tokens(values[2], ' ')),
            ('UNIT', 'CD', upper_tokens(values[3], ',')),
            ('REF', 'CD', set(reference.split(' ')) if len(reference) > 0 else set()),
            ('ABNORM', 'NN', abnormal),
            ('DTIME', 'CD', upper_tokens(values[6], ',')),
        ]

        for aword in re.split(r',', str(values[0])):
            key = str(aword).upper()
            for atag, apos, candidates in rules:
                if key in candidates:
                    tagged_list.append([sent_idx, aword, apos, atag])
                    break
            else:
                # The notebook's top-level `import nltk` is commented out, so
                # import lazily here; only untagged tokens need NLTK.
                import nltk
                for token, apos in nltk.pos_tag(nltk.word_tokenize(aword)):
                    tagged_list.append([sent_idx, token, apos, 'O'])

    # Naming the columns up front keeps the schema stable for empty input too.
    return pd.DataFrame(tagged_list, columns=['Sentence #', 'Word', 'POS', 'Tag'])

def make_tagged_set(df_sent):
    """Expand rows of (token list, label list) into a token-level tagged DataFrame.

    The first column holds a bracketed, comma-separated token list and the
    second column the matching label list, e.g. "[12.5, mg/dL]" and
    "[VALUE, UNIT]". Rows whose two lists differ in length are printed and
    skipped. Tokens labelled VALUE, UNIT, REF or DTIME get POS 'CD', ABNORM gets
    'NN'; any other label keeps its tag, but the token is stripped of
    parentheses/pipes, tokenised and POS-tagged with NLTK.

    Prints the number of sentences kept and the shape of the result, and
    returns a DataFrame with columns 'Sentence #', 'Word', 'POS', 'Tag', where
    'Sentence #' counts kept rows from 0.
    """
    df_sent = df_sent.fillna(value='')
    pos_by_tag = {'VALUE': 'CD', 'UNIT': 'CD', 'REF': 'CD', 'ABNORM': 'NN', 'DTIME': 'CD'}

    def split_list(cell):
        # "[a, b, c]" -> ['a', 'b', 'c']
        return re.split(r',[\s]*', re.sub(r'\[|\]', '', str(cell)))

    tagged_list = []
    n_sentences = 0
    for _, row in df_sent.iterrows():
        # Positional access via .iloc: plain row[0] on a label-indexed Series
        # is deprecated in pandas 2.x and no longer positional in pandas 3.
        words = split_list(row.iloc[0])
        labels = split_list(row.iloc[1])

        if len(words) != len(labels):
            # Report the lengths that were actually compared.
            print(row, len(words), len(labels))
            continue

        for aword, atag in zip(words, labels):
            if atag in pos_by_tag:
                tagged_list.append([n_sentences, aword, pos_by_tag[atag], atag])
            else:
                # The notebook's top-level `import nltk` is commented out, so
                # import lazily here; only unlabelled tokens need NLTK.
                import nltk
                cleaned = re.sub(r'[\(|\)]', '', aword)
                for token, apos in nltk.pos_tag(nltk.word_tokenize(cleaned)):
                    tagged_list.append([n_sentences, token, apos, atag])
        n_sentences += 1

    print(n_sentences)
    # Naming the columns up front keeps the schema stable for empty input too.
    tagged_df = pd.DataFrame(tagged_list, columns=['Sentence #', 'Word', 'POS', 'Tag'])
    print(tagged_df.shape)

    return tagged_df

In [None]:
def preprocessing(s):
    """Upper-case ``s`` and split it into a list of tokens.

    The text is split on runs of whitespace, and on the first alternative of
    the pattern below.

    NOTE(review): the leading ``[\\([s]*`` and trailing ``\\)]`` look like a
    garbled attempt at matching a parenthesised group; as written the first
    alternative only matches text that ends in the literal ")]". Confirm the
    intended pattern before relying on it.
    """
    upper_text = str(s).upper()
    splitter = re.compile(r'[\([s]*[A-Z0-9\s]+[\s]*\)]|[\s]+')
    return splitter.split(upper_text)

In [4]:
%%time

# Load the tagged rows, tokenise 'lab_result_1' with preprocessing(), and record
# in 'check' whether the tokens agree with those listed in 'lab_result_0'.
df_test_data = pd.read_csv('data/train/train_round123_v31_structured_v14_label_test_tagged.csv')
# df_test_data['lab_result_1'] = df_test_data['lab_result_0']
# df_test_data['lab_result_0'] = df_test_data['lab_result_0'].apply(clean_results)
#test_tagged = make_tagged_set(df_test_data[['lab_result_0','true_label']])
df_test_data['parsed'] = df_test_data['lab_result_1'].apply(preprocessing)


def _listed_tokens(cell):
    """Strip brackets/quotes from a list-like cell and split it on commas."""
    return re.split(r'[\s]*,[\s]*', re.sub(r'[\[\]\'\"]', '', str(cell)))


# Rows are addressed by column label: positional row[0] / row[1] on a
# label-indexed Series is deprecated in pandas 2.x and fails in pandas 3.
df_test_data['check'] = df_test_data[['lab_result_0', 'parsed']].apply(
    lambda row: _listed_tokens(row['lab_result_0']) == _listed_tokens(row['parsed']),
    axis=1,
)
df_test_data.to_csv('data/test/train_round123_v31_structured_v14_preprocessing.csv', index=False)

CPU times: user 128 ms, sys: 17.9 ms, total: 146 ms
Wall time: 168 ms
