In [1]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import pickle

import pandas as pd
import numpy as np
import re
import os

In [None]:
! pip install sklearn-crfsuite; ls /opt/conda/lib/python3.9/site-packages/sklearn_crfsuite/

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# lab_names = pd.read_csv('/home/jovyan/work/NER-Test/data/labtest_dictionary_v01.csv')
# lab_names = lab_names['test'].tolist()

lab_names = pd.read_csv('/home/jovyan/work/NER-Test/data/labtest_dictionary_train_cleaned.csv')
lab_names = lab_names['test name'].tolist()

UNITS = pd.read_csv('/home/jovyan/work/NER-Test/data/labtest_units_v02.csv')
UNITS = UNITS['units'].tolist()
UNITS = [str(u).upper() for u in UNITS]
UNITS = list(set(UNITS))

class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None


def word2features(sent, i):
    word = str(sent[i][0])
    postag = str(sent[i][1])

    isTestName = True if len([token for w in lab_names for token in str(w).upper().split(' ') if token.find(word)>-1])>0 else False
    isUnit = True if word.upper() in UNITS else False
    isDecimal = True if re.findall(r'(\.)', word) and word.isdigit() else False
    isDigit = True if word.isdigit() else False
    isRange = True if re.match(r'\d*.?\d+-\d*.?\d+', word) else False
    #isRange = True if re.match(r'\(\d+-\d+\)', word) else False
    
    features = {
        'word.isTestName()': word if isTestName else '',
        'word.isUnit()': word if isUnit else '',
        'word.isDecimal()': word if isDecimal else '',
        'word.isDigit()': word if isDigit else '',
        'word.isRange()': word if isRange else '',
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    try:
        if i > 0:
            word1 = sent[i - 1][0]
            postag1 = sent[i - 1][1]

            isTestName1 = True if len([token for w in lab_names for token in str(w).upper().split(' ') if token.find(word)>-1])>0 else False
            isUnit1 = True if word.upper() in UNITS else False
            isDecimal1 = True if re.findall(r'(\.)', word1) and word1.isdigit() else False
            isDigit1 = True if word1.isdigit() else False
            isRange1 = True if re.match(r'\d*.?\d+-\d*.?\d+', word1) else False

            features.update({
                '-1word.isTestName()': word1 if isTestName1 else '',
                '-1word.isUnit()': word1 if isUnit1 else '',
                '-1word.isDecimal()': word1 if isDecimal1 else '',
                '-1word.isDigit()': word1 if isDigit1 else '',
                '-1word.isRange()': word1 if isRange1 else '',
                '-1postag': postag1,
                '-1postag[:2]': postag1[:2],
            })
        else:
            #features['BOS'] = True
            features.update({
                'BOS': word
            })

        if i < len(sent) - 1:
            word1 = sent[i + 1][0]
            postag1 = sent[i + 1][1]


            isTestName1 = True if len([token for w in lab_names for token in str(w).upper().split(' ') if token.find(word)>-1])>0 else False
            isUnit1 = True if word.upper() in UNITS else False
            isDecimal1 = True if re.findall(r'(\.)', word1) and word1.isdigit() else False
            isDigit1 = True if word1.isdigit() else False
            isRange1 = True if re.match(r'\d*.?\d+-\d*.?\d+', word1) else False

            features.update({
                '+1word.isTestName()': word1 if isTestName1 else '',
                '+1word.isUnit()': word1 if isUnit1 else '',
                '+1word.isDecimal()': word1 if isDecimal1 else '',
                '+1word.isDigit()': word1 if isDigit1 else '',
                '+1word.isRange()': word1 if isRange1 else '',
                '+1postag': postag1,
                '+1postag[:2]': postag1[:2],
            })
        else:
            #features['EOS'] = True        
            features.update({
                'EOS': word
            })
    except:
        print(sent,word)

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [label for token, postag, label in sent]


def sent2tokens(sent):
    return [token for token, postag, label in sent]


def extract_labs(t):
    if len(t)<1:
        return ['','']

    t = re.sub(r'\"','',str(t).strip())    
    t = re.sub(r'[^A-Z0-9-]',' ',str(t).strip().upper())            
    t = re.sub(r'([\s]*)(-)([\s]*)',r'\1 \3',str(t).strip().upper())

    wordlist = nltk.pos_tag(nltk.word_tokenize(str(t)))
    tagged_list = []
    for atagged in wordlist:
        tagged_list.append([0,atagged[0],atagged[1]])
    tagged_df = pd.DataFrame(tagged_list)
    if tagged_df.shape[0]==0:
        return ['','']
    tagged_df = tagged_df.rename(columns={0:'Sentence #',1:'Word',2:'POS'})
    
    #print(tagged_df.head())
    func = lambda s: [(w, p) for w, p in zip(s["Word"].values.tolist(), s["POS"].values.tolist())]
    grouped = tagged_df.groupby("Sentence #").apply(func)
    sentences = [s for s in grouped]
    test_sents = sentences
    
    X_test = [sent2features(s) for s in test_sents]
    y_pred = crf_model.predict(X_test)
    prediction = []
    for sent, label in zip(test_sents,y_pred):
        asent = []
        atruth = []
        apred = []
        for s, l in zip(sent, label):
            asent.append(s[0])

            if l == 'LAB':
                apred.append(s[0])
        prediction.append([' '.join(asent), ' '.join(apred)])
    sent, lab = prediction[0]    
    return [sent, lab]



In [4]:
def make_tagged_set(df_sent):

    df_sent = df_sent.fillna(value='')

    filtered = []
    for key, values in df_sent.iterrows():
        arow = []
        for v in values:
            
            v = re.sub(r'\"','',str(v).strip())            
            v = re.sub(r'[^A-Z0-9-]',' ',str(v).strip().upper())            
            v = re.sub(r'([\s]*)(-)([\s]*)',r'\1 \3',str(v).strip().upper())
            arow.append(v)

        tags = []
        term = ' '.join(arow[:-1])
        wordlist = nltk.pos_tag(nltk.word_tokenize(term))        
            
        for i in range(len(wordlist)):
            aword, apos = wordlist[i]
            atag = arow[-1] if arow[-1]!='' else 'O'

            tags.append((aword, apos, atag))

        filtered.append(pd.DataFrame(tags))

    tagged_list = []
    for key, values in enumerate(filtered):
        for k,atagged in values.iterrows():
            tagged_list.append([key, atagged[0], atagged[1], atagged[2]])
    tagged_df = pd.DataFrame(tagged_list)
    tagged_df = tagged_df.rename(columns={0: 'Sentence #', 1: 'Word', 2: 'POS', 3: 'Tag'})
    tagged_df.shape
    
    return tagged_df

In [None]:
data = pd.read_csv('/home/jovyan/work/NER-Test/data/train/train_test_samples_v04.csv')
data = data.dropna()
data.shape

In [None]:
data = pd.read_csv('data/train/train_round1.csv')
data.shape

In [None]:
data[data['CLASS']=='LAB'].shape

In [None]:

train = data[:40000].copy()
train = train.dropna(subset=['lab_result_0'])

train_tagged = make_tagged_set(train)
train_tagged = train_tagged.fillna(method="ffill")
words = list(set(train_tagged["Word"].values))
n_words = len(words)
tags = list(set(train_tagged["Tag"].values))
n_tags = len(tags)
getter = SentenceGetter(train_tagged)
train_sentences = getter.sentences

train_sents = train_sentences

In [None]:
test = data[40000:].copy()
test = test.dropna(subset=['lab_result_0'])
test_tagged = make_tagged_set(test)
test_tagged = test_tagged.fillna(method="ffill")
words = list(set(test_tagged["Word"].values))
n_words = len(words)
tags = list(set(test_tagged["Tag"].values))
n_tags = len(tags)
getter = SentenceGetter(test_tagged)
test_sentences = getter.sentences
test_sents = test_sentences

In [None]:
sent2features(train_sents[0])[0]

In [None]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

In [None]:
%%time
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]

In [None]:
len(X_test), len(y_test)

In [None]:
%%time

labels = tags # list(crf.classes_)

# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=100, 
                        scoring=f1_scorer)




In [None]:
%%time

rs.fit(X_train, y_train)

In [None]:
# crf_model_lab_v01 all features
# crf_model_lab_v02 add isDigital
# crf_model_lab_v03 update BOS and EOS
# crf_model_lab_v04 9000 training samples -- big impact
# crf_model_lab_v05 fixed case insensitive
# crf_model_lab_v06 fixed term matching no BOS EOS
# crf_model_lab_v07 with BOS EOS --big impact
# crf_model_lab_v08 for numbers, use isDigital only -- no impact # best setting
# crf_model_lab_v09 use BOS EOS,add range pattern -- big impact cv score:0.9528583445541344, F1 Score: 0.845,0.964
# crf_model_lab_v10 use isDigital, isDecimal -- CV:0.9505771665104099, F1:0.841, 0.963 
# crf_model_lab_v11 use train_all_samples.csv
# crf_model_lab v12 use labtest_dictionary.csv and labtest_units.csv
# crf_model_lab v13 use train_test_samples.csv
# crf_model_lab v14 use train_test_samples_v03.csv
# crf_model_lab v15 use train_test_samples_v04.csv
# crf_model_lab v16 use train_test_samples_v04.csv with training dictionary
# crf_model_lab v16 use train_test_samples_v04.csv with training dictionary, labtest_units_v02.csv
# crf_model_lab_round1_v01.pkl use train_round1.csv
pickle.dump(rs, open('/home/jovyan/work/NER-Test/models/crf_model_lab_round1_v01.pkl','wb'))

# Evaluation

In [None]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
sorted_labels = tags
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

In [None]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))
crf = rs.best_estimator_
print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

In [None]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))
crf = rs.best_estimator_
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(50))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-50:])

In [None]:
import eli5
crf = rs.best_estimator_
eli5.show_weights(crf, top=(30,30))
#eli5.show_weights(crf, top=250)

In [None]:
! pip install eli5

# Simple Test

In [16]:
crf_model = pickle.load(open('/home/jovyan/work/NER-Test/models/crf_model_lab_v17.pkl','rb'))
v = 'PLATELETS                             434         '
extract_labs(v)

['PLATELETS 434', '']

# Testing

In [10]:
%%time
crf_model = pickle.load(open('/home/jovyan/work/NER-Test/models/crf_model_lab_v17.pkl','rb'))
data = pd.read_csv('data/train/train_round2.csv')
X_test = data.copy()
X_test = X_test.fillna(value='')
X_test['lab_result'] = X_test['lab_result_0'].apply(extract_labs)
X_test['LAB_TEXT'] = X_test['lab_result'].apply(lambda s:s[0])
X_test['LAB_TEST'] = X_test['lab_result'].apply(lambda s:s[1])
X_test.head()

CPU times: user 52min 22s, sys: 5.78 s, total: 52min 28s
Wall time: 52min 31s


Unnamed: 0,lab_result_0,CLASS,lab_result,LAB_TEXT,LAB_TEST
0,Patient,,"[PATIENT, ]",PATIENT,
1,"Name KIENTZ, SHERIL (65yo, F) ID# 6081528 A...",,[NAME KIENTZ SHERIL 65YO F ID 6081528 APPT DAT...,NAME KIENTZ SHERIL 65YO F ID 6081528 APPT DATE...,
2,DOB 05/28/1957,,"[DOB 05 28 1957, ]",DOB 05 28 1957,
3,Service Dept. Holton Family Medicine,,"[SERVICE DEPT HOLTON FAMILY MEDICINE, ]",SERVICE DEPT HOLTON FAMILY MEDICINE,
4,Provider MALIA WARNER MD,,"[PROVIDER MALIA WARNER MD, ]",PROVIDER MALIA WARNER MD,


In [11]:
X_test.to_csv('/home/jovyan/work/NER-Test/data/test/test-round2_v01.csv',index=False)

In [12]:
X_test = X_test.fillna(value='')

X_test['check'] = X_test.apply(lambda s: True if s[1]=='LAB' and s[4]=='' else False, axis=1)
X_test.shape, X_test[X_test['check']].shape

((38426, 6), (2841, 6))

In [13]:
# save misclassified
X_test[X_test['check']].drop_duplicates(subset=['lab_result_0']).to_csv('/home/jovyan/work/NER-Test/data/test/test-round2-false_v01.csv',index=False)

In [14]:
X_test.to_csv('/home/jovyan/work/NER-Test/data/test/test-round2_v01.csv',index=False)

In [None]:
df_verify = X_test.copy()
df_verify['check'] = df_verify['lab_result_0'].apply(lambda s: True if str(s).upper().find('CHLORIDE')>-1 else False)
df_verify[df_verify['check']].shape

In [None]:
df_verify[df_verify['check']].to_csv('data/verify-CHLORIDE.csv',index=False)