This notebook produces the datasets used by the NN models. It has various hyperparameters which affect the y labels.

There are two separate x input sets  : token vectors  (\_1) & a binary feature vector indicating the token to classify (\_2)

The hyper parameters for the label matrices are described in the filename for the y matrices

The y labels are used by both the CNN and UTH-CCB, ie the labels are the same for both models, only the inputs are different.

In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelBinarizer
from datetime import datetime, timedelta
import gensim
import negation_data

# Folder layout: every data folder hangs off a single root.
data_root_folder = 'data'
data_raw_folder = '/'.join((data_root_folder, 'raw'))
data_processed_folder = '/'.join((data_root_folder, 'processed'))
data_cached_folder = '/'.join((data_root_folder, 'cached'))

# Experiment number substituted into the sentence-file names:
#   8 = referral
#   9 = primary
experiment = 9

# study clinical finding tokens
# Lookup supplied by the project-local negation_data module. sentences_to_dataset
# indexes it with the lower-cased StudyId and hands the entry to
# make_study_token_vectors, which applies each phrase as a regular expression.
study_phrases=negation_data.get_study_phrases()

# these labels match https://hhsrvmlr1.rvc.ac.uk:8888/notebooks/fp/False%20positives%20-%20CNN%20-%20Dataset%20production.ipynb
# Display names for the class indices 0..3, in index order.
class_labels = dict(enumerate([
    '1-never diagnosed',
    '2-prior to diagnostic window',
    '3-after diagnostic window',
    '4-during diagnostic window',
]))


# hyper params

# size of token vectors
embedding_dimension = 300

# maxlen handed to pad_sequences: longer sentences are truncated to this length
max_document_length = 300

# number of days backwards in time to adjust the diagnosis date by
# (only applied by the 3-class labelling in sentences_to_dataset)
adjust_diagnosis_date_by_days = 0

# number of hours before diagnosis at which the diagnosis window starts (4-class labelling).
# NOTE(review): sentences_to_dataset labels a note written exactly at the window
# start as "during", i.e. the start is inclusive in the code - confirm intent.
window_before_diagnosis = 8

# number of hours after diagnosis at which the diagnosis window ends (4-class labelling).
# NOTE(review): sentences_to_dataset labels a note written exactly at the window
# end as "after", i.e. the end is exclusive in the code - confirm intent.
window_after_diagnosis = 0

# controls the number of classes of y labels, choose amongst 2,3,4
number_of_classes = 4

def get_tokeniser(file_name): #eg filename='negation_detection_sentences_experiment_8_train.txt'
    """Fit a Keras Tokenizer on the sentences of one processed sentence file.

    file_name is looked up inside data_processed_folder and read as a
    comma-separated table with a header row. Every row must hold exactly nine
    values; the last one is the sentence text fed to the tokeniser.
    """
    def iter_sentences():
        frame = pd.read_csv(data_processed_folder + '/' + file_name, sep=',', header=0)
        for _, record in frame.iterrows():
            # positional unpack: raises if a row does not have nine columns
            (_study, _patient, _note, _case, _sublabel,
             _label, _dx_date, _recorded, sentence) = record
            yield sentence

    fitted = Tokenizer()
    fitted.fit_on_texts(iter_sentences())
    return fitted

def make_study_token_vectors(study_phrases,tokens):
    """Flag the tokens covered by the first study phrase found in the sentence.

    The tokens are joined with single spaces and every entry of study_phrases
    is applied to that string as a regular expression, in order. Only the
    first phrase that matches is used, and only its first match. A token is
    flagged when its whole character span lies inside the matched span.

    Returns a float32 array of shape (len(tokens), 1) holding 1.0 for flagged
    tokens and 0.0 elsewhere (all zeros when no phrase matches).
    """
    import re

    sentence = ' '.join(tokens)

    # Character span [start, end) of each token inside `sentence`; regex spans
    # use the same convention, and tokens are one joining space apart.
    starts, ends = [], []
    cursor = 0
    for token in tokens:
        starts.append(cursor)
        ends.append(cursor + len(token))
        cursor += len(token) + 1

    flags = np.zeros((len(tokens), 1), dtype='float32')

    # Try the phrases lazily, in order, and stop at the first one that matches
    # (there may be more than one, but only the first is indicated).
    searches = (re.search(phrase, sentence) for phrase in study_phrases)
    first_hit = next((found for found in searches if found), None)
    if first_hit is None:
        return flags

    hit_start, hit_end = first_hit.span()
    for position, (start, end) in enumerate(zip(starts, ends)):
        if hit_start <= start and end <= hit_end:
            flags[position] = 1.0
    return flags
        
def parse_record_datetime(value):
    """Parse a 'dd/mm/YYYY h:mm:ss AM|PM' timestamp string into a datetime.

    The hour is read with %I so that the AM/PM marker is honoured: strptime
    only lets %p affect the hour when the hour itself is parsed with %I, so a
    24-hour hour directive followed by %p reads '3:45:00 PM' as 03:45.
    Values that do not fit the 12-hour layout are retried with the format
    string this notebook used originally.

    Raises ValueError when neither layout matches.
    """
    try:
        return datetime.strptime(value, '%d/%m/%Y %I:%M:%S %p')
    except ValueError:
        # e.g. an hour of 00 or 13-23: fall back to the original format string
        return datetime.strptime(value, '%d/%m/%Y %X %p')


def temporal_class_label(diagnosis_date, note_recorded_date, classes,
                         adjust_days=0, hours_before=0, hours_after=0):
    """Return the 3- or 4-class label of a sentence from its dates.

    diagnosis_date / note_recorded_date: timestamp strings as stored in the
        sentence files; a null diagnosis_date means "never diagnosed".
    classes: 3 or 4.
    adjust_days: (3 classes) days to move the diagnosis date back in time.
    hours_before / hours_after: (4 classes) extent of the diagnosis window
        around the diagnosis date.

    3 classes: 0 = never diagnosed, 1 = written before dx, 2 = written at/after dx.
    4 classes: 0 = never diagnosed, 1 = written before the diagnosis window,
               2 = written at/after the end of the window, 3 = written during it
               (window start inclusive, window end exclusive).

    Raises ValueError for any other value of classes.
    """
    if classes not in (3, 4):
        raise ValueError('classes must be 3 or 4, got %r' % (classes,))
    if pd.isnull(diagnosis_date):
        return 0

    # pd currently has the dates as string.
    dx_date = parse_record_datetime(diagnosis_date)
    note_date = parse_record_datetime(note_recorded_date)

    if classes == 3:
        dx_date = dx_date - timedelta(days=adjust_days)
        return 1 if note_date < dx_date else 2

    window_start = dx_date - timedelta(hours=hours_before)
    window_end = dx_date + timedelta(hours=hours_after)
    if note_date < window_start:
        return 1
    if note_date >= window_end:
        return 2
    return 3


def sentences_to_dataset(tokeniser,file_name):#eg filename='negation_detection_sentences_experiment_8_train.txt'
    """Turn one processed sentence file into the model inputs and labels.

    tokeniser: fitted tokeniser providing texts_to_sequences.
    file_name: comma-separated file (with header row, nine columns per row)
        inside data_processed_folder.

    Reads the module-level settings number_of_classes, max_document_length,
    adjust_diagnosis_date_by_days, window_before_diagnosis and
    window_after_diagnosis, plus study_phrases and embedding_to_token_map.

    Returns (x, x_study_token_vectors, y):
        x                      token ids per sentence, padded by pad_sequences
        x_study_token_vectors  per-token study-phrase flags, padded the same way
        y                      LabelBinarizer encoding of the class labels

    Raises ValueError when number_of_classes is not 2, 3 or 4 (previously no
    labels were produced at all in that case).
    """
    if number_of_classes not in (2, 3, 4):
        raise ValueError('number_of_classes must be 2, 3 or 4, got %r' % (number_of_classes,))

    df = pd.read_csv(data_processed_folder + '/' + file_name, sep=',', header=0)
    x = []
    x_study_token_vectors = []
    y = []
    for index, row in df.iterrows():
        StudyId,PatientID,NoteID,CaseLabel,Sublabel,SentenceLabel,DiagnosisDate,SourceNoteRecordedDate,Sentence = row

        # create input 1. Tokens as integer (for embedding layer)
        sequences = tokeniser.texts_to_sequences([Sentence])
        x_i = np.concatenate(sequences)  # tokeniser outputs sentences, just concat them back together
        x.append(x_i)

        # create input 2. binary feature vector 1 = token in study phrase, 0 = token not in study phrase
        phrases = study_phrases[str(StudyId).lower()]
        tokens = [embedding_to_token_map[embedding_id] for embedding_id in x_i]
        x_study_token_vectors.append(make_study_token_vectors(phrases, tokens))

        # now create the label
        if number_of_classes == 2:
            # binary labels: 0/1 FP or not, taken straight from the file
            y.append(np.full((1), int(SentenceLabel)))
        else:
            y.append(temporal_class_label(
                DiagnosisDate, SourceNoteRecordedDate, number_of_classes,
                adjust_days=adjust_diagnosis_date_by_days,
                hours_before=window_before_diagnosis,
                hours_after=window_after_diagnosis))

    # pre process
    x = pad_sequences(x, maxlen=max_document_length)
    x_study_token_vectors = pad_sequences(x_study_token_vectors, maxlen=max_document_length)

    lb = LabelBinarizer()
    if number_of_classes == 2:
        y = lb.fit_transform(y)
    else:
        # Fit on the full class range rather than on the labels that happen to
        # occur in this file, so train/dev/test always share the same columns
        # even when a split contains no example of some class.
        lb.fit(np.arange(number_of_classes))
        y = lb.transform(y)

    x = np.stack(x)
    x_study_token_vectors = np.stack(x_study_token_vectors)
    y = np.stack(y)
    return x, x_study_token_vectors, y
        

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def load_word2vec_new(path='data/processed/vec_sg_20180321.txt'):
    """Load word vectors stored in word2vec text (non-binary) format.

    path: location of the vector file; defaults to the file this notebook has
        always loaded, so existing zero-argument calls are unaffected.

    Returns whatever gensim's KeyedVectors.load_word2vec_format returns.
    Undecodable bytes in the file are ignored (unicode_errors='ignore').
    """
    return gensim.models.KeyedVectors.load_word2vec_format(path, binary=False, unicode_errors='ignore')

In [3]:
#load tokeniser & embeddings
def get_tokeniser(file_name=None):
    """Fit a Keras Tokenizer on the sentences of the processed sentence files.

    file_name: optional. A single file name, or an iterable of file names,
        inside data_processed_folder. When omitted, the training files of
        experiments 8 and 9 are both used (in that order), so the vocabulary
        covers both experiments.

    Each file is read as a comma-separated table with a header row; every row
    must hold exactly nine values, the last one being the sentence text.
    """
    if file_name is None:
        file_names = ['negation_detection_sentences_experiment_8_train.txt',
                      'negation_detection_sentences_experiment_9_train.txt']
    elif isinstance(file_name, str):
        file_names = [file_name]
    else:
        file_names = list(file_name)

    def get_sentences_generator():
        # files are read lazily, one after the other, as the tokeniser consumes them
        for name in file_names:
            df = pd.read_csv(data_processed_folder + '/' + name, sep=',', header=0)
            for index, row in df.iterrows():
                StudyId,PatientID,NoteID,CaseLabel,Sublabel,SentenceLabel,DiagnosisDate,SourceNoteRecordedDate,Sentence = row
                yield Sentence

    tokeniser = Tokenizer()
    tokeniser.fit_on_texts(get_sentences_generator())

    return tokeniser

tokeniser = get_tokeniser()

# inverse lookup embedding integer -> token
embedding_to_token_map = dict(
    (token_id, token) for token, token_id in tokeniser.word_index.items()
)
# id 0 gets a placeholder token (presumably the padding id - confirm)
embedding_to_token_map[0] = 'NULL'

# vocabulary size as seen by the tokeniser
num_token_types = len(tokeniser.word_index)

# pre-trained word vectors
sg = load_word2vec_new()



In [None]:
# load sentence data and vectorise it: one (tokens, study-token flags, labels)
# triple per split of the selected experiment
x_train, x_train_2, y_train = sentences_to_dataset(
    tokeniser,
    file_name='negation_detection_sentences_experiment_%s_train.txt' % experiment,
)
x_dev, x_dev_2, y_dev = sentences_to_dataset(
    tokeniser,
    file_name='negation_detection_sentences_experiment_%s_dev.txt' % experiment,
)
x_test, x_test_2, y_test = sentences_to_dataset(
    tokeniser,
    file_name='negation_detection_sentences_experiment_%s_test.txt' % experiment,
)



In [None]:
# sanity check: shapes of the training inputs and of the training label matrix
(x_train.shape, y_train.shape)

In [None]:
# adds the rows of y_train together, giving one total per label column
sum(y_train)

In [None]:
# persist datasets

import numpy as np

def persist(x_1,x_2,y,dataset='train'):
    """Save one split's arrays as .npy files under data/datasets.

    x_1, x_2: the two input arrays; written to x_<dataset>_1_exp_<experiment>.npy
        and x_<dataset>_2_exp_<experiment>.npy.
    y: the label matrix; its file name additionally records number_of_classes,
        window_before_diagnosis and window_after_diagnosis.
    dataset: split name used in the file names ('train', 'dev', 'test', ...).

    The experiment number and label settings are read from module level.
    """
    import os

    output_folder = 'data/datasets'
    # np.save does not create missing folders, so make sure the target exists
    os.makedirs(output_folder, exist_ok=True)

    np.save('%s/x_%s_1_exp_%s.npy' % (output_folder, dataset, experiment), x_1)
    np.save('%s/x_%s_2_exp_%s.npy' % (output_folder, dataset, experiment), x_2)
    np.save('%s/y_%s_exp_%s_classes_%s_window_before_diagnosis_%s_window_after_diagnosis_%s.npy' %
            (output_folder, dataset, experiment, number_of_classes,
             window_before_diagnosis, window_after_diagnosis), y)

# write out all three splits
persist(x_train, x_train_2, y_train, dataset='train')
persist(x_dev, x_dev_2, y_dev, dataset='dev')
persist(x_test, x_test_2, y_test, dataset='test')

In [None]:
# number of entries in the fitted tokeniser's word_index
num_token_types

In [4]:
# I used this to check that the sentence labels were being calculated according to the method in the paper.
def get_class(y):
    """Return the class index encoded by one row of a binarised label matrix.

    y: one row of the label matrix (any indexable sequence of 0/1 values).

    A one-hot row yields the position of its first 1, or None when the row
    contains no 1. A single-element row is treated as a binary label and its
    0/1 value is returned directly: LabelBinarizer encodes two-class problems
    as one column, which the positional scan would misread ([1] -> class 0)
    or overrun ([0] -> IndexError).
    """
    if len(y) == 1:
        return int(y[0])
    for class_index, flag in enumerate(y):
        if flag == 1:
            return class_index
    return None
      
def smoke_test_classifications():
    """Print the derived class next to the source dates for the first dev rows.

    Rebuilds the dev dataset with sentences_to_dataset, re-reads the same
    source file, and for each row up to and including index 20 prints the
    class label derived from y, the diagnosis and note dates, and the
    sentence, so the labelling can be checked by eye.

    The original body also called four_class_label() on every row; that name
    is not defined anywhere in this notebook and its result was never used, so
    the call has been dropped to let the check run.
    """
    file_name = 'negation_detection_sentences_experiment_%s_dev.txt' % experiment

    # these are the datasets produced by the method (only the labels are needed here)
    _, _, y_dev = sentences_to_dataset(tokeniser, file_name=file_name)

    # lets compare them back to the source file
    df = pd.read_csv(data_processed_folder + '/' + file_name, sep=',', header=0)
    for index, row in df.iterrows():
        StudyId,PatientID,NoteID,CaseLabel,Sublabel,SentenceLabel,DiagnosisDate,SourceNoteRecordedDate,Sentence = row
        sentence_class = get_class(y_dev[index])
        print('*** %s ***' %class_labels[sentence_class])
        print('Dx=%s DocumentDate=%s'%(DiagnosisDate,SourceNoteRecordedDate))
        print(Sentence)
        if index == 20:
            return

