In [1]:
# this produces & persists datasets for the UTH-CCB model

import scipy
from scipy.sparse import csr_matrix

# Experiment id used to pick the per-experiment sentence files.
# NOTE: only the (currently commented-out) run_uth_ccb(experiment=experiment) call reads this value;
# it is reassigned to 'pri_ref' further down before the datasets are saved.
experiment=9 #8=referral, 9 = primary

In [4]:

# this is a reproduction of :

# Xu, Jun, Yaoyun Zhang, Jingqi Wang, et al. 
# 2015. UTH-CCB: The Participation of the SemEval 2015 Challenge – Task 14. Proceedings of SemEval-2015. 
# http://www.anthology.aclweb.org/S/S15/S15-2.pdf#page=353, accessed December 21, 2016.



import negation_data
import pandas as pd
import numpy as np
import sklearn
from collections import Counter
import gensim
from sklearn.feature_extraction import DictVectorizer
from pycorenlp import StanfordCoreNLP # requires the stanford CoreNLP server to be running

# client for the CoreNLP server; extract_dependency_features uses it to obtain dependency parses
corenlp = StanfordCoreNLP('http://localhost:9001')

# 2 passes.  
# 1 : extract features
# 2 : create vectors


# a feature extraction function is a function from a sentence + metadata to a dictionary of string -> real

def extract_unigrams(study_id,tokens):
    """Count how often each token occurs in the sentence.

    study_id is accepted for signature parity with the other extractors and is not used.
    Returns a Counter mapping token -> occurrence count (as a float).
    """
    counts = Counter()
    for word in tokens:
        # float increments keep every feature value real-valued
        counts.update({word: 1.0})
    return counts

def extract_bigrams(study_id,tokens):
    """Count adjacent token pairs, plus sentence-boundary markers.

    study_id is accepted for signature parity with the other extractors and is not used.

    Returns a Counter with:
      * 'STOP <first token>' and '<last token>_STOP' set to 1.0
      * '<token> <next token>' incremented by 1.0 for every adjacent pair

    An empty token list yields an empty Counter (previously this raised IndexError on
    tokens[0]).
    """
    stop="STOP"

    features=Counter()
    if not tokens:
        # nothing to pair up, and no first/last token to attach the boundary markers to
        return features

    # handle first and last tokens
    # NOTE: the trailing marker is joined with '_' while every other bigram uses ' ';
    # this is kept as-is so the emitted feature names do not change.
    features[stop + ' ' + tokens[0]] = 1.0
    features[tokens[-1] + '_' + stop] = 1.0

    for left, right in zip(tokens, tokens[1:]):
        features[left + ' ' + right] += 1.0

    return features

# regexes = list of tuples regex -> numpy array of feature representation
# n = number of surrounding tokens to include in regex matching for phrase matching, i.e. n=1 compares the token plus 1 token either side of it
def extract_group_features(study_phrases, study_id, tokens):
    """Emit distance-grouped context features around the first token covered by a study phrase.

    study_phrases : iterable of regexes (strings or compiled patterns) for the study's disorder
    study_id      : accepted for signature parity with the other extractors; not used
    tokens        : list of token strings for one sentence

    Handling multi token regexes (ie 'ruled out') is tricky with regards to tokenisation, so
    the tokens are re-joined with single spaces, every regex is matched against that sentence,
    and a token counts as matched when its character span lies entirely inside a match span.

    Only the first matched token (in sentence order) produces features. For it, up to 8 tokens
    on each side are emitted as '<token>_<side><group>' where side is L or R and group is
    1 (adjacent), 4 (distance 2-4) or 8 (distance 5-8), ie 'said_L4'.

    Returns a Counter of feature name -> count (float); empty when nothing matched.
    """
    import re

    window = 8  # number of neighbouring tokens taken on each side of the matched token

    def group_label(distance, side):
        # the paper groups neighbours by how close they are to the matched disease
        if distance == 1:
            return '_%s1' % side
        if distance <= 4:
            return '_%s4' % side
        return '_%s8' % side

    sentence = ' '.join(tokens)

    # character spans of every occurrence of every study phrase; finditer (rather than search)
    # so that a later whole-token occurrence is still found when the first hit falls inside
    # another word
    spans = [match.span() for regex in study_phrases for match in re.finditer(regex, sentence)]

    features = Counter() # a dict of our features
    if not spans:
        return features

    token_start = 0 # where the current token starts in the sentence
    for idx, token in enumerate(tokens):
        token_end = token_start + len(token) - 1
        if any(token_start >= start and token_end < end for start, end in spans):
            # token inside a match: add the preceding tokens, nearest first.
            # idx - window - 1 is the exclusive bound, so exactly `window` tokens are taken
            for n in range(idx - 1, max(-1, idx - window - 1), -1):
                features[tokens[n] + group_label(idx - n, 'L')] += 1.0

            # add the subsequent tokens, nearest first
            for n in range(idx + 1, min(len(tokens), idx + window + 1)):
                features[tokens[n] + group_label(n - idx, 'R')] += 1.0

            # we only perform feature extraction on the first token to be matched
            break

        token_start += len(token) + 1 # skip the token and the joining space
    return features

# regex features = (regex, feature vector, feature type) feature_vector is ignored by uth-ccb

def extract_ConText_features(regex_features, tokens):
    """Count, per ConText feature type, how many of its regexes match the sentence.

    regex_features : iterable of (regex, feature vector, feature type); the vector is ignored
    tokens         : list of token strings for one sentence

    The uth-ccb paper is underspecified here; all it says is
    "3) Lexicon features, including word lists for negation, pseudo-negation, conjunction,
    condition, uncertainty, subject, severity, and course".
    So, similar to negation.py, we mark the presence of each ConText class: every regex that
    matches anywhere in the re-joined sentence adds 1.0 to its feature type.

    Returns a Counter of feature type -> number of matching regexes (float).
    """
    import re

    sentence = ' '.join(tokens)
    features = Counter()

    for regex, _ignored_vector, feature_type in regex_features:
        if re.search(regex, sentence):
            features[feature_type] += 1.0
    return features

# compile the regexes once up front, otherwise matching is very slow
import re
compiled_study_phrases={}
for studyid, regexes in negation_data.get_study_phrases().items():
    # store a flat list of compiled patterns per study. Wrapping map(...) in a list literal
    # produced a one-element list holding the map object itself, so callers iterating the
    # value never received a pattern with a .search method.
    compiled_study_phrases[studyid]=[re.compile(regex) for regex in regexes]

def extract_dependency_features(studyid, tokens):
    """Emit one feature per dependency relation whose governor or dependent matches a study phrase.

    studyid : key into the module-level compiled_study_phrases mapping
    tokens  : list of token strings for one sentence

    the paper is a bit underspecified, here is what it says:
    4). Dependency relation features. We used the Stanford Parser to generate dependency
    relations of a sentence. We only counted dependency relations where a target disorder is
    the governor or the dependent in the relation. We extracted all these syntactic relations
    as features.
    The most sensible thing seems to be:
      match the glosses for governor & dependencies to the study regex, if matched add a
      feature uniquely identifying the glosses & their relationship

    Only the basic dependencies of the first sentence in the CoreNLP output are used.

    Returns a Counter of 'governor_<gov>_->_<dep>' / 'dependent_<gov>_->_<dep>' -> count.
    Best effort: if anything goes wrong, whatever was collected so far (possibly nothing)
    is returned.
    """
    features=Counter()

    # just ignore any parsing errors, some sentences seem to be unparsable, and an error state in the output.
    # Exception (rather than a bare except) so that interrupting the kernel still stops the run.
    try:
        # uses stanford corenlp
        sentence = ' '.join(tokens)
        output = corenlp.annotate(sentence, properties={'annotators':'depparse','outputFormat':'json'})
        dependencies = output['sentences'][0]['basicDependencies']

        for reg in compiled_study_phrases[studyid]:
            for dependency in dependencies:
                # glosses are padded with spaces before matching
                if reg.search(' %s ' % dependency['governorGloss']):
                    # disease is a governor
                    features['governor_%s_->_%s' % (dependency['governorGloss'],dependency['dependentGloss'])]+=1
                if reg.search(' %s ' % dependency['dependentGloss']):
                    # disease is a dependent
                    features['dependent_%s_->_%s' % (dependency['governorGloss'],dependency['dependentGloss'])]+=1
    except Exception:
        return features
    return features

def single_sentence_file_generator(file='negation_detection_sentences_train.txt'):
    """Yield (index, row) for every row of one sentence file.

    The file is read as a comma-separated CSV with a header row from
    negation_data.data_processed_folder.
    """
    sentences = pd.read_csv(negation_data.data_processed_folder + '/' + file, sep=',',header=0)
    yield from sentences.iterrows()

def two_sentence_file_generator(file_1='negation_detection_sentences_train.txt',file_2='negation_detection_sentences_train.txt'):
    """Yield (index, row) for every row of file_1, then for every row of file_2.

    Both files are read as comma-separated CSVs with a header row from
    negation_data.data_processed_folder. The second file is only read once the first has
    been exhausted. The yielded index is each file's own DataFrame index.
    """
    for filename in (file_1, file_2):
        sentences = pd.read_csv(negation_data.data_processed_folder + '/' + filename, sep=',',header=0)
        yield from sentences.iterrows()
        

def sentences_to_uth_ccb(word2vec_model, sentence_generator,feature_vectoriser=None):
    """Turn a stream of sentence rows into a sparse UTH-CCB feature matrix.

    word2vec_model     : only used for membership tests; tokens not in it are dropped
    sentence_generator : iterable of (index, row) where row unpacks into the 9 columns
                         StudyId, PatientID, NoteId, CaseLabel, Sublabel, SentenceLabel,
                         DiagnosisDate, SourceNoteRecordedDate, Sentence
    feature_vectoriser : a fitted DictVectorizer to reuse (dev/test), or None to fit a new
                         one on these sentences (train)

    Returns (x, y, feature_vectoriser, study_indices):
      x                  : vectorised features, one row per sentence in generator order
      y                  : list of SentenceLabel values, aligned with the rows of x
      feature_vectoriser : the vectoriser that produced x
      study_indices      : dict of StudyId (str) -> list of row positions in x
    """
    # Prepare for ConText features (the second returned value is not needed here)
    regex_features, _ = negation_data.load_context_feature_definitions()

    # cache the study phrases
    study_phrases = negation_data.get_study_phrases()

    # track corespondence between studies  & sentences : studyid-> list of row positions
    study_indices={}

    feature_multisets=list()
    y=list()
    n=0
    for _, row in sentence_generator:
        StudyId,PatientID,NoteId,CaseLabel,Sublabel,SentenceLabel,DiagnosisDate,SourceNoteRecordedDate,Sentence = row
        n=n+1
        if n % 1000 == 0:
            print('finished %s sentences' % n)

        StudyId = str(StudyId) # lol python typing.  It seems to be typed as an integer from the file, and a string in the map.

        # record the row position in x rather than the generator's index: a generator that
        # chains several files restarts its index for each file, which would make the
        # recorded indices ambiguous
        study_indices.setdefault(StudyId, []).append(n - 1)

        # create input vector, drop tokens which are out of vocab
        tokens = [token for token in negation_data.tokenize(Sentence) if token in word2vec_model]

        # combine all features extracted from tokens.
        # extract_dependency_features looks its regexes up by study id, so it is given the
        # id itself rather than the phrase list
        combined_features = (extract_dependency_features(StudyId, tokens)
                             + extract_unigrams(StudyId, tokens)
                             + extract_bigrams(StudyId, tokens)
                             + extract_group_features(study_phrases[StudyId], StudyId, tokens)
                             + extract_ConText_features(regex_features, tokens))

        # add the training example and loop
        feature_multisets.append(combined_features)
        y.append(SentenceLabel)

    if feature_vectoriser is None:
        # no vectoriser supplied: learn the feature vocabulary from these sentences
        feature_vectoriser = DictVectorizer()
        x = feature_vectoriser.fit_transform(feature_multisets)
    else:
        # reuse the supplied vocabulary so columns line up with the data it was fitted on
        x = feature_vectoriser.transform(feature_multisets)
    return (x,y,feature_vectoriser,study_indices)
    
def hyperparameter_search(x_train,y_train,x_dev,y_dev,C=0.000001):
    """Fit a LinearSVC with the given C, print its AUC on the dev set and return the model.

    The AUC is computed by negation_data.auc from the dev labels and the model's
    decision-function scores on x_dev.
    """
    from sklearn.svm import LinearSVC

    classifier = LinearSVC(C=C)
    classifier.fit(x_train,y_train)
    dev_scores = classifier.decision_function(x_dev)
    print('C=%s, AUC=%s' % (C, negation_data.auc(y_dev,dev_scores)))
    return classifier

# this combines primary and referral into 1 dataset
def run_uth_ccb_primary_and_referral_combined():
    """Build scaled train/dev/test feature matrices from experiments 8 and 9 combined.

    For each of train/dev/test the experiment-8 file is streamed first, then the
    experiment-9 file. The vectoriser is fitted on train and reused for dev and test; a
    StandardScaler (with_mean=False) is fitted on train and applied to all three.
    No classifier is trained here.

    Returns (word2vec_model, x_train, y_train, x_dev, y_dev, x_test, y_test,
             feature_vectoriser_train, study_indices_train, study_indices_dev,
             study_indices_test).
    """
    from sklearn import preprocessing

    word2vec_model = negation_data.load_word2vec_sg() # used to ensure tokens are the same across both experiments

    filename_template = 'negation_detection_sentences_experiment_%s_%s.txt'

    def get_generator(dataset='train'):
        return two_sentence_file_generator(file_1=filename_template % (8,dataset),
                                           file_2=filename_template % (9,dataset))

    # gen features & data sets
    x_train, y_train, feature_vectoriser_train, study_indices_train = sentences_to_uth_ccb(
        word2vec_model, get_generator('train'))
    x_dev, y_dev, _, study_indices_dev = sentences_to_uth_ccb(
        word2vec_model, get_generator('dev'), feature_vectoriser=feature_vectoriser_train)
    x_test, y_test, _, study_indices_test = sentences_to_uth_ccb(
        word2vec_model, get_generator('test'), feature_vectoriser=feature_vectoriser_train)

    # scale features for SVM; with_mean=False because the matrices are sparse
    scaler = preprocessing.StandardScaler(with_mean=False).partial_fit(x_train)
    x_train, x_dev, x_test = (scaler.transform(matrix) for matrix in (x_train, x_dev, x_test))

    return (word2vec_model,x_train,y_train,x_dev,y_dev,x_test,y_test,feature_vectoriser_train, study_indices_train,study_indices_dev,study_indices_test)

def run_uth_ccb(experiment=9):
    """Build scaled train/dev/test feature matrices for one experiment and train a LinearSVC.

    experiment : id substituted into 'negation_detection_sentences_experiment_<id>_<split>.txt'

    The vectoriser is fitted on train and reused for dev and test; a StandardScaler
    (with_mean=False) is fitted on train and applied to all three; the SVM is fitted on
    the scaled training data.

    Returns (word2vec_model, x_train, y_train, x_dev, y_dev, x_test, y_test,
             feature_vectoriser_train, svm, scaler).
    """
    from sklearn import preprocessing
    from sklearn.svm import LinearSVC

    word2vec_model = negation_data.load_word2vec_sg() # used to ensure tokens are the same across both experiments

    def get_generator(dataset):
        # sentences_to_uth_ccb consumes a row generator, not a file name
        return single_sentence_file_generator(
            file='negation_detection_sentences_experiment_%s_%s.txt' % (experiment, dataset))

    # gen features & data sets
    (x_train,y_train,feature_vectoriser_train,_) = sentences_to_uth_ccb(word2vec_model,get_generator('train'))
    (x_dev,y_dev,_,_) = sentences_to_uth_ccb(word2vec_model,get_generator('dev'),feature_vectoriser=feature_vectoriser_train)
    (x_test,y_test,_,_) = sentences_to_uth_ccb(word2vec_model,get_generator('test'),feature_vectoriser=feature_vectoriser_train)

    # scale features for SVM; with_mean=False because the matrices are sparse
    scaler = preprocessing.StandardScaler(with_mean=False).partial_fit(x_train)
    x_train = scaler.transform(x_train)
    x_dev = scaler.transform(x_dev)
    x_test=scaler.transform(x_test)

    # train
    svm = LinearSVC(C=0.000001) # best results on dev set obtained with this value of C
    svm.fit(x_train,y_train)

    return (word2vec_model,x_train,y_train,x_dev,y_dev,x_test,y_test,feature_vectoriser_train, svm,scaler)


In [5]:
import gensim
import numpy as np

# single-experiment variant (its tuple carries svm and scaler instead of the study index dictionaries):
#(_,x_train,y_train,x_dev,y_dev,x_test,y_test,feature_vectoriser_train, svm,scaler) = run_uth_ccb(experiment=experiment)
# active run: experiments 8 and 9 combined; the returned word2vec model is discarded
(_,x_train,y_train,x_dev,y_dev,x_test,y_test,feature_vectoriser_train, study_indices_train,study_indices_dev,study_indices_test) = run_uth_ccb_primary_and_referral_combined()


  """


finished 1000 sentences
finished 2000 sentences
finished 3000 sentences
finished 4000 sentences
finished 5000 sentences
finished 6000 sentences
finished 7000 sentences
finished 8000 sentences
finished 9000 sentences
finished 10000 sentences
finished 11000 sentences
finished 12000 sentences
finished 13000 sentences
finished 14000 sentences
finished 15000 sentences
finished 16000 sentences
finished 17000 sentences
finished 18000 sentences
finished 19000 sentences
finished 20000 sentences
finished 21000 sentences
finished 22000 sentences
finished 23000 sentences
finished 24000 sentences
finished 25000 sentences
finished 26000 sentences
finished 27000 sentences
finished 28000 sentences
finished 29000 sentences
finished 30000 sentences
finished 31000 sentences
finished 32000 sentences
finished 33000 sentences
finished 34000 sentences
finished 35000 sentences
finished 36000 sentences
finished 37000 sentences
finished 38000 sentences
finished 39000 sentences
finished 40000 sentences
finished 

finished 321000 sentences
finished 322000 sentences
finished 323000 sentences
finished 324000 sentences
finished 325000 sentences
finished 326000 sentences
finished 327000 sentences
finished 328000 sentences
finished 329000 sentences
finished 330000 sentences
finished 331000 sentences
finished 332000 sentences
finished 333000 sentences
finished 334000 sentences
finished 335000 sentences
finished 336000 sentences
finished 337000 sentences
finished 338000 sentences
finished 339000 sentences
finished 340000 sentences
finished 341000 sentences
finished 342000 sentences
finished 343000 sentences
finished 344000 sentences
finished 345000 sentences
finished 346000 sentences
finished 347000 sentences
finished 348000 sentences
finished 349000 sentences
finished 350000 sentences
finished 351000 sentences
finished 352000 sentences
finished 353000 sentences
finished 354000 sentences
finished 355000 sentences
finished 356000 sentences
finished 357000 sentences
finished 358000 sentences
finished 359

finished 57000 sentences
finished 58000 sentences
finished 59000 sentences
finished 60000 sentences
finished 61000 sentences
finished 62000 sentences
finished 63000 sentences
finished 64000 sentences
finished 65000 sentences


In [6]:
# persist the x datasets (y datasets are the same across models, so they are saved by the CNN scripts)
# we are using sparse matrices here, so need to save in a special way

#https://stackoverflow.com/questions/8955448/save-load-scipy-sparse-csr-matrix-in-portable-data-format

import numpy as np

def save_sparse_csr(filename, array):
    """Write the data, indices, indptr and shape of a CSR matrix to a numpy .npz archive.

    Note that np.savez appends '.npz' to filename when it does not already end with it.
    """
    components = {
        'data': array.data,
        'indices': array.indices,
        'indptr': array.indptr,
        'shape': array.shape,
    }
    np.savez(filename, **components)

def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an archive holding 'data', 'indices', 'indptr' and 'shape'.

    filename is passed to np.load unchanged. The archive is closed before returning.
    """
    # np.load returns a lazily-read archive for .npz files; the context manager makes sure
    # the underlying file handle is released once the arrays have been read
    with np.load(filename) as loader:
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=tuple(int(dim) for dim in loader['shape']))


def save_x(x,dataset='train',experiment=8):
    """Persist a sparse feature matrix under data/datasets, named by dataset split and experiment."""
    target = 'data/datasets/x_%s_uth_exp%s.npy' % (dataset,experiment)
    save_sparse_csr(target, x)
 
def save_study_sentence_dictionary(study_sentence_dictionary, dataset='train',experiment=8):
    """Write the study -> sentence indices mapping as JSON under data/datasets.

    The file name is 'study_sentence_dictionary_UTH_<dataset>_exp_<experiment>.json'.
    """
    # imported here so the function does not depend on a later cell having imported json
    import json

    with open('data/datasets/study_sentence_dictionary_UTH_%s_exp_%s.json' % (dataset, experiment), 'w') as file:
        json.dump(study_sentence_dictionary, file)

In [7]:


#save_x(x_train,dataset='train',experiment=experiment)
#save_x(x_dev,dataset='dev',experiment=experiment)
#save_x(x_test,dataset='test',experiment=experiment)

In [8]:
import json

# label used in the output file names for the combined primary + referral datasets
experiment = 'pri_ref'

# feature matrices, one file per split
save_x(x_train,dataset='train',experiment=experiment)
save_x(x_dev,dataset='dev',experiment=experiment)
save_x(x_test,dataset='test',experiment=experiment)

# study -> sentence indices mappings, one JSON file per split
save_study_sentence_dictionary(study_indices_train,dataset='train',experiment=experiment)
save_study_sentence_dictionary(study_indices_dev,dataset='dev',experiment=experiment)
save_study_sentence_dictionary(study_indices_test,dataset='test',experiment=experiment)


In [None]:
# AUC of the SVM's decision scores on the dev set.
# NOTE(review): `svm` is only returned by run_uth_ccb(...), whose call above is commented out;
# the combined pipeline does not define it, so this cell needs a trained model first
# (hyperparameter_search returns one) - confirm before running.
y_pred = svm.decision_function(x_dev)
negation_data.auc(y_dev,y_pred)