In [1]:
import spacy

In [2]:
!spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')




In [3]:
nlp = spacy.load("en_core_web_sm")

In [4]:
import pandas as pd
data = pd.read_csv('data/ner_tagged.tsv', sep="\t")

In [5]:
data.rename(columns = {'O':'Tags','-DOCSTART-':'Tokens','-X-':'X'}, inplace = True)

In [6]:
for i, row in data.reset_index().iterrows():
    if type(row.Tokens) == str:
        if row.Tokens.lower().startswith("un") and "-N-" in row.Tags and row.Tokens != "undermounted":
            data.at[i,'Tags'] = row.Tags.replace("N-", "")
    if type(row.Tokens) == str:
        if row.Tokens.lower().endswith("less") or row.Tokens.lower().endswith("less.") and row.Tokens.lower() not in ["screw", "less"] and "-N-" in row.Tags:
            data.at[i,'Tags'] = row.Tags.replace("N-", "")

In [7]:
import numpy as np
is_negative = []
for i, row in data.reset_index().iterrows():
    if type(row.Tokens) == str:
        if "-N-" in row.Tags:
            is_negative.append(True)
            data.at[i, "Tags"] = row.Tags.replace("N-", "")
        else:
            is_negative.append(False)
    else:
        is_negative.append(np.NaN)

In [8]:
data["is_negative"] = is_negative

In [9]:
def get_triples(df):
    
    current_sent = []
    
    all_sents = []
    
    for i, row in df.reset_index().iterrows():
        if type(row.Tags) == str:
            current_sent.append((row.Tokens, row.Tags, row.is_negative))
        else:
            all_sents.append(current_sent)
            current_sent = []
            
    return(all_sents)

In [10]:
all_sents = get_triples(data)

neg_sents = []

for sent in all_sents:
    for word, iob, neg in sent:
        if neg == True:
            neg_sents.append(sent)
            break

In [11]:
def transform_sentence(sent, for_evaluation=False):
    
    list_counter = 0

    final_list = []

    current_list = []

    entity_indices = []
    
    pos_indices = []
    
    neg_indices = []
    
    pos_ent_names = []
    
    neg_ent_names = []

    for i in range(1, len(sent)):
        first_word = sent[i-1][0]
        first_iob = sent[i-1][1][0]
        second_word = sent[i][0]
        second_iob = sent[i][1][0]
        if for_evaluation:
            first_neg = sent[i-1][2]
            second_neg = sent[i][2]

        if len(current_list) == 0:
            current_list.append(first_word)

        if (first_iob == second_iob or second_iob == "I") and second_iob != "B":
            current_list.append(second_word)

        else:
            if first_iob != "O":
                entity_indices.append(list_counter)
                if for_evaluation:
                    if first_neg == True:
                        neg_indices.append(list_counter)
                    else:
                        pos_indices.append(list_counter)

            final_list.append(current_list)
            current_list = []
            list_counter += 1

            if i == len(sent)-1:
                final_list.append([second_word])
                if second_iob == "B":
                    entity_indices.append(list_counter)
                    if for_evaluation:
                        if second_neg == True:
                            neg_indices.append(list_counter)
                        else:
                            pos_indices.append(list_counter)

    if current_list != [] and second_iob == "I":
        final_list.append(current_list)
        entity_indices.append(list_counter)
        if for_evaluation:
            if second_neg == True:
                neg_indices.append(list_counter)
            else:
                pos_indices.append(list_counter)
        
    if for_evaluation:
        for ent_idx in pos_indices:
            for i, phrase in enumerate(final_list):
                if i == ent_idx:
                    pos_ent_names.append(" ".join(word for word in final_list[i]))

        for ent_idx in neg_indices:
            for i, phrase in enumerate(final_list):
                if i == ent_idx:
                    neg_ent_names.append(" ".join(word for word in final_list[i]))
                    
        return pos_ent_names, neg_ent_names
    
    else:
        return final_list, entity_indices

In [12]:
transform_sentence(neg_sents[3], for_evaluation=True)

(['soft', 'Open face', 'cardigan'], ['buttons.'])

In [13]:
def prepare_gold_data(sent_list, output_style="tags", word_of_interest=None):
    
    if word_of_interest:
        active_list = []
        for sent in sent_list:
            for word, iob, tag in sent:
                if word.lower() == word_of_interest:
                    active_list.append(sent)
                    break
    
    else:
        active_list = sent_list
    
    if output_style == "tags":
        outputs = []
        for sent in active_list:
            out = []
            for word, iob, neg in sent:
                out.append(neg)
            outputs.append(out)
        return outputs
    
    elif output_style == "entities":
        pos_ents = []
        neg_ents = []
        
        for sent in active_list:
            sent_pos_ents, sent_neg_ents = transform_sentence(sent, for_evaluation=True)
            pos_ents.append(sent_pos_ents)
            neg_ents.append(sent_neg_ents)
            
        return pos_ents, neg_ents

In [14]:
print(prepare_gold_data(neg_sents, output_style="entities")[0][:10])
print(prepare_gold_data(neg_sents, output_style="entities")[1][:10])

[['pre workout Pump addict'], ['free standing', 'tubs', 'matte', 'white'], ['throw blankets.', 'Boucle wool-like', 'throw', 'soft.', 'wool-like', 'soft', 'blanket'], ['soft', 'Open face', 'cardigan'], ['oval', 'tubs', 'matte', 'white', 'higher prices.'], ['vanity tops', 'wall mount', 'sink'], ['60" wide', 'tub', 'soaker type', 'deeper'], ['pure white', 'alcove', 'bathtub,', 'American standard'], ['navy', 'mirror', '60x31.', 'navy.'], ['floating', 'vanity', 'integrated sink', 'counter top', 'faucet', 'faucets', 'wall mounted.']]
[['Karbolyn Hydrate'], ['glossy'], ['scratchy.'], ['buttons.'], ['gloss?'], ['faucet holes?'], ['regular tub'], ['off whites.'], ['gold', 'black'], ['drill holes']]


In [15]:
def evaluate(preds, gold, input_style="tags"):
    
    total = 0
    true_pos = 0
    false_pos = 0
    true_neg = 0
    false_neg = 0
    
    if input_style == "tags":
        for i, pred in enumerate(preds):
            for j, tag in enumerate(pred):
                total += 1
                if tag == True and gold[i][j] == True:
                    true_pos += 1
                elif tag == True and gold[i][j] == False:
                    false_pos += 1
                elif tag == False and gold[i][j] == True:
                    false_neg += 1
                elif tag == False and gold[i][j] == False:
                    true_neg += 1
                
    if input_style == "entities":
        pred_pos_ents = preds[0]
        pred_neg_ents = preds[1]
        gold_pos_ents = gold[0]
        gold_neg_ents = gold[1]
        
        for i in range(len(pred_pos_ents)):
            for ent in pred_pos_ents[i]:
                total += 1
                if ent in gold_pos_ents[i]:
                    true_pos += 1
                    gold_pos_ents[i].remove(ent)
                else:
                    false_pos += 1
            for ent in pred_neg_ents[i]:
                total += 1
                if ent in gold_pos_ents[i]:
                    false_neg += 1
                    gold_neg_ents[i].remove(ent)
                else:
                    true_neg += 1
                    
    correct = true_pos + true_neg
    accuracy = correct / total
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    f1 = 2 * (precision * recall) / (precision + recall)
    
    print("Evaluation config:", input_style)
    print("Accuracy:", "{:.0%}".format(accuracy))
    print("Precision:", '{:.0%}'.format(precision))
    print("Recall:", '{:.0%}'.format(recall))
    print("F1:", '{:.0%}'.format(f1))

In [16]:
import regex as re

def get_spacy_tokens(transformed_sent, entity_indices):
    """Maps entity indices to indices in a spacy object."""
    
    ent_2_spacy = {}
    string_sent = " ".join([item for sublist in transformed_sent for item in sublist])
    doc = nlp(string_sent)
    
    spacy_special = ["'", '"', ":", ";", ",", "?", "!", ".", "n't", "'m"]
    
    for ent_idx in entity_indices:
        substring = " ".join([item for item in transformed_sent[ent_idx]])
        for char in spacy_special:
            if substring.endswith(char):
                substring = substring[:-len(char)]
        
        if str(doc).count(substring) == 1:
            end = str(doc).index(substring) + len(substring)
            start = end - len(substring.split()[-1])
            span = doc.char_span(start, end)[0]
            index = span.i

        elif str(doc).count(substring) > 1:
            best_guess_distance = 10000
            original = len(" ".join([item for sublist in transformed_sent[:ent_idx] for item in sublist])) + 1
            for idx in [_.start() for _ in re.finditer(substring, string_sent)]:
                if abs(idx - original) < best_guess_distance:
                    best_guess_distance = abs(idx - original)
                    best_guess = idx
            end = best_guess + len(substring)
            start = end - len(substring.split()[-1])
            span = doc.char_span(start, end)[0]
            index = span.i
        
        ent_2_spacy[ent_idx] = index
        
    return ent_2_spacy

In [17]:
def predict_one_sentence(sent, negators, output_style="tags", word_of_interest=None):
    
    if len(sent[0]) == 3:
        active_sent = []
        for word, iob, neg in sent:
            active_sent.append((word, iob))
            
    if word_of_interest == None:
        found = True
            
    else:
        found = False
        
    for word, iob in active_sent:
        if word.lower() == word_of_interest:
            found = True
            break
    
    if found:
        
        transformed_sent, entity_indices = transform_sentence(sent)

        spacy_mapping = get_spacy_tokens(transformed_sent, entity_indices)

        if output_style == "tags":
            out = [0] * len(sent)
            for negator in negators:
                neg_indices = negator(transformed_sent, spacy_mapping, output_style)
                for idx in neg_indices:
                    out[idx] += 1

            for i, tag in enumerate(out):
                if tag % 2 == 0:
                    out[i] = False
                else:
                    out[i] = True

            return out

        if output_style == "entities":
            pos_ents = []
            neg_ents = []
            for negator in negators:
                # Note that the below gives indices of entities in transformed_sent, not the indices themselves
                negator_pos_ents, negator_neg_ents = negator(transformed_sent, spacy_mapping, output_style=output_style)
                for ent_idx in negator_pos_ents:
                    if ent_idx not in pos_ents and ent_idx not in neg_ents:
                        pos_ents.append(ent_idx)
                for ent_idx in negator_neg_ents:
                    if ent_idx in pos_ents:
                        pos_ents.remove(ent_idx)
                        neg_ents.append(ent_idx)
                    elif ent_idx not in neg_ents:
                        neg_ents.append(ent_idx)
                    else:
                        neg_ents.remove(ent_idx)
                        pos_ents.append(ent_idx)

            pos_ent_names = []
            neg_ent_names = []

            for ent_idx in pos_ents:
                for i, phrase in enumerate(transformed_sent):
                    if i == ent_idx:
                        pos_ent_names.append(" ".join(word for word in transformed_sent[i]))

            for ent_idx in neg_ents:
                for i, phrase in enumerate(transformed_sent):
                    if i == ent_idx:
                        neg_ent_names.append(" ".join(word for word in transformed_sent[i]))

            return pos_ent_names, neg_ent_names


In [18]:
def predict_sent_list(sent_list, negators, output_style="tags", word_of_interest=None):
            
    if output_style == "tags":
        
        preds = []
        
        for sent in sent_list:
            sent_preds = predict_one_sentence(sent, negators, output_style, word_of_interest)
            if sent_preds is not None:
                preds.append(predict_one_sentence(sent, negators, output_style, word_of_interest))
            
        return preds
            
    if output_style == "entities":
        all_pos = []
        all_neg = []
        
        for sent in sent_list:
            preds = predict_one_sentence(sent, negators, output_style, word_of_interest)
            if preds is not None:
                pos_ents, neg_ents = preds
                all_pos.append(pos_ents)
                all_neg.append(neg_ents)
            
        return all_pos, all_neg

In [19]:
def get_negator_output(transformed_sent, entity_indices, negated_indices, output_style):
    
    if output_style == "tags":
        preds = []
        k = 0
        for i, sublist in enumerate(transformed_sent):
            if i in negated_indices:
                for j in range(k, k+len(sublist)):
                    preds.append(j)
            k += len(sublist)
                    
        return preds
    
    if output_style == "entities":
        return list(set(entity_indices) - set(negated_indices)), negated_indices
                    

In [20]:
t,e = transform_sentence(neg_sents[0])
n = [3]
s = "entities"
print(t)
print(get_negator_output(t, e, n, s))

[['Looking', 'for'], ['pre', 'workout', 'Pump', 'addict'], ['instead', 'of'], ['Karbolyn', 'Hydrate']]
([1], [3])


In [21]:
t,e = transform_sentence(neg_sents[0])
n = [3]
s = "tags"

get_negator_output(t, e, n, s)

[8, 9]

In [22]:
def instead(transformed_sent, spacy_mapping, output_style="tags"):
    
    string_sent = " ".join([item for sublist in transformed_sent for item in sublist])
    
    doc = nlp(string_sent)
    
    negated_indices = []
    
    for ent_idx, spacy_idx in spacy_mapping.items():
        i = spacy_idx
        root_hits = 0
        negated = False
        while root_hits != 2 and negated == False:  # i.e., while there is a head. In spacy, the main clause verb is its own head
            i = doc[i].head.i
            if i == doc[i].head.i:
                root_hits += 1
            if str(doc[i]) == "of" and str(doc[i-1]) == "instead":
                negated = True
                
        if negated:
            negated_indices.append(ent_idx)
            
        else:
            for i in range(6):
                if spacy_idx - i >= 0:
                    if doc[spacy_idx-i].text == "instead":
                        negated_indices.append(ent_idx)
                                                        
    return get_negator_output(transformed_sent, spacy_mapping.keys(), negated_indices, output_style)

In [23]:
trans, ents = transform_sentence(neg_sents[0])
spacy_mapping = get_spacy_tokens(trans, ents)

instead(trans, spacy_mapping, "entities")

([1], [3])

In [24]:
predict_one_sentence(neg_sents[0], [instead], "tags")


[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False]

In [25]:
instead_preds = predict_sent_list(neg_sents, [instead], "entities", word_of_interest="instead")

In [26]:
instead_preds

([['pre workout Pump addict'],
  ['free standing', 'tubs', 'matte', 'white'],
  ['sheer curtains', 'hooks']],
 [['Karbolyn Hydrate'], ['glossy'], ['rod']])

In [27]:
instead_gold = prepare_gold_data(neg_sents, "entities", word_of_interest="instead")

In [28]:
instead_gold

([['pre workout Pump addict'],
  ['free standing', 'tubs', 'matte', 'white'],
  ['sheer curtains', 'hooks']],
 [['Karbolyn Hydrate'], ['glossy'], ['rod']])

In [29]:
evaluate(instead_preds, instead_gold, "entities")

Evaluation config: entities
Accuracy: 100%
Precision: 100%
Recall: 100%
F1: 100%


In [30]:
def predict_and_evaluate(sent_list, negators, output_type="tags", word_of_interest=None):

    preds = predict_sent_list(sent_list, negators, output_type, word_of_interest)
    gold = prepare_gold_data(sent_list, output_type, word_of_interest)
    
    evaluate(preds, gold, output_type)
    

In [31]:
predict_and_evaluate(neg_sents, [instead], "entities", "instead")

Evaluation config: entities
Accuracy: 100%
Precision: 100%
Recall: 100%
F1: 100%


In [32]:
predict_and_evaluate(neg_sents, [instead], "tags", "instead")

Evaluation config: tags
Accuracy: 100%
Precision: 100%
Recall: 100%
F1: 100%


### Reasons why scope detection is not always successful (based on partial sample of sentences):
- No negator:
  - instead (3)
  - without (4)
  - comparative (1) (e.g. "deeper than a regular-True tub")
  - negation scope would include entities we don't want to be negated (2) (e.g., "counter top with no drill-True holes-True for the faucet-False")
  - can X be removed (1)
  - negative affix (1)
- Miscellaneous:
  - "I would like to know if the wool-like top side is soft as well, or if it is scratchy-True"
  - "You have it for the gold-True and black-True but i don't [sic] want it for the navy"
  - "I found a vanity top I love but it has only one-True hole-True for taps and that doesn't suit our needs"
 
 
Not sure:
- I'm looking for X, not Y or Z-True

## Ways of indicating negation (incomplete):

- "don't want (any)"
- "no"
- "no more than"
- "less than"
- "un-"
- "without"
- "only"
- "-less"
- "instead of"
- "too" (what comes before is negated, e.g. in "pink is too light", "pink" is negated)
- "not too" (what comes after is negated, e.g. in "not too light", "light" is negated)
- "the website only gives the option of"
- "without X or Y" (two separately annotated entities separated by conjunction)