In [36]:
# imports
import os
from dotenv import load_dotenv
import regex
import string
import pandas as pd
from typing import List, Union
import nltk
nltk.download('all')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\minou\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\minou\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\minou\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\minou\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\minou\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping grammars\basque_grammars.zip.
[nltk_data]   

In [37]:
# Read settings from the .env file; override=True is passed so that values
# from the file win over variables already present in the environment.
load_dotenv(override=True)

# Path of the queries file, taken from QUERIES_PATH (None when it is unset).
queries_path = os.environ.get("QUERIES_PATH")

In [38]:
"""
adapted from chemdataextractor.text.normalize
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Tools for normalizing text.
https://github.com/mcs07/ChemDataExtractor
:copyright: Copyright 2016 by Matt Swain.
:license: MIT

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
'Software'), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""

#: Control characters.
# `normalize` deletes every character in this set from its input.
# NOTE(review): '\u0000' and '\u0010' are not listed here -- confirm whether
# that omission is intentional.
CONTROLS = {
    '\u0001', '\u0002', '\u0003', '\u0004', '\u0005', '\u0006', '\u0007', '\u0008', '\u000e', '\u000f', '\u0011',
    '\u0012', '\u0013', '\u0014', '\u0015', '\u0016', '\u0017', '\u0018', '\u0019', '\u001a', '\u001b',
}
# There are further control characters, but they are instead replaced with a space by unicode normalization
# '\u0009', '\u000a', '\u000b', '\u000c', '\u000d', '\u001c',  '\u001d', '\u001e', '\u001f'
# Of these, `normalize` itself only rewrites '\u000b' and '\u000c' (to a space).


#: Hyphen and dash characters.
# `normalize` rewrites every member of HYPHENS | MINUSES to the ASCII '-'.
HYPHENS = {
    '-',  # \u002d Hyphen-minus
    '‐',  # \u2010 Hyphen
    '‑',  # \u2011 Non-breaking hyphen
    '⁃',  # \u2043 Hyphen bullet
    '‒',  # \u2012 figure dash
    '–',  # \u2013 en dash
    '—',  # \u2014 em dash
    '―',  # \u2015 horizontal bar
}

#: Minus characters.
# Shares the ASCII hyphen-minus with HYPHENS; the two sets are unioned in `normalize`.
MINUSES = {
    '-',  # \u002d Hyphen-minus
    '−',  # \u2212 Minus
    '－',  # \uff0d Full-width Hyphen-minus
    '⁻',  # \u207b Superscript minus
}

#: Plus characters.
# Not referenced by `normalize`.
PLUSES = {
    '+',  # \u002b Plus
    '＋',  # \uff0b Full-width Plus
    '⁺',  # \u207a Superscript plus
}

#: Slash characters.
# `normalize` rewrites every member to the ASCII '/'.
SLASHES = {
    '/',  # \u002f Solidus
    '⁄',  # \u2044 Fraction slash
    '∕',  # \u2215 Division slash
}

#: Tilde characters.
# `normalize` rewrites every member to the ASCII '~'.
TILDES = {
    '~',  # \u007e Tilde
    '˜',  # \u02dc Small tilde
    '⁓',  # \u2053 Swung dash
    '∼',  # \u223c Tilde operator #in mbert vocab
    '∽',  # \u223d Reversed tilde
    '∿',  # \u223f Sine wave
    '〜',  # \u301c Wave dash #in mbert vocab
    '～',  # \uff5e Full-width tilde #in mbert vocab
}

#: Apostrophe characters.
# Together with SINGLE_QUOTES and ACCENTS, every member is rewritten to the
# ASCII "'" by `normalize`. Overlaps SINGLE_QUOTES on \u0027 and \u2019.
APOSTROPHES = {
    "'",  # \u0027
    '’',  # \u2019
    '՚',  # \u055a
    'Ꞌ',  # \ua78b
    'ꞌ',  # \ua78c
    '＇',  # \uff07
}

#: Single quote characters.
# Rewritten to the ASCII "'" by `normalize`.
SINGLE_QUOTES = {
    "'",  # \u0027
    '‘',  # \u2018
    '’',  # \u2019
    '‚',  # \u201a
    '‛',  # \u201b

}

#: Double quote characters.
# `normalize` rewrites every member to the ASCII '"'.
DOUBLE_QUOTES = {
    '"',  # \u0022
    '“',  # \u201c
    '”',  # \u201d
    '„',  # \u201e
    '‟',  # \u201f
}

#: Accent characters.
# Rewritten to the ASCII "'" by `normalize`.
ACCENTS = {
    '`',  # \u0060
    '´',  # \u00b4
}

#: Prime characters.
# `normalize` does not iterate this set; it expands each prime individually
# into one to four ASCII apostrophes.
PRIMES = {
    '′',  # \u2032
    '″',  # \u2033
    '‴',  # \u2034
    '‵',  # \u2035
    '‶',  # \u2036
    '‷',  # \u2037
    '⁗',  # \u2057
}

#: Quote characters, including apostrophes, single quotes, double quotes, accents and primes.
# Union of the five sets above. Not referenced by `normalize`.
QUOTES = APOSTROPHES | SINGLE_QUOTES | DOUBLE_QUOTES | ACCENTS | PRIMES

def normalize(text):
    """
    Replaces typographic character variants in the text with plain equivalents.

    Control characters and soft hyphens are dropped; vertical tab, form feed
    and NEL become a space; all hyphen/minus, quote, prime, slash and tilde
    variants collapse to their ASCII counterpart; the ellipsis character and
    the spaced-out ' . . . ' form become '...'.
    """
    # Ordered (sources, target) pairs; they are applied top to bottom, one
    # str.replace per source string.
    substitutions = (
        (CONTROLS, ''),
        (('\u000b', '\u000c', '\u0085'), ' '),
        (HYPHENS | MINUSES, '-'),
        (('\u00ad',), ''),                                # soft hyphen
        (DOUBLE_QUOTES, '"'),                             # \u0022
        (SINGLE_QUOTES | APOSTROPHES | ACCENTS, "'"),     # \u0027
        (('′', '‵'), "'"),       # \u2032 prime, \u2035 reversed prime
        (('″', '‶'), "''"),      # \u2033 double prime, \u2036 reversed double prime
        (('‴', '‷'), "'''"),     # \u2034 triple prime, \u2037 reversed triple prime
        (('⁗',), "''''"),        # \u2057 quadruple prime
        (('…',), '...'),         # \u2026
        ((' . . . ',), ' ... '),
        (SLASHES, '/'),
        (TILDES, '~'),
    )

    result = text
    for sources, target in substitutions:
        for source in sources:
            result = result.replace(source, target)
    return result


In [39]:
# Normalization adapted from SQuAD evaluation script https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
def remove_articles(text: str) -> str:
    """
    Replaces each standalone lowercase article ('a', 'an', 'the') with a single space.

    Matching is case-sensitive and bounded by word boundaries, so 'The' and
    the 'an' inside 'banana' are left alone.
    """
    article_pattern = regex.compile(r'\b(a|an|the)\b')
    return article_pattern.sub(' ', text)

def white_space_fix(text: str) -> str:
    """
    Collapses every run of whitespace into one space and trims both ends.
    """
    tokens = text.split()
    return ' '.join(tokens)

def remove_punc(text: str) -> str:
    """
    Swaps every ASCII punctuation character (string.punctuation) for a space.

    Characters outside string.punctuation, including non-ASCII punctuation,
    pass through unchanged.
    """
    return ''.join(' ' if char in string.punctuation else char for char in text)

def lower(text: str) -> str:
    """
    Returns a lowercase copy of the text.
    """
    lowered = text.lower()
    return lowered

def normalize_answer(s: str, lowercase: bool = True) -> str:
    """
    Produces a canonical form of an answer string for comparison.

    Steps, in order: optional lowercasing, character normalization
    (`normalize`), punctuation removal, article removal, whitespace collapse.
    Articles are matched in lowercase only, so with lowercase=False a
    capitalised 'The' is kept.
    """
    cleaned = lower(s) if lowercase else s
    cleaned = normalize(cleaned)
    cleaned = remove_punc(cleaned)
    cleaned = remove_articles(cleaned)
    return white_space_fix(cleaned)

In [67]:
def retrieve_hard_negative_context(context: List[List[Union[List[str],str]]], evidence: List[List[str]], question: str, k: int = 2) -> List[list]:
    """
    Selects the k contexts most similar to the question that do NOT support any evidence triple.

    A context is considered relevant (and therefore skipped) when, for at least
    one evidence triple, both the first and the last element of the triple occur
    as substrings of the normalized "sentences + topic" text of that context.
    Every remaining context is scored against the normalized question with
    `cosine_similarity`, and the k highest-scoring ones are returned.

    Args:
        context: Iterable of [topic, contexts] pairs, where `contexts` is a list
            of strings belonging to that topic.
        evidence: Iterable of three-element items; the middle element (the
            relationship type) is ignored.
        question: The question the hard negatives are ranked against.
        k: Maximum number of hard negatives to return.

    Returns:
        A list of at most k entries of the form [topic, [contexts]] (note the
        extra list around `contexts`), ordered by descending score. Entries
        with equal scores keep the order in which they appear in `context`.

    Side effects:
        Prints the question and, for every non-relevant context, its normalized
        length, score and normalized text.
    """
    # Keep only the two entity strings of each triple; the relationship type is dropped.
    normalized_evidence_pairs = [
        (normalize_answer(first, lowercase=True), normalize_answer(second, lowercase=True))
        for first, _, second in evidence
    ]
    normalized_question = normalize_answer(question, lowercase=True)
    print(question)

    scored_candidates = []
    for topic, contexts in context:
        normalized_parts = [normalize_answer(sentence, lowercase=True) for sentence in contexts]
        normalized_parts.append(normalize_answer(topic, lowercase=True))
        # Concatenate the sentences and the topic into a single searchable string.
        normalized_context_string = ' '.join(normalized_parts)

        context_relevant = any(
            first in normalized_context_string and second in normalized_context_string
            for first, second in normalized_evidence_pairs
        )
        if context_relevant:
            continue

        score = cosine_similarity(normalized_question, normalized_context_string)
        scored_candidates.append((score, [topic, [contexts]]))
        print(len(normalized_context_string), score, normalized_context_string)

    # Rank on the score alone. Sorting the (score, payload) tuples directly would
    # fall back to comparing the payload lists whenever two scores tie.
    scored_candidates.sort(key=lambda scored: scored[0], reverse=True)
    return [candidate for _, candidate in scored_candidates[:k]]


# implementation adapted from: https://www.geeksforgeeks.org/python-measure-similarity-between-two-sentences-using-cosine-similarity/
def cosine_similarity(X: str, Y: str) -> float:
    """
    Scores the lexical overlap between two strings.

    Both strings are tokenized with `word_tokenize`, English stop words are
    removed, and each string is reduced to its set of distinct tokens.

    NOTE(review): despite the name, the returned value is NOT a pure cosine
    similarity. It is ``overlap + cosine``, where ``overlap`` is the number of
    shared tokens (the dot product of the binary token vectors) and ``cosine``
    is ``overlap / sqrt(|X_set| * |Y_set|)``. This ranks primarily by shared
    token count and uses the cosine as a tie-breaker; the result can exceed 1.
    Confirm that this is the intended ranking score.

    Args:
        X: First string.
        Y: Second string.

    Returns:
        ``overlap + cosine`` as a float, or 0.0 when either string has no
        tokens left after stop-word removal (this case used to raise
        ZeroDivisionError).
    """
    # A set makes each stop-word membership test O(1) instead of scanning the list.
    stop_words = set(stopwords.words('english'))

    X_set = {token for token in word_tokenize(X) if token not in stop_words}
    Y_set = {token for token in word_tokenize(Y) if token not in stop_words}

    # Nothing to compare: the overlap is necessarily zero and the cosine is undefined.
    if not X_set or not Y_set:
        return 0.0

    # For binary token vectors the dot product is the size of the intersection
    # and each vector's squared norm is the size of its token set.
    overlap = len(X_set & Y_set)
    cosine = overlap / float((len(X_set) * len(Y_set)) ** 0.5)
    return overlap + cosine

In [41]:
def retrieve_golden_context(context, evidence: List[List[str]]) -> List[list]:
    """
    Collects the contexts that support at least one evidence triple.

    A context counts as golden when, for at least one evidence triple, both the
    first and the last element of the triple occur as substrings of the
    normalized "sentences + topic" text of that context.

    Args:
        context: Iterable of [topic, contexts] pairs, where `contexts` is a list
            of strings belonging to that topic.
        evidence: Iterable of three-element items; the middle element (the
            relationship type) is ignored.

    Returns:
        A list of [topic, [contexts]] entries (note the extra list around
        `contexts`), in the order they appear in `context`. Each context is
        added at most once.
    """
    # Keep only the two entity strings of each triple; the relationship type is dropped.
    normalized_evidence_pairs = [
        (normalize_answer(first, lowercase=True), normalize_answer(second, lowercase=True))
        for first, _, second in evidence
    ]

    golden_context = []
    for topic, contexts in context:
        normalized_parts = [normalize_answer(sentence, lowercase=True) for sentence in contexts]
        normalized_parts.append(normalize_answer(topic, lowercase=True))
        # Concatenate the sentences and the topic into a single searchable string.
        context_string = ' '.join(normalized_parts)

        supports_evidence = any(
            first in context_string and second in context_string
            for first, second in normalized_evidence_pairs
        )
        if supports_evidence:
            golden_context.append([topic, [contexts]])

    return golden_context


In [68]:
# run retrieve_hard_negative_context on dev_test.json
data = pd.read_json(queries_path)
for i,query in data.iterrows():
    context = query['context']
    evidence = query['evidences']
    question = query['question']
    hard_negative_context_k2 = retrieve_hard_negative_context(context, evidence, question, k=2)
    # The result is empty when every context of the query matches an evidence
    # pair; indexing [0] unconditionally would raise IndexError in that case.
    hard_negative_context_k1 = hard_negative_context_k2[0] if hard_negative_context_k2 else None
    #golden_context = retrieve_golden_context(context, evidence)
    print("hard_negative_context: ", hard_negative_context_k2)
    #print("golden_context: ", golden_context)


Who is the mother of the director of film Polish-Russian War (Film)?
323 1.0758098043578903 maheen khan is pakistani fashion and costume designer also award winner fashion designer for fashion labels like embroidery housemaheen and gulabo she has done many national and international fashion events and shows she undertook embroidery for film snow white and huntsman and television series jewel in crown maheen khan
200 2.174077655955698 viktor petrovich yeliseyev born june 9 1950 is russian general orchestra conductor and music teacher he is director of ministry of interior ensemble one of two russian red army choirs viktor yeliseyev
224 1.087038827977849 alice washburn 1860 1929 was american stage and film actress she worked at edison vitagraph and kalem studios her final film snow white was her only known feature film she died of heart attack in november 1929 alice washburn
53 1.1825741858350554 she was mother of prince morinaga minamoto no chikako
609 1.0560772154092044 snow white chri