In [14]:
#Install the dependencies with the %pip magic so they land in the environment of the running kernel
#(a "!pip" shell escape uses whichever pip is first on PATH, which can be a different interpreter)
%pip install pywsd
#wn is pinned to 0.0.22 -- NOTE(review): presumably for compatibility with pywsd, confirm before bumping
%pip install -U wn==0.0.22
%pip install datasets
%pip install scikit-learn
%pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-7.6.5-py2.py3-none-any.whl (121 kB)
[K     |████████████████████████████████| 121 kB 17.2 MB/s eta 0:00:01
Collecting widgetsnbextension~=3.5.0
  Downloading widgetsnbextension-3.5.1-py2.py3-none-any.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 58.5 MB/s eta 0:00:01
Collecting jupyterlab-widgets>=1.0.0
  Downloading jupyterlab_widgets-1.0.2-py3-none-any.whl (243 kB)
[K     |████████████████████████████████| 243 kB 54.8 MB/s eta 0:00:01
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-7.6.5 jupyterlab-widgets-1.0.2 widgetsnbextension-3.5.1


In [16]:
from find_golden import create_embedding_dict, NonSpecificity_Scorer
from utils import filter_lemma, filter_similarity, filter_syn_ant
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from nltk import pos_tag


In [30]:
def main(nb_summaries=None, log_file=None):
    """Select one gold answer word for each summary of the XSum training split.

    For every summary, the tokens tagged exactly "NN" or "VB" are scored with
    NonSpecificity_Scorer. Tokens scoring below 6 become candidates and are
    passed through the lemma, synonym/antonym and similarity filters. Among the
    surviving candidates the one with the lowest score is kept (the first one
    on ties). A count of summaries with zero, one and several surviving
    candidates is printed at the end.

    Args:
        nb_summaries: Number of summaries to process, counted from the start of
            the split. None (the default) processes the whole split; values
            larger than the split are capped and negative values are treated
            as 0.
        log_file: Optional object with a write(str) method. When given, a trace
            of every filtering stage is written to it for each summary.

    Returns:
        A list with one entry per processed summary: the gold answer word, or
        "" when no candidate survived the filters.
    """
    #Change dataset here to use something else
    xsum = load_dataset('xsum', split='train')

    #Create the embedding dict used for the similarity filter
    embeddings_dict = create_embedding_dict("glove.6B.50d.txt")

    #Default to the whole dataset; otherwise clamp the request to what is available
    if nb_summaries is None:
        nb_summaries = len(xsum)
    else:
        nb_summaries = max(0, min(nb_summaries, len(xsum)))

    gold_answers = []
    no_ans = 0
    one_ans = 0
    more_than_one_ans = 0

    for i in range(nb_summaries):
        #Fetch the row once instead of indexing the dataset again for every field
        example = xsum[i]

        #Change here if you use a dataset with another structure than xsum
        passage = example["document"]
        summary = example["summary"]
        tokenized_passage = word_tokenize(passage)
        tagged_summary = pos_tag(word_tokenize(summary))

        answers = []
        summary_score = {}
        for token, tag in tagged_summary:
            #Keeping only noun and verbs for subtask 2
            #NOTE(review): only the exact tags "NN" and "VB" match, so other noun/verb
            #tags (e.g. NNS, VBD) are skipped -- confirm this is intended
            if tag == "NN" or tag == "VB":
                score = NonSpecificity_Scorer(passage, summary, token)
                summary_score[token] = score
                #Filter on score
                if score < 6:
                    answers.append(token)

        #Run the filter chain once on the complete candidate list: only its final
        #result is used, so re-running it after every accepted token was wasted work.
        #NOTE(review): assumes the filter helpers do not mutate their arguments -- confirm in utils
        filtered1_answers, filtered2_answers, filtered3_answers = [], [], []
        if answers:
            filtered1_answers = filter_lemma(tokenized_passage, answers)
            filtered2_answers = filter_syn_ant(tokenized_passage, filtered1_answers)
            filtered3_answers = filter_similarity(embeddings_dict, tokenized_passage, filtered2_answers)

        if len(filtered3_answers) == 0:
            no_ans += 1
            gold_answer = ""
        elif len(filtered3_answers) == 1:
            one_ans += 1
            gold_answer = filtered3_answers[0]
        else:
            more_than_one_ans += 1
            #If we have multiple possible answer after all the filter, the final gold option is the one with the lowest score
            #If same score, we pick the first one (min keeps the first minimal element)
            gold_answer = min(filtered3_answers, key=lambda answer: summary_score[answer])

        #Optional per-summary trace of every filtering stage
        if log_file is not None:
            log_file.write("Summary used : {} \n".format(summary))
            log_file.write("id of the summary : {} \n".format(example["id"]))
            log_file.write("{} \n".format(summary_score))
            log_file.write("First batch of answers after NS Score filtering : {} \n".format(answers))
            log_file.write("Second batch of answers after Lemma filter : {} \n".format(filtered1_answers))
            log_file.write("Third batch of answers after Synonym/Antonym filter : {} \n".format(filtered2_answers))
            log_file.write("Fourth batch of answers after Similarity filter : {} \n".format(filtered3_answers))
            log_file.write("The gold answer kept for this summary is : {} \n\n".format(gold_answer))

        gold_answers.append(gold_answer)

    print("After filtering {} summaries, there are :\n{} with no possible gold answer,\n{} with one possible golden answer,\n{} with more than one possible golden answer".format(nb_summaries, no_ans, one_ans, more_than_one_ans))
    return gold_answers

In [20]:
import time

#Time one full run of main().
#No file is opened here: main() takes no file handle, so opening
#"output_oracle_subtask2.txt" in "w+" mode only truncated any earlier trace.
#perf_counter() is used because it is a monotonic clock meant for measuring elapsed time.
t0 = time.perf_counter()
gold_answers = main()
t1 = time.perf_counter() - t0
print("Time elapsed: ", t1)


Using custom data configuration default
Reusing dataset xsum (/home/willy/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499)


After filtering 100 summaries, there are :
75 with no possible golden answer,
20 with one possible golden answer,
5 with more than one possible golden answer
Time elapsed:  21.86880350112915


In [None]:
#Run the whole pipeline and save the resulting list (as its Python repr) to disk
gold_answers = main()
#The file is only written, so "w" is enough; UTF-8 is forced so that non-ASCII
#answer words do not depend on the platform's default encoding
with open("gold_answers.txt", "w", encoding="utf-8") as f:
    f.write(str(gold_answers))

Using custom data configuration default
Reusing dataset xsum (/home/willy/.cache/huggingface/datasets/xsum/default/1.2.0/4957825a982999fbf80bca0b342793b01b2611e021ef589fb7c6250b3577b499)


In [29]:
#Write gold_answers.txt from the gold_answers list already in the kernel, without re-running main()
#The file is only written, so "w" is enough; UTF-8 is forced so that non-ASCII
#answer words do not depend on the platform's default encoding
with open("gold_answers.txt", "w", encoding="utf-8") as f:
    f.write(str(gold_answers))