In [1]:
import re
import pickle
import pandas as pd
import pyarrow.feather as feather
import spacy
nlp = spacy.load("en_core_web_sm")

import neuralcoref
# neuralcoref is used by the step that removes questions with unclear pronouns.
neuralcoref.add_to_pipe(nlp)

from python_heideltime.python_heideltime import Heideltime
# python_heideltime is used by the step that transforms temporal information.
# Install the following package in the current directory:
# https://github.com/PhilipEHausner/python_heideltime
# Note: the paper actually used SUTime, but the current SUTime version has some problems,
# so HeidelTime is used here for the temporal transformation instead (the results may therefore differ from the paper's).
heideltime_parser = Heideltime()
heideltime_parser.set_document_type('NEWS')
temp_extract_re = re.compile(r'type="DATE" value=(.*?)>(.*?)(</TIMEX3>)')
temp_trans_re = re.compile(r'<TIMEX3(.*?)</TIMEX3>')
temp_day_form="\d{4}-\d{2}-\d{2}"
temp_month_form="\d{4}-\d{2}"
temp_year_form="\d{4}"
idx2month=dict()
month_list=["January","February","March","April","May","June",\
"July","August","September","October","November","December"]
for i,month_v in enumerate(month_list):
    m_idx=str(i+1).zfill(2)
    idx2month[m_idx]=month_v

In [2]:
class Syntactic_Temporal_Processing:
    """Filter generated question/answer rows and rewrite their temporal expressions.

    Calling an instance on a DataFrame runs, in order: the question-mark,
    answer-in-question and duplicate filters, spaCy parsing of every question,
    the entity-count, token-count and unclear-pronoun filters, and finally the
    question and answer temporal transformations.

    Args:
        min_ent_num: minimum (inclusive) number of named entities in a question.
        max_ent_num: maximum (inclusive) number of named entities in a question.
        min_token_num: minimum (inclusive) number of tokens in a question.
        max_token_num: maximum (inclusive) number of tokens in a question.
        paraid_pub_dict: handed unchanged to TempInfor_Transformation_Processing
            (presumably paragraph id -> publication date; verify against caller).
    """

    def __init__(
        self,
        min_ent_num: int,
        max_ent_num: int,
        min_token_num: int,
        max_token_num: int,
        paraid_pub_dict: dict,
    ):
        self.min_ent_num = min_ent_num
        self.max_ent_num = max_ent_num
        self.min_token_num = min_token_num
        self.max_token_num = max_token_num
        self.paraid_pub_dict = paraid_pub_dict

    def __call__(self, raw_results_df):
        """Run the whole pipeline on raw_results_df and return the processed frame.

        The frame must provide the columns 'question', 'answer', 'ans_pos',
        'ans-sent_pos' and 'para_id' (all of them are read by the steps below).
        """
        self.results_df = raw_results_df
        self.remove_que_without_qmark()
        self.remove_que_ansinquestion()
        self.remove_que_duplicate()
        self.add_nlp_column()
        self.remove_que_entissue()
        self.remove_que_tokissue()
        self.remove_que_pronissue()
        self.add_transformation_columns()
        self.transform_que_temp()
        self.transform_ans_temp()
        return self.results_df

    def add_nlp_column(self):
        """Store the spaCy parse of each question in a new 'que_nlp' column."""
        self.results_df['que_nlp'] = self.results_df['question'].apply(lambda x: nlp(x))

    def remove_que_without_qmark(self):
        #1. Remove questions that do not end with a question mark.
        self.results_df = self.results_df[self.results_df['question'].str.endswith('?')]
        self.results_df = self.results_df.reset_index(drop=True)

    def remove_que_ansinquestion(self):
        #2. Remove questions whose answers are explicitly indicated inside the questions' content.
        # The containment test is case-insensitive.
        self.results_df = self.results_df[self.results_df.apply(lambda x: x["answer"].lower() not in x["question"].lower(), axis=1)]
        self.results_df = self.results_df.reset_index(drop=True)

    def remove_que_duplicate(self):
        #3. Remove duplicate questions.
        # keep=False: every copy of a repeated question is dropped, not just the extras.
        self.results_df = self.results_df[~self.results_df.duplicated('question',keep=False)]
        self.results_df = self.results_df.reset_index(drop=True)

    def _drop_rows_outside_range(self, measure, lower, upper):
        """Drop rows whose measure(que_nlp) is not within [lower, upper], then reindex."""
        removed_index = [
            row_i
            for row_i, que_nlp in self.results_df['que_nlp'].items()
            if not lower <= measure(que_nlp) <= upper
        ]
        self.results_df = self.results_df.drop(removed_index).reset_index(drop=True)

    def remove_que_entissue(self):
        #4. Remove questions that have too few or too many named entities.
        # The bounds come from the instance, not from same-named notebook globals.
        self._drop_rows_outside_range(
            lambda que_nlp: len(que_nlp.ents), self.min_ent_num, self.max_ent_num
        )

    def remove_que_tokissue(self):
        #5. Remove questions that are too short or too long.
        # The bounds come from the instance, not from same-named notebook globals.
        self._drop_rows_outside_range(len, self.min_token_num, self.max_token_num)

    def remove_que_pronissue(self):
        #6. Remove questions with unclear pronouns
        question_set = set(self.results_df["question"])
        Pron_Processing = Pronouns_Question_Processing(question_set)
        # Only the rejected set is needed; every question outside it is kept.
        _, bad_pron_question = Pron_Processing()
        self.results_df = self.results_df[~self.results_df['question'].isin(bad_pron_question)].reset_index(drop=True)

    def add_transformation_columns(self):
        """Split question/answer into org_* and trans_* copies and add the flag columns.

        'trans_que' and 'trans_ans' start as the string "0". The final column
        selection also discards every other column (including 'que_nlp').
        """
        self.results_df["org_question"] = self.results_df["question"]
        self.results_df["trans_question"] = self.results_df["question"]
        self.results_df = self.results_df.drop(['question'], axis=1)
        self.results_df["org_answer"] = self.results_df["answer"]
        self.results_df["trans_answer"] = self.results_df["answer"]
        self.results_df = self.results_df.drop(['answer'], axis=1)
        self.results_df["trans_que"] = "0"
        self.results_df["trans_ans"] = "0"
        self.results_df = self.results_df[["org_question","trans_question","org_answer","trans_answer","ans_pos","ans-sent_pos","para_id","trans_que","trans_ans"]]

    def transform_que_temp(self):
        #7. Transform relative temporal information in questions to absolute temporal information.
        Temp_Processing = TempInfor_Transformation_Processing(self.results_df, self.paraid_pub_dict, "question")
        self.results_df = Temp_Processing()

    def transform_ans_temp(self):
        #8. Transform relative temporal information of the answers of generated questions to absolute temporal information.
        Temp_Processing = TempInfor_Transformation_Processing(self.results_df, self.paraid_pub_dict, "answer")
        self.results_df = Temp_Processing()

In [3]:
class Pronouns_Question_Processing:
    """Sort questions that contain pronouns into a 'good' and a 'bad' set.

    Questions whose spaCy parse has a PRP/PRP$ token are split into those for
    which neuralcoref reports coreference clusters ("common") and those for
    which it reports none ("nocoref"). Each group then runs through a cascade
    of rule-based steps; every step may move candidates into the group's good
    or bad set, and whatever is still undecided after the last step is bad.
    Questions without any pronoun end up in neither returned set.

    Args:
        question_set: iterable of question strings.
    """

    # "Who/Whose/Whom <between> say/says/saying/said ..." questions.
    _WHO_SAY_RE = re.compile(r"(Who|Whose|Whom) (.*?)(say|says|saying|said)(.*)")
    # Questions containing a quotation; group 1 is the text before the quote.
    _QUOTED_RE = re.compile(r"(.+?)(\"(.+)\"|''(.+)'')(.*)")
    _WHO_WORDS = frozenset(["who", "whose", "whom"])
    _HE_WORDS = frozenset(['he', 'him', 'himself', 'his'])
    _SHE_WORDS = frozenset(['she', 'her', 'hers', 'herself'])
    _THEY_WORDS = frozenset(['they', 'them', 'theirs', 'their', 'themselves'])
    _THIRD_PERSON_WORDS = _HE_WORDS | _SHE_WORDS | _THEY_WORDS
    # Entity labels that count as possible antecedents of a third-person pronoun.
    _SELECT_NER = frozenset(["PERSON", "NORP", "FAC", "ORG", "GPE"])

    def __init__(self, question_set):
        self.question_set = question_set
        self.pron_tag_set=set(["PRP","PRP$"])
        # Not read anywhere in this class; kept so existing attribute access keeps working.
        self.results_df = question_set
        # First/second-person pronouns, all lowercase.
        self.myyou_words_set=set(['i','we','me','us','mine','ours','my','our','myself','ourselves',\
                            'you','yours','your','yourself','yourselves'])

    def __call__(self):
        """Run recognition and both cascades; return [good_question_set, bad_question_set]."""
        self.pronouns_questions_recognization()
        self.common_question_processing()
        self.pronque_nocoref_processing()
        good_question_set = self.good_common_question|self.good_nocoref_question
        bad_question_set = self.bad_common_question|self.bad_nocoref_question
        return [good_question_set, bad_question_set]

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _char_span(span, text):
        """Return the (begin, end) character offsets of a spaCy span inside text."""
        beg = span[0].idx
        end = span[-1].idx + len(span[-1].text)
        # Sanity check: the offsets must reproduce the span's own text.
        assert span.text == text[beg:end]
        return beg, end

    def _pronoun_spans(self, question):
        """Return [(begin, end, lowercased text)] for every pronoun token of question."""
        spans = []
        for token in self.pron_que_nlp_dict[question]:
            if token.tag_ in self.pron_tag_set:
                beg = token.idx
                end = beg + len(token.text)
                spans.append((beg, end, question[beg:end].lower()))
        return spans

    def _pronoun_texts(self, question):
        """Return the set of lowercased pronoun texts of question."""
        return {text for _, _, text in self._pronoun_spans(question)}

    def _lower_token_texts(self, question):
        """Return the set of lowercased token texts of question."""
        return {token.text.lower() for token in self.pron_que_nlp_dict[question]}

    def _who_say_step(self, candidates, good_set):
        """Accept 'Who ... say' questions with no I/you pronoun between 'Who' and the say-verb."""
        for question in candidates:
            matched = self._WHO_SAY_RE.match(question)
            if not matched:
                continue
            if len(matched.group(2).strip()) == 0:
                good_set.add(question)
                continue
            between_beg, between_end = matched.span(2)
            # Pronoun text is lowercased before the lookup, so "I" and a
            # capitalised "You" are recognised as well.
            blocked = any(
                text in self.myyou_words_set and between_beg <= beg <= end <= between_end
                for beg, end, text in self._pronoun_spans(question)
            )
            if not blocked:
                good_set.add(question)

    def _who_gender_step(self, candidates, good_set, bad_set):
        """Decide who-questions: bad if they mix he/she/they groups, good otherwise.

        Who-questions that contain an I/you pronoun are left undecided.
        """
        for question in candidates:
            if not self._lower_token_texts(question) & self._WHO_WORDS:
                continue
            pron_texts = self._pronoun_texts(question)
            if pron_texts & self.myyou_words_set:
                continue
            groups_hit = sum(
                1 for group in (self._HE_WORDS, self._SHE_WORDS, self._THEY_WORDS)
                if pron_texts & group
            )
            if groups_hit > 1:
                bad_set.add(question)
            else:
                good_set.add(question)

    def _quoted_step(self, candidates, good_set, bad_set):
        """Decide questions containing a quotation, looking only at pronouns before the quote.

        Bad if such a pronoun is I/you, or is he/she/they while the question
        has no who-word; good otherwise.
        """
        for question in candidates:
            matched = self._QUOTED_RE.match(question)
            if not matched:
                continue
            has_who = bool(self._lower_token_texts(question) & self._WHO_WORDS)
            _, lead_end = matched.span(1)
            add_question = True
            for _, end, text in self._pronoun_spans(question):
                if end > lead_end:
                    # Pronoun does not end before the quotation starts: ignored.
                    continue
                if text in self.myyou_words_set:
                    add_question = False
                if (not has_who) and text in self._THIRD_PERSON_WORDS:
                    add_question = False
            if add_question:
                good_set.add(question)
            else:
                bad_set.add(question)

    def _myyou_step(self, candidates, bad_set):
        """Reject every question that still contains an I/you pronoun."""
        for question in candidates:
            if self._pronoun_texts(question) & self.myyou_words_set:
                bad_set.add(question)

    def _entity_before_pronoun_step(self, candidates, good_set):
        """Accept questions where a selected named entity starts before a he/she/they pronoun."""
        for question in candidates:
            nlp_infor = self.pron_que_nlp_dict[question]
            ner_starts = [
                ent[0].idx for ent in nlp_infor.ents if ent.label_ in self._SELECT_NER
            ]
            pronouns = self._pronoun_spans(question)
            third_starts = [beg for beg, _, text in pronouns if text in self._THIRD_PERSON_WORDS]
            has_myyou = any(text in self.myyou_words_set for _, _, text in pronouns)
            entity_first = any(
                ner_beg < pron_beg for ner_beg in ner_starts for pron_beg in third_starts
            )
            if entity_first and not has_myyou:
                good_set.add(question)

    def _only_it_step(self, candidates, good_set):
        """Accept questions whose only pronoun (ignoring case) is 'it'."""
        for question in candidates:
            if self._pronoun_texts(question) == {"it"}:
                good_set.add(question)

    def _shrink_common_candidates(self):
        """Remove already-decided questions from the 'common' candidates."""
        self.common_candidate_question = (
            self.common_candidate_question - self.good_common_question - self.bad_common_question
        )

    def _shrink_nocoref_candidates(self):
        """Remove already-decided questions from the 'nocoref' candidates."""
        self.nocoref_candidate_question = (
            self.nocoref_candidate_question - self.good_nocoref_question - self.bad_nocoref_question
        )

    # -------------------------------------------------------------- recognition

    def pronouns_questions_recognization(self):
        """Parse every question and record its pronoun and coreference information.

        Fills:
            pron_que_nlp_dict: question -> spaCy doc, for questions with a pronoun token.
            que_coref_dict: question -> list of [(main_begin, main_end),
                [[mention_begin, mention_end], ...]] character spans, one entry
                per coreference cluster, for pronoun questions that have clusters.
            common_que_set: questions present in que_coref_dict.
            pronque_nocoref_set: pronoun questions without clusters.
        """
        self.pron_que_nlp_dict=dict()
        self.que_coref_dict=dict()
        for que in self.question_set:
            que_nlp = nlp(que)
            #1. Identify questions with pronouns
            if not any(token.tag_ in self.pron_tag_set for token in que_nlp):
                continue
            self.pron_que_nlp_dict[que] = que_nlp
            #2. Collect coreference information of questions with pronouns
            que_coref = que_nlp._.coref_clusters
            if not que_coref:
                continue
            que_coref_list = []
            for core_infor in que_coref:
                main_span = self._char_span(core_infor.main, que)
                mention_spans = [
                    list(self._char_span(ref_ele, que)) for ref_ele in core_infor.mentions
                ]
                que_coref_list.append([main_span, mention_spans])
            self.que_coref_dict[que] = que_coref_list
        #questions with pronouns and coreference information
        self.common_que_set=set(self.que_coref_dict.keys())
        #questions with pronouns only
        self.pronque_nocoref_set=set(self.pron_que_nlp_dict.keys())-self.common_que_set

    # ------------------------------------------------ cascade: with coreference

    def common_question_processing(self):
        """Run the seven-step cascade over the questions that have coreference clusters."""
        self.common_candidate_question = self.common_que_set
        self.good_common_question = set()
        self.bad_common_question = set()
        self.common_question_processing_step1()
        self.common_question_processing_step2()
        self.common_question_processing_step3()
        self.common_question_processing_step4()
        self.common_question_processing_step5()
        self.common_question_processing_step6()
        self.common_question_processing_step7()

    def common_question_processing_step1(self):
        """Accept questions whose every pronoun is covered by a non-main coreference mention.

        A pronoun is covered when it either is such a mention or lies inside
        one, e.g. "Who does Greece want to protect outside of their country?"
        with the cluster ['greece', 'their country'] and the pronoun 'their'.
        Questions containing an I/you pronoun are left undecided here.
        """
        for question in self.common_candidate_question:
            # Mentions that differ from the main mention of their own cluster.
            notmain_spans = set()
            for (main_beg, main_end), mentions in self.que_coref_dict[question]:
                for pos_beg, pos_end in mentions:
                    if (pos_beg, pos_end) != (main_beg, main_end):
                        notmain_spans.add((pos_beg, pos_end))

            pronouns = self._pronoun_spans(question)
            #Remove myyou_words cases
            if any(text in self.myyou_words_set for _, _, text in pronouns):
                continue

            pron_spans = {(beg, end) for beg, end, _ in pronouns}
            unresolved = pron_spans - notmain_spans
            enclosing = notmain_spans - pron_spans
            # Each remaining pronoun needs at least one enclosing mention. Testing
            # this per pronoun (instead of counting pronoun/mention pairs) keeps a
            # pronoun that sits inside several mentions from skewing the result.
            if all(
                any(c_beg <= p_beg <= p_end <= c_end for c_beg, c_end in enclosing)
                for p_beg, p_end in unresolved
            ):
                self.good_common_question.add(question)
        self._shrink_common_candidates()

    def common_question_processing_step2(self):
        """'Who ... say' rule (see _who_say_step)."""
        self._who_say_step(self.common_candidate_question, self.good_common_question)
        self._shrink_common_candidates()

    def common_question_processing_step3(self):
        """Who-question gender-mix rule (see _who_gender_step)."""
        self._who_gender_step(
            self.common_candidate_question, self.good_common_question, self.bad_common_question
        )
        self._shrink_common_candidates()

    def common_question_processing_step4(self):
        """Quotation rule (see _quoted_step)."""
        self._quoted_step(
            self.common_candidate_question, self.good_common_question, self.bad_common_question
        )
        self._shrink_common_candidates()

    def common_question_processing_step5(self):
        """I/you rejection rule (see _myyou_step)."""
        self._myyou_step(self.common_candidate_question, self.bad_common_question)
        self._shrink_common_candidates()

    def common_question_processing_step6(self):
        """Entity-before-pronoun rule (see _entity_before_pronoun_step)."""
        self._entity_before_pronoun_step(
            self.common_candidate_question, self.good_common_question
        )
        self._shrink_common_candidates()

    def common_question_processing_step7(self):
        """'it'-only rule (see _only_it_step); everything still undecided becomes bad."""
        self._only_it_step(self.common_candidate_question, self.good_common_question)
        self._shrink_common_candidates()
        self.bad_common_question.update(self.common_candidate_question)
        self.common_candidate_question = set()

    # --------------------------------------------- cascade: without coreference

    def pronque_nocoref_processing(self):
        """Run the six-step cascade over pronoun questions without coreference clusters."""
        self.nocoref_candidate_question = self.pronque_nocoref_set
        self.good_nocoref_question = set()
        self.bad_nocoref_question = set()
        self.nocoref_question_processing_step1()
        self.nocoref_question_processing_step2()
        self.nocoref_question_processing_step3()
        self.nocoref_question_processing_step4()
        self.nocoref_question_processing_step5()
        self.nocoref_question_processing_step6()

    def nocoref_question_processing_step1(self):
        """'Who ... say' rule (see _who_say_step)."""
        self._who_say_step(self.nocoref_candidate_question, self.good_nocoref_question)
        self._shrink_nocoref_candidates()

    def nocoref_question_processing_step2(self):
        """Who-question gender-mix rule (see _who_gender_step)."""
        self._who_gender_step(
            self.nocoref_candidate_question, self.good_nocoref_question, self.bad_nocoref_question
        )
        self._shrink_nocoref_candidates()

    def nocoref_question_processing_step3(self):
        """Quotation rule (see _quoted_step)."""
        self._quoted_step(
            self.nocoref_candidate_question, self.good_nocoref_question, self.bad_nocoref_question
        )
        self._shrink_nocoref_candidates()

    def nocoref_question_processing_step4(self):
        """I/you rejection rule (see _myyou_step)."""
        self._myyou_step(self.nocoref_candidate_question, self.bad_nocoref_question)
        self._shrink_nocoref_candidates()

    def nocoref_question_processing_step5(self):
        """Entity-before-pronoun rule (see _entity_before_pronoun_step)."""
        self._entity_before_pronoun_step(
            self.nocoref_candidate_question, self.good_nocoref_question
        )
        self._shrink_nocoref_candidates()

    def nocoref_question_processing_step6(self):
        """'it'-only rule (see _only_it_step); everything still undecided becomes bad."""
        self._only_it_step(self.nocoref_candidate_question, self.good_nocoref_question)
        self._shrink_nocoref_candidates()
        self.bad_nocoref_question.update(self.nocoref_candidate_question)
        self.nocoref_candidate_question = set()

In [4]:
class TempInfor_Transformation_Processing:
    """Rewrite relative date expressions in questions or answers as explicit dates.

    Each row's text is tagged by HeidelTime relative to the publication date of
    the row's paragraph. Every DATE expression whose normalised value differs
    from its surface text is replaced by a readable date ("March 05, 2019",
    "March, 2019" or "2019"); all other TIMEX3 tags are reduced to their text.

    Relies on module-level objects defined in the first cell: ``nlp``,
    ``heideltime_parser``, ``temp_extract_re``, ``temp_trans_re``,
    ``temp_day_form``, ``temp_month_form``, ``temp_year_form`` and ``idx2month``.
    """

    def __init__(self, results_df, paraid_pub_dict, trans_type):
        """
        Args:
            results_df: DataFrame with a ``para_id`` column plus ``org_question`` /
                ``trans_que`` (for ``trans_type="question"``) or ``org_answer`` /
                ``trans_ans`` (for ``trans_type="answer"``). Modified in place.
            paraid_pub_dict: maps a ``para_id`` to the publication date handed to
                ``heideltime_parser.set_document_time``.
            trans_type: ``"question"`` or ``"answer"``.

        Raises:
            ValueError: if ``trans_type`` is neither ``"question"`` nor ``"answer"``.
        """
        if trans_type not in ("question", "answer"):
            raise ValueError(
                f"trans_type must be 'question' or 'answer', got {trans_type!r}"
            )
        self.results_df = results_df
        self.paraid_pub_dict = paraid_pub_dict
        self.trans_type = trans_type

    def __call__(self):
        """Run the transformation and return the (in-place modified) DataFrame."""
        self.temp_trans()
        return self.results_df

    def temp_trans(self):
        """Replace relative dates in the selected text column of ``self.results_df``.

        For every row whose text contains a spaCy DATE entity, the text is tagged
        by HeidelTime. If at least one DATE expression is rewritten, the new text
        is stored in ``org_question`` / ``org_answer`` and the matching
        ``trans_que`` / ``trans_ans`` flag is set to ``"1"``. Rows are addressed
        by index label, so labels are expected to be unique.
        """
        text_col = f"org_{self.trans_type}"
        flag_col = "trans_que" if self.trans_type == "question" else "trans_ans"

        # Collected first and applied after the loop so the frame is not modified
        # while ``iterrows`` is walking it.
        pending_updates = []
        for rowidx, row in self.results_df.iterrows():
            org_infor = row[text_col]
            # Use the mapping given to this instance, not a same-named global.
            pub = self.paraid_pub_dict[row["para_id"]]

            # Cheap gate: only call HeidelTime when spaCy sees a DATE entity.
            if all(ent.label_ != "DATE" for ent in nlp(org_infor).ents):
                continue

            heideltime_parser.set_document_time(pub)
            tagged_text = self._strip_timeml_envelope(heideltime_parser.parse(org_infor))
            trans_text, n_transformed = self._replace_timex_tags(tagged_text)
            if n_transformed:
                pending_updates.append((rowidx, trans_text))

        for rowidx, trans_text in pending_updates:
            # Single-step .loc assignment: chained ``iloc[i][col] = ...`` may write
            # to a temporary copy and leave the frame untouched.
            self.results_df.loc[rowidx, text_col] = trans_text
            self.results_df.loc[rowidx, flag_col] = "1"

    @staticmethod
    def _strip_timeml_envelope(tagged_text):
        """Drop the XML prolog / ``<TimeML>`` wrapper around HeidelTime's output."""
        opener = "\n<TimeML>\n"
        opener_pos = tagged_text.find(opener)
        if opener_pos != -1:
            tagged_text = tagged_text[opener_pos + len(opener):]
        return tagged_text.replace("\n</TimeML>\n\n", "")

    @staticmethod
    def _readable_date(temp_value):
        """Return the readable form of a normalised date value, or ``None``.

        Supports ``YYYY-MM-DD``, ``YYYY-MM`` and ``YYYY``; anything else (or a
        month number missing from ``idx2month``) yields ``None``.
        """
        if re.fullmatch(temp_day_form, temp_value):
            year_v, month_v, day_v = temp_value.split("-")
            if month_v in idx2month:
                return f"{idx2month[month_v]} {day_v}, {year_v}"
        elif re.fullmatch(temp_month_form, temp_value):
            year_v, month_v = temp_value.split("-")
            if month_v in idx2month:
                return f"{idx2month[month_v]}, {year_v}"
        elif re.fullmatch(temp_year_form, temp_value):
            return temp_value
        return None

    def _replace_timex_tags(self, tagged_text):
        """Resolve every TIMEX3 tag in ``tagged_text``.

        Returns:
            ``(plain_text, n_transformed)`` where ``plain_text`` contains no TIMEX3
            markup and ``n_transformed`` counts the DATE expressions that were
            replaced by a readable date.
        """
        pieces = []
        cursor = 0
        n_transformed = 0
        for tag_match in temp_trans_re.finditer(tagged_text):
            pieces.append(tagged_text[cursor:tag_match.start()])
            cursor = tag_match.end()

            # group(1) is ' attr="..." ...>surface text'; the text follows the
            # first '>' that closes the opening tag.
            surface_text = tag_match.group(1).partition(">")[2]
            date_match = temp_extract_re.search(tag_match.group(0))
            if date_match is None:
                # Not a DATE expression (e.g. a duration): keep the text only.
                pieces.append(surface_text)
                continue

            temp_value = date_match.group(1).strip('"')
            temp_text = date_match.group(2)
            readable = None
            if temp_text != temp_value:
                readable = self._readable_date(temp_value)
            if readable is None:
                # Already explicit, or a value with no readable form: keep as is.
                pieces.append(temp_text)
            else:
                pieces.append(readable)
                n_transformed += 1
        pieces.append(tagged_text[cursor:])
        return "".join(pieces), n_transformed

In [5]:
# NOTE(review): pickle.load can execute arbitrary code - only load a trusted examples file.
# The context manager closes the file handle, which a bare open() inside the call never did.
with open("data/examples.pickle", "rb") as examples_file:
    example = pickle.load(examples_file)

# Map each record's first field to its second field reformatted from a compact
# digit string into "YYYY-MM-DD" (presumably para_id -> publication date; confirm
# against the code that writes examples.pickle).
paraid_pub_dict = dict()
for r in example:
    paraid_pub_dict[r[0]] = r[1][:4]+"-"+r[1][4:6]+"-"+r[1][6:8]

raw_results_df=feather.read_feather("data/raw_results_After_2ndModule.feather")
print(len(raw_results_df))

90


In [6]:
# Thresholds handed to the filter (presumably bounds on entities / tokens per
# question - confirm against Syntactic_Temporal_Processing).
min_ent_num = 1
max_ent_num = 7
min_token_num = 8
max_token_num = 30

ST_Processing = Syntactic_Temporal_Processing(
    min_ent_num=min_ent_num,
    max_ent_num=max_ent_num,
    min_token_num=min_token_num,
    max_token_num=max_token_num,
    paraid_pub_dict=paraid_pub_dict,
)

In [7]:
# Apply the configured processing to the raw results and report how many rows remain.
basic_filtered_results_df = ST_Processing(raw_results_df)
print(basic_filtered_results_df.shape[0])

76


In [8]:
# Write the processed frame to a Feather file for use outside this notebook.
basic_filtered_results_df.to_feather("data/raw_results_After_3rdModule.feather")

In [None]:
#basic_filtered_results_df.head(5)

In [None]:
#basic_filtered_results_df[basic_filtered_results_df["trans_que"]=="1"][["org_question","trans_question"]].values

In [None]:
#basic_filtered_results_df[basic_filtered_results_df["trans_ans"]=="1"][["org_answer","trans_answer"]].head()