# Syntactic & Temporal Filtering/Transforming Module

In [1]:
import re
import pickle
import pandas as pd
import pyarrow.feather as feather
import spacy
nlp = spacy.load("en_core_web_sm")
import neuralcoref
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7fd99c4cd950>

In [2]:
class Syntactic_Temporal_Processing:
    """Filter generated question/answer rows with simple syntactic rules.

    Calling an instance on a DataFrame runs filters 1-5 (see the individual
    methods) in order and returns the surviving rows, re-indexed from 0 and
    carrying an extra ``que_nlp`` column with the parsed question.

    The frame must provide string columns ``question`` and ``answer``.
    Steps 6-8 are placeholders: they are not implemented yet and are not
    invoked by ``__call__``.
    """

    def __init__(
        self,
        min_ent_num: int,
        max_ent_num: int,
        min_token_num: int,
        max_token_num: int,
    ):
        """Store the inclusive bounds used by the entity and token filters.

        Args:
            min_ent_num: Fewest named entities a question may contain.
            max_ent_num: Most named entities a question may contain.
            min_token_num: Fewest tokens a question may contain.
            max_token_num: Most tokens a question may contain.
        """
        self.min_ent_num = min_ent_num
        self.max_ent_num = max_ent_num
        self.min_token_num = min_token_num
        self.max_token_num = max_token_num

    def __call__(self, raw_results_df: pd.DataFrame) -> pd.DataFrame:
        """Run filters 1-5 on ``raw_results_df`` and return the filtered frame.

        The cheap string-based filters run first so that the comparatively
        expensive parsing in ``add_nlp_column`` only touches surviving rows.
        """
        self.results_df = raw_results_df
        self.remove_que_without_qmark()
        self.remove_que_ansinquestion()
        self.remove_que_duplicate()
        self.add_nlp_column()
        self.remove_que_entissue()
        self.remove_que_tokissue()
        return self.results_df

    def _keep_rows(self, keep_flags) -> None:
        """Keep only rows whose flag is true, then renumber the index from 0."""
        # An explicit boolean Series (rather than a plain list) keeps the
        # selection a row mask even when the frame is empty.
        mask = pd.Series(list(keep_flags), index=self.results_df.index, dtype=bool)
        self.results_df = self.results_df[mask].reset_index(drop=True)

    def add_nlp_column(self) -> None:
        """Parse each question with the module-level ``nlp`` pipeline.

        The parsed result is stored in the ``que_nlp`` column, which the
        entity and token filters read.
        """
        self.results_df['que_nlp'] = self.results_df['question'].apply(lambda text: nlp(text))

    def remove_que_without_qmark(self) -> None:
        """1. Remove questions that do not end with a question mark."""
        # na=False treats a missing question as "does not end with '?'" so the
        # row is dropped instead of producing an unusable NaN in the mask.
        ends_with_qmark = self.results_df['question'].str.endswith('?', na=False)
        self.results_df = self.results_df[ends_with_qmark].reset_index(drop=True)

    def remove_que_ansinquestion(self) -> None:
        """2. Remove questions whose answer appears verbatim in the question.

        The containment check is case-insensitive.
        """
        self._keep_rows(
            answer.lower() not in question.lower()
            for question, answer in zip(self.results_df['question'], self.results_df['answer'])
        )

    def remove_que_duplicate(self) -> None:
        """3. Remove duplicated questions.

        ``keep=False`` discards every copy of a repeated question, not just
        the later occurrences.
        """
        is_repeated = self.results_df.duplicated('question', keep=False)
        self.results_df = self.results_df[~is_repeated].reset_index(drop=True)

    def remove_que_entissue(self) -> None:
        """4. Remove questions that have too few or too many named entities."""
        # Use the bounds stored on the instance; the previous code read
        # same-named globals and only worked when the caller defined them.
        self._keep_rows(
            self.min_ent_num <= len(doc.ents) <= self.max_ent_num
            for doc in self.results_df['que_nlp']
        )

    def remove_que_tokissue(self) -> None:
        """5. Remove questions that are too short or too long (token count)."""
        # Same fix as the entity filter: instance bounds, not globals.
        self._keep_rows(
            self.min_token_num <= len(doc) <= self.max_token_num
            for doc in self.results_df['que_nlp']
        )

    def remove_que_pronissue(self) -> None:
        """6. Remove questions with unclear pronouns (not implemented yet)."""
        pass  # TBD

    def transform_que_temp(self) -> None:
        """7. Turn relative temporal information in questions into absolute
        temporal information (not implemented yet)."""
        pass  # TBD

    def transform_ans_temp(self) -> None:
        """8. Turn relative temporal information in the answers of generated
        questions into absolute temporal information (not implemented yet)."""
        pass  # TBD

In [3]:
# Load the pickled examples. The context manager closes the file handle as
# soon as the load finishes instead of leaving it to the garbage collector.
# NOTE(review): pickle can execute arbitrary code while loading - only read
# files from a trusted source.
with open("data/examples.pickle", "rb") as examples_file:
    example = pickle.load(examples_file)

# Read the results table written by the previous module and report its size.
raw_results_df = feather.read_feather("data/raw_results_After_2ndModule.feather")
print(len(raw_results_df))

90


In [4]:
# Inclusive limits on the number of named entities per question.
min_ent_num = 1
max_ent_num = 7
# Inclusive limits on the number of tokens per question.
min_token_num = 8
max_token_num = 30

# Build the filter with the limits above.
ST_Processing = Syntactic_Temporal_Processing(
    min_ent_num=min_ent_num,
    max_ent_num=max_ent_num,
    min_token_num=min_token_num,
    max_token_num=max_token_num,
)

In [5]:
# Apply the syntactic filters to the raw results, then report how many rows
# survived.
basic_filtered_results_df = ST_Processing(raw_results_df)
remaining_rows = len(basic_filtered_results_df)
print(remaining_rows)

76


In [6]:
# Keep a copy of the answer column under "org_answer".
basic_filtered_results_df["org_answer"] = basic_filtered_results_df["answer"]
# Add the two "trans_*" columns, initialised to empty strings.
for _empty_col in ("trans_que", "trans_ans"):
    basic_filtered_results_df[_empty_col] = ""

In [7]:
# Keep only the columns listed below, in this order, and write the result to
# a Feather file.
basic_filtered_results_df = basic_filtered_results_df.loc[
    :,
    [
        "question",
        "answer",
        "org_answer",
        "ans_pos",
        "ans-sent_pos",
        "para_id",
        "trans_que",
        "trans_ans",
    ],
]
basic_filtered_results_df.to_feather("data/raw_results_After_3rdModule.feather")