# Assignment 04: Question Answering

## Task 01: Answer Sentence Detector

### Loading required packages and Get Access to Google Drive

Get access to Google Drive:

In [2]:
# Mount Google Drive at /content/drive so the 'drive/MyDrive/...' paths used
# by the later cells can be read and written from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


* Unzip the data
(uncomment the code below if you are running the notebook for the first time)

In [None]:
# Location of the SQuAD v1 archive that was copied to Google Drive.
copied_data_path = 'drive/MyDrive/squadv1.zip' 
# First run only: uncomment the next line to extract the archive.
# IPython expands {name} inside shell commands; a bare `copied_data_path`
# would be handed to unzip as a literal file name.
# !unzip {copied_data_path} -d 'drive/MyDrive/Colab Notebooks/data'

Stanford [stanza](https://stanfordnlp.github.io/stanza/) is used for POS tagging, NER and dependency parsing in section one of the assignment, since it is reported to be built on state-of-the-art NLP models.


*   Installing required packages

In [None]:
# Install stanza
!pip install stanza
# install sentence-transformer for vectorizing sentences
!pip install -U sentence-transformers

# install pickle to save objects's files
! pip install pickle5


*   Setting up Stanford CoreNLP 

In [4]:
# Fetch the Stanford CoreNLP distribution through Stanza and publish its
# location through the CORENLP_HOME environment variable.
import os

import stanza

# The archive is several hundred MB, so this step can take a few minutes
# depending on the network speed.
corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)

# CORENLP_HOME must point at the directory CoreNLP was installed into.
os.environ["CORENLP_HOME"] = corenlp_dir

2021-01-15 17:12:20 INFO: Installing CoreNLP package into ./corenlp...
Downloading http://nlp.stanford.edu/software/stanford-corenlp-latest.zip: 100%|██████████| 505M/505M [03:52<00:00, 2.17MB/s]


### Constants definition
This section defines constant variables, such as the training data paths and the important tags that are used in POS tagging.

In [5]:
# ---- data locations ----
data_path = 'drive/MyDrive/Colab Notebooks/data/squadv1/'
train_file = 'train-v1.1.json'
eval_file = 'dev-v1.1.json'
# directory where intermediate feature files and the trained model are pickled
documents_path = 'drive/MyDrive/Colab Notebooks/documents/'


# Penn Treebank POS tags that are considered important in a query.
# Fixed: 'VBS' is not a Penn Treebank tag (the past-tense tag is 'VBD'), and
# 'WP' was listed twice while the WH-determiner tag 'WDT' was missing.
POS_tags = {'NN', 'NNS', 'NNP', 'NNPS',                 # nouns
            'JJ', 'JJR', 'JJS',                         # adjectives
            'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',    # verbs
            'WDT', 'WP', 'WP$', 'WRB',                  # WH terms
            'CD'                                        # numbers
            }

# name of the SentenceTransformer model used for vectorizing sentences
sentence_vectorizing_model = 'stsb-distilbert-base'

# WH words to detect in questions; the value is the position of that word's
# flag inside the wh-presence feature list
wh_terms = {'why':0, 'where':1 ,'when':2 ,'what':3 ,'who':4}

# NER labels mapped to the index of the counter they increment in the
# linguistic feature list
# 0: locations/organizations   1: date/time   2: other named entities
linguistic_features = {'CITY': 0, 'COUNTRY': 0, 'LOCATION': 0,  'ORGANIZATION': 0,
                       'DATE': 1, 'TIME': 1,
                       'RELIGION': 2, 'PERSON': 2,  'TITLE': 2,  'IDEOLOGY': 2, 'NATIONALITY': 2, 'STATE_OR_PROVINCE': 2, 'CAUSE_OF_DEATH': 2,  'CRIMINAL_CHARGE': 2}



### Answer_Sentence_Detector class implementation



*   import requirements



In [6]:
import json
import pickle5 as pickle

from stanza.server import CoreNLPClient
from sentence_transformers import SentenceTransformer
import torch
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

from numpy import linalg as LA
import numpy as np




*   For more details on the Stanza NLP models, check [here](https://colab.research.google.com/github/stanfordnlp/stanza/blob/master/demo/Stanza_CoreNLP_Interface.ipynb#scrollTo=lIO4B5d6Rk4I)
*   Sentences are vectorized with a DistilBERT model from the sentence-transformers library published by [UKPLab](https://github.com/UKPLab/sentence-transformers).
The 'stsb-distilbert-base' model is used because it is transformer-based and is one of the best models in terms of performance and speed according to the [given results](https://docs.google.com/spreadsheets/d/14QplCdTCDwEmTqrn1LH4yrbKvdogK4oQvYO1K1aPR5M/edit#gid=0)
* For root extraction a lemmatizer is used instead of a stemmer, as it performs better.



In [None]:
class Answer_Sentence_Detector():
  """ a class to detect answer sentence among all sentence of a paragraph using sentence-question features
  """
  def __init__(self,train_data_path,k):
      with open(train_data_path, 'r',encoding='utf8') as train_file:
        # json file is a dictionary with keys 'data' and 'version'
        data = json.load(train_file)['data']

        # a dictionary that wach key is a doc_id and it's value is doc paraghraph text
        self.paragraph_dict = dict()
        # a dictionary that each key has a list ,which are in the same order,
        # including question's data and answer and question's paraghraph id in order
        self.question_answer_dict = {'question':[],
                                      'answer':[],
                                      'paragraph_id':[]
                                     }
        # use it for indexing paraghraphs
        doc_id =0

        for title_docs in data:
          for doc in title_docs['paragraphs']:
            self.paragraph_dict.update({doc_id:doc['context']})
            for qas in doc['qas']:
              for answer in qas['answers']:
                self.question_answer_dict['question'].append(qas['question'])
                # store answer_start too, as we need the start index to find the sentence that asnwer is in
                self.question_answer_dict['answer'].append(answer)
                self.question_answer_dict['paragraph_id'].append(doc_id)
            doc_id+=1
      # call preprocessor
      self.preprocess(k)

  def preprocess(self,k):

    print('start peprocessor')

    # ######## remove paraghraphs and related property which has more than k sentnces ########

    # the index of current doc index in the paraghraph's id array
    doc_id_index =0
    # the index of current doc
    doc_id =0

    while doc_id < len(self.paragraph_dict): # for each paragraph
      # some sentences starts with \n, remove all \n as we use it to seperate sentences and these are redundant
      self.paragraph_dict[doc_id] = self.paragraph_dict[doc_id].replace('\n','') 

      # get number of sentence each doc has
      doc_sentences_len = len(self.sent_tokenize((self.paragraph_dict[doc_id])))
      if doc_sentences_len > k:   # if the current doc's sentence is more than k
        # then remove it from self.doc_dict and also 
        # remove related questions and answers from self.question_answer_dict

        # remove paraghraph 
        self.paragraph_dict.pop(doc_id)

        # remove related questions and answers
        while self.question_answer_dict['paragraph_id'][doc_id_index]<=doc_id:
          if self.question_answer_dict['paragraph_id'][doc_id_index]<doc_id:
            doc_id_index+=1
          else:
            self.question_answer_dict['question'].pop(doc_id_index)
            self.question_answer_dict['answer'].pop(doc_id_index)
            self.question_answer_dict['paragraph_id'].pop(doc_id_index)
      
      doc_id+=1
    
    # call self_.POS_tagger for all questions and return refrmed ones
    # (it's been explain in the function the way it changes them)

    # replace questions with refrmed ones
    self.question_answer_dict['question'] = self.POS_tagger(self.question_answer_dict['question'])
    print('finished preproccessor')
  
  def POS_tagger(self, questions_list):
    """
      Rewrite every question as a space-separated string of lemmas.

      Only tokens whose POS tag is in POS_tags are considered. If more than one
      of them is a proper noun (NNP/NNPS) the result consists of the proper-noun
      lemmas only; otherwise it consists of the lemmas of all considered tokens.
      Every lemma is followed by a single space, so results end with a space.

      NOTE(review): the local names in the original suggested a different intent
      (fall back to the unchanged question) - confirm that preferring proper
      nouns is really what is wanted.

      @param  questions_list(list): question strings
      @return (list): rewritten questions, in the same order as the input
    """
    proper_noun_tags = ('NNP', 'NNPS')
    edited_questions = []

    # a CoreNLP server restricted to the annotators needed here
    with CoreNLPClient(
          annotators=['pos','lemma'],
          timeout=30000,
          memory='4G', endpoint='http://localhost:9001', be_quiet=True) as client:

      for index, question in enumerate(questions_list):
        annotation = client.annotate(question)
        selected_tokens = [token
                           for sentence in annotation.sentence
                           for token in sentence.token
                           if token.pos in POS_tags]

        proper_nouns_only = ''.join(token.lemma + ' '
                                    for token in selected_tokens
                                    if token.pos in proper_noun_tags)
        all_selected = ''.join(token.lemma + ' ' for token in selected_tokens)

        # more than one proper noun: keep only those, else keep every selected lemma
        if len(proper_nouns_only.split()) > 1:
          edited_questions.append(proper_nouns_only)
        else:
          edited_questions.append(all_selected)

        if index != 0 and index % 10000 == 0:
          print('*POS_tagger* processed {:16d}\tover\t{:16d} qustions'.format(index, len(questions_list)))
    return edited_questions

  def sent_tokenize(self, paragraph):
    """
      Split a paragraph into sentences.

      A sentence boundary is placed after every '. ' and every '? '; the
      trailing space stays with the sentence that precedes the boundary. Any
      '\n' already present in the text also acts as a boundary, and empty
      pieces are dropped.

      @param  paragraph(str): text to split
      @return (list): the non-empty sentences in their original order
    """
    # mark each boundary with '\n' ...
    marked_paragraph = paragraph.replace('. ','. \n').replace('? ','? \n')
    # ... then cut at the marks and keep the non-empty pieces
    return [piece for piece in marked_paragraph.split('\n') if piece]

  def sent_vectorize(self, paragraph_dict, question_answer_dict, train= True):
    """
      Encode every paragraph sentence and every question with the
      SentenceTransformer model named by sentence_vectorizing_model.

      @param  paragraph_dict(dictionary): a dictionary like {'paragraph_id': 'paragraph',...}
      @param  question_answer_dict(dictionary): a dictionary like {'question':['q1',...] ,...
                                                                    'answer': ['a1', ...], ...
                                                                    'paragraph_id': ['id1', ...]
                                                                    }
      @param  train(bool): when True the result is also pickled to
                           documents_path + 'vectorized_train_data.pickle'
      @return (dict): {'doc_vec': {paragraph_id: encoded sentences of that paragraph},
                       'question_vec': np.array with one encoded question per entry}
    """
    # prefer the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('current available device is "{}" for vectorizing'.format(device))
    vectorizing_model = SentenceTransformer(sentence_vectorizing_model, device=device)

    # ---- paragraphs: one encode call per paragraph, over its sentence list ----
    sentences_vec = {}
    for position, (doc_id, doc) in enumerate(paragraph_dict.items()):
      sentences_vec[doc_id] = vectorizing_model.encode(self.sent_tokenize(doc))
      if position != 0 and position % 1000 == 0:
        print('* vectorizing sentence * {:10s}:{:12d}  {:10s}  {:12d}'.format('processed',position,'over',len(paragraph_dict)))

    # ---- questions: one encode call per question ----
    all_questions = question_answer_dict['question']
    questions_vec = []
    for position, question in enumerate(all_questions):
      questions_vec.append(vectorizing_model.encode(question))
      if position != 0 and position % 100000 == 0:
        print('* vectorizing question * {:10s}:{:12d}  {:10s}  {:12d}'.format('processed',position,'over',len(all_questions)))

    vectorized = {'doc_vec':sentences_vec,'question_vec':np.array(questions_vec)}

    if train: # training data is cached on disk for later runs
      with open(documents_path + 'vectorized_train_data.pickle','wb') as vectorized_file:
        pickle.dump(vectorized, vectorized_file, protocol= pickle.HIGHEST_PROTOCOL)

    return vectorized
  
  def calculate_vector_distances(self, type, vectorized_data ,question_answer_dict, train= True ):
    """
      Compare every question vector with the sentence vectors of its paragraph.

      @param  type(int): valid value for it is 1,2, or 3 which shows which type of distance calculator 
                        is going to be used
                        type = 1 : cosine similarity
                        type = 2 : Euclidean distance
                        type = 3 : dot product    
      @param  vectorized_data(dict): vectorized_data's structure is like: 
                                    {
                                      'doc_vec': {
                                                  'doc_id': sentences_vec,..
                                                  }
                                      'question_vec': questions_vec
                                    }
      @param  question_answer_dict(dict): only the parallel lists 'question' and
                                          'paragraph_id' are read
      @param  train(bool): when True the result is also pickled to
                           documents_path + 'distances_train_<type>.pickle'
      @return distance_dict(dict): distance_dict's structure is like:
        {'par_id':[
                    ['first_question/first_sentence distance', 'first_question/second_sentence distance', ...]
                    ['second_question/first_sentence distance', 'second_question/second_sentence distance', ...]
                    ,...
                  ]
          ,...
        }
      @raise  ValueError: if type is not 1, 2 or 3 (previously this surfaced as
                          an unbound-local error inside the loop)
    """
    if type not in (1, 2, 3):
      raise ValueError('type must be 1 (cosine similarity), 2 (Euclidean distance) '
                       'or 3 (dot product), got {!r}'.format(type))

    # a variable to store distances 
    distance_dict = dict()
    # size of all questions
    questions_list_size = len(question_answer_dict['question']) 

    for index in range(questions_list_size):
      # vector and paragraph id that belong to this question
      question_vec = vectorized_data['question_vec'][index]
      doc_id = question_answer_dict['paragraph_id'][index]
      sentence_vecs = vectorized_data['doc_vec'][doc_id]

      # one value per sentence of the question's paragraph
      if type == 1:   # cosine similarity
        # the question norm is the same for every sentence, so compute it once
        question_norm = LA.norm(question_vec)
        question_distance = [np.dot(sentence_vec,question_vec)/
                            (LA.norm(sentence_vec) * question_norm)
                              for sentence_vec in sentence_vecs]
      elif type == 2:   # Euclidean distance
        question_distance = [LA.norm(question_vec - sentence_vec)
                              for sentence_vec in sentence_vecs]
      else:   # type == 3: dot product
        question_distance = [np.dot(sentence_vec,question_vec)
                              for sentence_vec in sentence_vecs]

      # group the per-question lists by paragraph
      distance_dict.setdefault(doc_id, []).append(question_distance)

      if index % 100000 == 0 and index != 0:
        print('* calculate vector distance * {:10s}:{:12d}  {:10s}  {:12d}'.format('processed',index,'over',questions_list_size))

    if train: # save data if it's called for training set
      with open(documents_path+ 'distances_train_{}.pickle'.format(type), 'wb') as distances_file:
        pickle.dump(distance_dict, distances_file, protocol= pickle.HIGHEST_PROTOCOL)

    return distance_dict

  def compare_roots(self, paragraph_dict, question_answer_dict, train= True):
    """
        Flag, for every (question, sentence) pair of a paragraph, whether the
        lemma of the question's dependency root equals the root lemma of the
        sentence. A paragraph "sentence" may be split further by CoreNLP; a
        match with the root of any of its parts counts.

        The questions of a paragraph are expected to be stored contiguously and
        in the same order as the keys of paragraph_dict.

        Changes compared with the previous version:
        * the sentences of a paragraph are parsed once per paragraph instead of
          once per question of that paragraph (same result, far fewer requests)
        * a question for which CoreNLP returns no sentence (e.g. an empty
          string) yields all zeros instead of raising IndexError

        @param  paragraph_dict(dict): {paragraph_id: paragraph text}
        @param  question_answer_dict(dict): the parallel lists 'question' and
                                            'paragraph_id' are read
        @param  train(bool): when True the result is also pickled to
                             documents_path + 'train_compared_roots.pickle'
        @return compared_roots_dict(dict): its structure is like:
        {'par_id':[
                    ['first_question/first_sentence root_sim', 'first_question/second_sentence root_sim', ...]
                    ['second_question/first_sentence root_sim', 'second_question/second_sentence root_sim', ...]
                    ,...
                  ]
          ,...
        }
    """
    def root_lemma(parsed_sentence):
      """Lemma of the first dependency root of a parsed sentence, or None when it has no root."""
      roots = parsed_sentence.basicDependencies.root
      if len(roots) == 0:
        return None
      # root indices are treated as 1-based, hence the -1 when indexing tokens
      return parsed_sentence.token[roots[0]-1].lemma

    compared_roots_dict= dict()

    with CoreNLPClient(
    annotators=['tokenize','ssplit','pos','depparse','lemma']
    , timeout=60000, memory='4G', endpoint='http://localhost:9001', be_quiet=True) as client:

      question_index = 0
      questions_size = len(question_answer_dict['question'])

      for paragraph_id in paragraph_dict.keys():
        compared_roots_dict[paragraph_id] = list()
        sentences_list = self.sent_tokenize(paragraph_dict[paragraph_id])

        # root lemmas of each sentence; filled on the first question so that
        # paragraphs without questions are never sent to the server
        sentence_roots = None

        # consume every question that belongs to this paragraph
        while question_index <questions_size and question_answer_dict['paragraph_id'][question_index] == paragraph_id:
          if sentence_roots is None:
            sentence_roots = [{root_lemma(parsed) for parsed in client.annotate(sentence).sentence}
                              for sentence in sentences_list]

          # only the first parsed sentence of the question is used
          question_sentences = client.annotate(question_answer_dict['question'][question_index]).sentence
          if len(question_sentences) > 0:
            question_root_lemma = root_lemma(question_sentences[0])
          else:
            question_root_lemma = None

          # 1 when the question and the sentence share a root lemma, else 0
          compared_roots_dict[paragraph_id].append(
              [1 if question_root_lemma is not None and question_root_lemma in roots else 0
               for roots in sentence_roots])

          if question_index % 10000 == 0 and question_index != 0:
            print('* root comprision * {:10s}:{:12d}  {:10s}  {:12d}'.format('processed',question_index,'over',questions_size))
          question_index += 1

    if train: # save file if it's called for training set
      with open(documents_path+ 'train_compared_roots.pickle', 'wb') as compared_roots_file:
        pickle.dump(compared_roots_dict, compared_roots_file, protocol= pickle.HIGHEST_PROTOCOL)
    # return result as a dictionary
    return compared_roots_dict
      
  def check_wh_presence(self,question_answer_dict, train= True):
    """
      Build one 0/1 flag list per question that marks which WH words occur in it.

      The flag positions are the ids given in wh_terms. Tokens are obtained by
      whitespace splitting; each token is stripped of surrounding punctuation
      and lower-cased before the lookup, so 'What' and 'who?' are recognised
      (the previous exact-match lookup missed them).

      @param  question_answer_dict(dict): only the 'question' list is read
      @param  train(bool): when True the result is also pickled to
                           documents_path + 'train_wh_feature.pickle'
      @return wh_feature_list(list): one list of len(wh_terms) flags per question
    """
    questions = question_answer_dict['question']
    # a list to store wh presence
    wh_feature_list = list()

    for index, question in enumerate(questions):
      # one flag per WH word defined in the constants cell
      wh_feature = [0] * len(wh_terms)
      for token in question.split():
        normalized_token = token.strip('?.,!;:"\'()').lower()
        if normalized_token in wh_terms:
          wh_feature[wh_terms[normalized_token]] = 1
      wh_feature_list.append(wh_feature)

      if index % 10000 == 0 and index != 0:
        # the label used to read 'root comprision', copied from compare_roots
        print('* wh presence * {:10s}:{:12d}  {:10s}  {:12d}'.format('processed',index,'over',len(questions)))

    if train:    # save data if it's called for training set  
      with open(documents_path + 'train_wh_feature.pickle' , 'wb') as wh_feature_file:
        pickle.dump(wh_feature_list, wh_feature_file, protocol= pickle.HIGHEST_PROTOCOL)

    return wh_feature_list
      
  def extract_linguistic_features(self,paragraph_dict, question_answer_dict, train= True):
    """
      Count named entities per paragraph sentence and per question.

      Every count list has one slot per distinct id in linguistic_features; a
      token whose NER label is a key of linguistic_features increments the slot
      that label maps to.

      @param  paragraph_dict(dict): {paragraph_id: paragraph text}
      @param  question_answer_dict(dict): only the 'question' list is read
      @param  train(bool): when True the result is also pickled to
                           documents_path + 'train_linguistic_feature.pickle'
      @return (dict): {'paragraph': {paragraph_id: [count list of each sentence]},
                       'question': [count list of each question]}
    """
    # number of distinct feature ids = length of every count list
    features_num = len(np.unique(list(linguistic_features.values())))

    def count_entities(client, text):
      """Annotate text and return its per-feature entity counts."""
      counts = [0] * features_num
      for parsed in client.annotate(text).sentence:
        for token in parsed.token:
          if token.ner in linguistic_features:
            counts[linguistic_features[token.ner]] += 1
      return counts

    paragraph_features = dict()
    question_features = []
    linguistic_feature_dict = {'paragraph': paragraph_features,
                               'question': question_features}

    with CoreNLPClient(
        annotators=['ner'],
        timeout=30000,
        memory='4G', endpoint='http://localhost:9001', be_quiet=True) as client:

        # ---- sentences of every paragraph ----
        for position, (paragraph_id, paragraph) in enumerate(paragraph_dict.items()):
          per_sentence = list()
          paragraph_features[paragraph_id] = per_sentence
          for sentence in self.sent_tokenize(paragraph):
            per_sentence.append(count_entities(client, sentence))

          if position != 0 and position % 1000 == 0:
            print('* linguistic sentence * {:10s}:{:12d}  {:10s}  {:12d}'.format('processed',position,'over',len(paragraph_dict)))

        # ---- questions ----
        all_questions = question_answer_dict['question']
        for position, question in enumerate(all_questions):
          question_features.append(count_entities(client, question))
          if position != 0 and position % 10000 == 0:
            print('* linguistic question * {:10s}:{:12d}  {:10s}  {:12d}'.format('processed',position,'over',len(all_questions)))

    if train: 
      with open(documents_path + 'train_linguistic_feature.pickle', 'wb') as linguistic_feature_file:
        pickle.dump(linguistic_feature_dict, linguistic_feature_file, protocol= pickle.HIGHEST_PROTOCOL)

    return linguistic_feature_dict

  def transform_data(self, sentence_linguistic_feature= None, 
                     cosine_similarity_dict = None, euclidean_distance_dict = None,
                     dot_product_dict = None, root_comparision_dict = None,
                     wh_presence_list = None, question_linguistic_feature = None, train= True):
    """
      Merge all extracted features into one flat vector per (question, sentence) pair.

      With train=True the arguments are ignored: every feature is loaded from
      its pickle under documents_path, or computed from self.paragraph_dict /
      self.question_answer_dict when the file cannot be opened, and the merged
      result is pickled to documents_path + 'train_transformed_data.pickle'.
      With train=False (evaluation / query) the features passed in are used.

      Layout of one pair vector, in this order:
        <sentence_linguistic_feature>   list (size 3 with the notebook constants)
        <cosine similarity>             float
        <Euclidean distance>            float
        <dot product>                   float
        <root comparision>              int (0 or 1)
        <wh_presence>                   list (size 5 with the notebook constants)
        <question_linguistic_feature>   list (size 3 with the notebook constants)
      i.e. 15 values per pair with the notebook constants.

      The wh/question-linguistic lists are indexed by a running question
      counter, so the questions must be ordered like the traversal of
      cosine_similarity_dict.

      @return transformed_data(dict):
          {<paragraph_id>: [
                             [ <first question,first sentence features>,
                               <first question,second sentence features>,...
                             ],
                             [ <second question,first sentence features>,
                               <second question,second sentence features>,...
                             ], ...
                           ],...
          }
    """
    def load_cached(file_name):
      """Unpickle a feature file from documents_path (only meant for files this notebook wrote itself)."""
      with open(documents_path + file_name, 'rb') as cached_file:
        return pickle.load(cached_file)

    if train:
      # ---- the three distance features ----
      try:
        cosine_similarity_dict = load_cached('distances_train_1.pickle')
        euclidean_distance_dict = load_cached('distances_train_2.pickle')
        dot_product_dict = load_cached('distances_train_3.pickle')
      except IOError: # at least one file is missing: recompute all three
        print('*'*8, 'some distances files for training do not exist, calculating them might take a while','*'*8)

        try: # the sentence/question vectors may already be cached
          vectorized_data = load_cached('vectorized_train_data.pickle')
        except IOError:
          print('*'*8, 'vectorized data file does not exist, preparing it might take a while','*'*8)
          vectorized_data = self.sent_vectorize(self.paragraph_dict, self.question_answer_dict)

        cosine_similarity_dict = self.calculate_vector_distances(1, vectorized_data, self.question_answer_dict)
        euclidean_distance_dict = self.calculate_vector_distances(2, vectorized_data, self.question_answer_dict)
        dot_product_dict = self.calculate_vector_distances(3, vectorized_data, self.question_answer_dict)

      # ---- root comparison ----
      try:
        root_comparision_dict = load_cached('train_compared_roots.pickle')
      except IOError:
        print('*'*8,'roots comparision file for training does not exist, preparing it might take a while','*'*8)
        root_comparision_dict= self.compare_roots(self.paragraph_dict, self.question_answer_dict)

      # ---- wh presence ----
      try:
        wh_presence_list = load_cached('train_wh_feature.pickle')
      except IOError:
        print('*'*8, 'wh_feature file for training does not exist, preparing it might take a while','*'*8)
        wh_presence_list = self.check_wh_presence(self.question_answer_dict)

      # ---- linguistic (NER) features of sentences and questions ----
      try:
        linguistic_feature = load_cached('train_linguistic_feature.pickle')
      except IOError:
        print('*'*8, 'linguistic features file for training does not exist, preparing it might take a while','*'*8)
        linguistic_feature = self.extract_linguistic_features(self.paragraph_dict, self.question_answer_dict)
      sentence_linguistic_feature = linguistic_feature['paragraph']
      question_linguistic_feature = linguistic_feature['question']

    transformed_data = dict()
    # running count of the questions seen so far, across all paragraphs
    questions_index = 0
    # cosine_similarity_dict already has the paragraph -> question -> sentence
    # nesting wanted for the result, so it drives the traversal
    for paragraph_id, cosines_per_question in cosine_similarity_dict.items():
      paragraph_rows = list()
      transformed_data[paragraph_id] = paragraph_rows

      for question_index, cosines in enumerate(cosines_per_question):
        pair_rows = list()
        paragraph_rows.append(pair_rows)

        for sentence_index, cosine in enumerate(cosines):
          pair_rows.append([
              *sentence_linguistic_feature[paragraph_id][sentence_index],               # 1
              cosine,                                                                    # 2
              euclidean_distance_dict[paragraph_id][question_index][sentence_index],    # 3
              dot_product_dict[paragraph_id][question_index][sentence_index],           # 4
              root_comparision_dict[paragraph_id][question_index][sentence_index],      # 5
              *wh_presence_list[questions_index],                                        # 6
              *question_linguistic_feature[questions_index],                             # 7
          ])

        questions_index +=1

    if train:
      with open(documents_path + 'train_transformed_data.pickle', 'wb') as transformed_data_file:
        pickle.dump(transformed_data, transformed_data_file, protocol = pickle.HIGHEST_PROTOCOL)

    return transformed_data
  
  def extract_label(self, paragraph_dict, question_answer_dict):
    """
      Build a 0/1 label for every (answer, sentence) pair: 1 when the answer's
      start offset falls inside the sentence, 0 otherwise.

      Answers are consumed in order, so the entries of question_answer_dict have
      to be grouped by paragraph, in the same order as the keys of paragraph_dict.

      @param paragraph_dict(dict): {<paragraph_id>: <paragraph text>}
      @param question_answer_dict(dict): parallel lists; 'answer' holds dicts with an
                                         'answer_start' character offset and
                                         'paragraph_id' holds the id of the paragraph
                                         each answer belongs to

      @return label_dict(dict): -	{<paragraph_id>: [
                                                    [ <first answer, first sentence label>,
                                                      <first answer, second sentence label>, ...
                                                    ],
                                                    [ <second answer, first sentence label>,
                                                      <second answer, second sentence label>, ...
                                                      ], ...
                                                    ],...
                                  	}
 
    """
    answer_index = 0
    answers_size = len(question_answer_dict['answer'])
    label_dict = dict()
    for paragraph_id in paragraph_dict.keys():

      paragraph_text = paragraph_dict[paragraph_id]
      label_dict.update({paragraph_id: list()})
      sentences_list = self.sent_tokenize(paragraph_text)

      # Locate every sentence inside the paragraph once, as a half-open
      # [start, end) character span. Searching the real text (instead of summing
      # sentence lengths) keeps the offsets right when the tokenizer drops the
      # whitespace between sentences.
      sentence_spans = list()
      search_from = 0
      for sentence in sentences_list:
        start = paragraph_text.find(sentence, search_from)
        if start == -1:
          # sentence text was altered by the tokenizer; fall back to the running offset
          start = search_from
        end = start + len(sentence)
        sentence_spans.append((start, end))
        search_from = end

      while answer_index < answers_size and question_answer_dict['paragraph_id'][answer_index] == paragraph_id:

        answer_start_index = question_answer_dict['answer'][answer_index]['answer_start']
        # start <= offset < end: an answer at offset 0 belongs to the first sentence
        label_dict[paragraph_id].append(
            [1 if start <= answer_start_index < end else 0 for start, end in sentence_spans])

        if answer_index % 10000 ==0:
          print('* transform data * {:10s}:{:12d}  {:10s}  {:12d}'.format('processed',answer_index,'over',answers_size))
        answer_index += 1

    return label_dict 
      
  def prepare_data_for_classifier(self, transformed_data, label):
    """
      Flatten the nested per-paragraph / per-question / per-sentence structures
      into two aligned arrays that can be handed to the classifier.

      @param transformed_data(dict): {<paragraph_id>: [[<sentence feature vector>, ...], ...]}
                                     one inner list per question
      @param label(dict): same nesting as transformed_data, holding one label
                          per sentence-question pair

      @return (x_vec, y_vec): np.array of feature vectors and np.array of their labels,
                              in the same order
    """
    features = []
    targets = []
    for pid, questions in transformed_data.items():
      for question_pos, sentence_vectors in enumerate(questions):
        for sentence_pos, vector in enumerate(sentence_vectors):
          # feature vector of this sentence-question pair ...
          features.append(vector)
          # ... and the label stored at the very same position
          targets.append(label[pid][question_pos][sentence_pos])

    # the model expects np.array input rather than plain lists
    return np.array(features), np.array(targets)

  def fit(self, transformed_data_dict):
    """
      Train a Gaussian naive Bayes classifier on the given transformed data and
      store the trained model as 'GNB_model.pickle' under documents_path.

      Labels are derived from self.paragraph_dict and self.question_answer_dict.

      @param transformed_data_dict(dict): feature vectors as accepted by
                                          prepare_data_for_classifier
      @return clf: the fitted GaussianNB object
    """
    training_labels = self.extract_label(self.paragraph_dict, self.question_answer_dict)
    features, targets = self.prepare_data_for_classifier(transformed_data_dict, training_labels)

    # sklearn GaussianNB does the classification
    classifier = GaussianNB()
    classifier.fit(features, targets)

    # persist the model so the other parts can reuse it without retraining
    model_path = documents_path + 'GNB_model.pickle'
    with open(model_path, 'wb') as model_file:
      pickle.dump(classifier, model_file, protocol=pickle.HIGHEST_PROTOCOL)

    print('saved model')
    return classifier
    
  def evaluate(self, evaluation_json_path):
    """
      Compute the classifier's accuracy on an evaluation json file.

      The file is read as {'data': [{'paragraphs': [{'context': ..., 'qas':
      [{'question': ..., 'answers': [...]}, ...]}, ...]}, ...]}; every answer of
      every question becomes one evaluation sample.

      Cached artefacts under documents_path are reused when present and created
      otherwise: 'eval_transformed_data.pickle' (evaluation features),
      'GNB_model.pickle' (trained model) and 'train_transformed_data.pickle'
      (training features, only needed when the model has to be trained here).

      @param evaluation_json_path(str): path of the evaluation json file
      @return score: accuracy_score of the predictions over all sentence-question pairs
    """
    # a dictionary in which each key is a doc_id and its value is the paragraph text
    paragraph_dict = dict()
    # parallel lists, all in the same order: question text, answer dict and
    # the id of the paragraph the question belongs to
    question_answer_dict = {'question':[],
                                'answer':[],
                                'paragraph_id':[]
                                  }
    with open(evaluation_json_path, 'r', encoding= 'utf8') as eval_json_file:
      eval_data = json.load(eval_json_file)['data']

    # running index used as paragraph id
    doc_id = 0
    for title_docs in eval_data:
      for doc in title_docs['paragraphs']:
        paragraph_dict.update({doc_id:doc['context']})
        for qas in doc['qas']:
          for answer in qas['answers']:
            question_answer_dict['question'].append(qas['question'])
            # keep the whole answer dict: answer_start is needed to find the sentence the answer is in
            question_answer_dict['answer'].append(answer)
            question_answer_dict['paragraph_id'].append(doc_id)
        doc_id+=1

    # a dictionary to store transformed_data
    transformed_data = dict()
    try:  # try to load transformed_data file
      # NOTE: pickle.load executes arbitrary code; only load cache files this notebook wrote itself
      with open(documents_path + 'eval_transformed_data.pickle', 'rb') as transformed_data_file:
        transformed_data = pickle.load(transformed_data_file)
    except IOError:
      print('*'*8,'required file does not exist, preparing them might take a while','*'*8)

      # replace questions with reformed ones
      question_answer_dict['question'] = self.POS_tagger(question_answer_dict['question'])
      # vectorize questions and paragraphs sentences
      vectorized_data = self.sent_vectorize(paragraph_dict, question_answer_dict, train= False)

      print('----------------   finished vectorizing    ----------------')
      # calculate distances
      cosine_similarity  = self.calculate_vector_distances(1, vectorized_data, question_answer_dict, train= False)
      euclidean_distance = self.calculate_vector_distances(2, vectorized_data, question_answer_dict, train= False)
      dot_product = self.calculate_vector_distances(3, vectorized_data, question_answer_dict, train= False)
      print('----------------   finished distances    ----------------')
      # compare roots
      compared_roots_data = self.compare_roots(paragraph_dict, question_answer_dict, train= False)
      print('----------------   finished roots comparision    ----------------')
      # wh features
      wh_feature_list = self.check_wh_presence(question_answer_dict, train= False)
      print('----------------   finished wh feature    ----------------')

      # extract linguistic feature
      linguistic_feature = self.extract_linguistic_features( paragraph_dict, question_answer_dict, train= False)
      print('----------------   finished linguisitc extraction    ----------------')

      transformed_data = self.transform_data(sentence_linguistic_feature= linguistic_feature['paragraph'],
                      cosine_similarity_dict = cosine_similarity, euclidean_distance_dict = euclidean_distance,
                      dot_product_dict = dot_product, root_comparision_dict = compared_roots_data,
                      wh_presence_list = wh_feature_list, question_linguistic_feature = linguistic_feature['question'], train= False)

      print('----------------   finished trnsforming data    ----------------')

      # cache the features so the next evaluation can skip the feature extraction
      with open(documents_path + 'eval_transformed_data.pickle', 'wb') as transformed_data_file:
        pickle.dump(transformed_data, transformed_data_file, protocol= pickle.HIGHEST_PROTOCOL)

    labels = self.extract_label(paragraph_dict, question_answer_dict)

    # Only the model loading is guarded: an IOError raised later, while scoring,
    # must not be mistaken for "model has not been trained".
    try: # try to load model
      with open(documents_path + 'GNB_model.pickle', 'rb') as model_file:
        model = pickle.load(model_file)
    except IOError:
      print('*'*8, 'model has not been trained, training process might take a while', '*'*8)
      try:
        with open(documents_path + 'train_transformed_data.pickle', 'rb') as transformed_data_file:
          train_transformed_data = pickle.load(transformed_data_file)
      except IOError:
        print('*'*8, 'transformed data for training does not exist,  preparing it might take a while', '*'*8)
        train_transformed_data= self.transform_data()

      model = self.fit(train_transformed_data)

    # score once, whichever way the model was obtained
    x_eval, y_eval = self.prepare_data_for_classifier(transformed_data, labels)
    y_pred = model.predict(np.array(x_eval))

    score = accuracy_score(y_eval, y_pred)
    print(score)
    return score

  def query(self, question, paragraph):
    """
      Return the sentence of `paragraph` that the classifier assigns to class 1
      (sentence contains the answer) with the highest probability.

      The model is loaded from 'GNB_model.pickle' under documents_path; when that
      file is missing it is trained first, from 'train_transformed_data.pickle'
      if available and from freshly transformed training data otherwise.

      @param question(str): the question text
      @param paragraph(str): the paragraph text to search in
      @return the selected element of self.sent_tokenize(paragraph)
    """
    # convert given paragraph and question to the format that is acceptable by other functions
    paragraph_dict = {0: paragraph}
    question_answer_dict = {'question':[question],
                            'paragraph_id':[0]
                              }

    # replace questions with reformed ones
    question_answer_dict['question'] = self.POS_tagger(question_answer_dict['question'])
    # vectorize questions and paragraphs sentences
    vectorized_data = self.sent_vectorize(paragraph_dict, question_answer_dict, train= False)

    # calculate distances
    cosine_similarity  = self.calculate_vector_distances(1, vectorized_data, question_answer_dict, train= False)
    euclidean_distance = self.calculate_vector_distances(2,  vectorized_data, question_answer_dict, train= False)
    dot_product = self.calculate_vector_distances(3, vectorized_data, question_answer_dict, train= False)
    # compare roots
    compared_roots_data = self.compare_roots(paragraph_dict, question_answer_dict, train= False)
    # wh features
    wh_feature_list = self.check_wh_presence(question_answer_dict, train= False)
    # extract linguistic feature
    linguistic_feature = self.extract_linguistic_features(paragraph_dict, question_answer_dict, train= False)

    # transform data
    transformed_data = self.transform_data(sentence_linguistic_feature= linguistic_feature['paragraph'],
                    cosine_similarity_dict = cosine_similarity, euclidean_distance_dict = euclidean_distance,
                    dot_product_dict = dot_product, root_comparision_dict = compared_roots_data,
                    wh_presence_list = wh_feature_list, question_linguistic_feature = linguistic_feature['question'], train= False,)

    # Only the model loading is guarded: an IOError raised later, while predicting,
    # must not be mistaken for "model has not been trained".
    try: # try to load model
      # NOTE: pickle.load executes arbitrary code; only load model files this notebook wrote itself
      with open(documents_path + 'GNB_model.pickle', 'rb') as model_file:
        model = pickle.load(model_file)
    except IOError:
      print('*'*8, 'model has not been trained, training process might take a while', '*'*8)
      try:  # try to open transformed data file
        with open(documents_path + 'train_transformed_data.pickle', 'rb') as transformed_data_file:
          train_transformed_data = pickle.load(transformed_data_file)
      except IOError:
        print('*'*8, 'transformed data for training does not exist,  preparing it might take a while','*'*8)
        train_transformed_data= self.transform_data()

      model = self.fit(train_transformed_data)

    # column of predict_proba that belongs to each label, as ordered by the model
    class_dict = {label: i for i, label in enumerate(model.classes_)}
    positive_column = class_dict[1]

    paragraph_id = 0
    question_index = 0
    # one probability row per sentence of the single paragraph/question pair
    predict_prob = model.predict_proba(np.array(transformed_data[paragraph_id][question_index]))

    sentence_index = -1
    max_val = -1.0
    # get index of the sentence which has been assigned to class 1 most confidently
    # (strict '>' keeps the first sentence on ties)
    for index, prob in enumerate(predict_prob):
      if prob[positive_column] > max_val:
        max_val = prob[positive_column]
        sentence_index = index
    return self.sent_tokenize(paragraph_dict[0])[sentence_index]


# maximum number of sentences a paragraph may include; picked by trying several
# values of k and checking how much of the data each of them covers
k=7
# construct an object from Answer_Sentence_Detector
sample_object = Answer_Sentence_Detector(data_path + train_file, k)



# call sent_vectorize function
vectorized_data = sample_object.sent_vectorize(sample_object.paragraph_dict,sample_object.question_answer_dict)

# call calculate_vector_distances (1: cosine similarity, 2: euclidean distance, 3: dot product)
cosine_similarity = sample_object.calculate_vector_distances(1, vectorized_data, sample_object.question_answer_dict)
euclidean_distnace = sample_object.calculate_vector_distances(2, vectorized_data, sample_object.question_answer_dict)
dot_product = sample_object.calculate_vector_distances(3, vectorized_data, sample_object.question_answer_dict)

# call compare roots function
root_comparision = sample_object.compare_roots(sample_object.paragraph_dict, sample_object.question_answer_dict)

# call check_wh_presence function (it works on the questions, not on the paragraphs)
wh_precense_list = sample_object.check_wh_presence(sample_object.question_answer_dict)

# call extract_linguistic_feature
linguistic_feature = sample_object.extract_linguistic_features(sample_object.paragraph_dict, sample_object.question_answer_dict)

# call transform_data function
transformed_data = sample_object.transform_data()

# call fit function
model = sample_object.fit(transformed_data)

# call evaluate function
accuracy = sample_object.evaluate(data_path + eval_file) 

# call query function
paragraph = sample_object.paragraph_dict[0] 
question = sample_object.question_answer_dict['question'][0]
answer_sentence = sample_object.query(question, paragraph)

# store the whole detector object so it can be reloaded later
with open(documents_path + 'sample_object.pickle', 'wb') as sample_object_file:
  pickle.dump(sample_object, sample_object_file, protocol= pickle.HIGHEST_PROTOCOL)


### data analysis

* Save every NER tag that occurs in the training file, so that we can decide which of them to use

In [None]:
from stanza.server import CoreNLPClient
# {<NER tag>: [<token value>, ...]} collected over every sentence of the training paragraphs
ner_dict=dict()
with CoreNLPClient(
        annotators=['ner'],
        timeout=30000,
        memory='4G', endpoint='http://localhost:9001', be_quiet=True) as client:

        # use the detector built earlier in this notebook (sample_object)
        paragraphs_count = len(sample_object.paragraph_dict)
        for i,doc in enumerate(sample_object.paragraph_dict.values()):
          for sentence in sample_object.sent_tokenize(doc):
            annotation = client.annotate(sentence)
            for annotated_sentence in annotation.sentence:
              for token in annotated_sentence.token:
                # group token values by their NER tag
                ner_dict.setdefault(token.ner, []).append(token.value)
          if i % 1000 == 0:
            print('{:10s}:{:12d}  {:10s}  {:12d}'.format('processed',i,'over',paragraphs_count))


# store the collected tags for the analysis in the next cell
with open(documents_path+'All_NER.json','w') as ner_file:
  json.dump(ner_dict,ner_file)


* See which NER tags CoreNLPClient produced and which of them we have used

In [None]:
import json

from stanza.server import CoreNLPClient


# print every NER tag found in the training data together with a flag telling
# whether it is one of the keys/members of linguistic_features
with open(documents_path + 'All_NER.json', 'r', encoding = 'utf8') as NER_file:
  ner_d = json.load(NER_file)
  for n in ner_d:
      print('{:10s}  {}'.format(n, n in linguistic_features))
{'RELIGION': 2, 'PERSON': 2, 'CITY': 0, 'COUNTRY': 0, 'TITLE': 2, 'DATE': 1, 'LOCATION': 0,  'ORGANIZATION': 0, 'IDEOLOGY': 2, 'NATIONALITY': 2, 'STATE_OR_PROVINCE': 2, 'CAUSE_OF_DEATH': 2, 'TIME': 1, 'CRIMINAL_CHARGE': 2}

O           False
RELIGION    True
PERSON      True
MISC        False
CITY        True
COUNTRY     True
TITLE       True
DATE        True
NUMBER      False
LOCATION    True
ORGANIZATION  True
SET         False
DURATION    False
IDEOLOGY    True
NATIONALITY  True
ORDINAL     False
STATE_OR_PROVINCE  True
PERCENT     False
URL         False
MONEY       False
CAUSE_OF_DEATH  True
TIME        True
CRIMINAL_CHARGE  True


{'CAUSE_OF_DEATH': 2,
 'CITY': 0,
 'COUNTRY': 0,
 'CRIMINAL_CHARGE': 2,
 'DATE': 1,
 'IDEOLOGY': 2,
 'LOCATION': 0,
 'NATIONALITY': 2,
 'ORGANIZATION': 0,
 'PERSON': 2,
 'RELIGION': 2,
 'STATE_OR_PROVINCE': 2,
 'TIME': 1,
 'TITLE': 2}