### Import

In [2]:
import torch
import transformers
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import wikipedia
import nltk
import truecase
import spacy
import en_core_web_sm
# Needed for POS tagging (uncomment to download the NLTK resources once):
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

### Load Model

In [3]:
# Single checkpoint identifier shared by the tokenizer and the model, so that
# both are always loaded from the same pretrained weights/vocabulary.
MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

In [4]:
# Load the small English spaCy pipeline.
nlp = en_core_web_sm.load()

# Quick look at the dependency label spaCy assigns to each token of a sample question.
doc = nlp('From what direction does the sun rise in the morning ?')
for token in doc:
    print(f"{token.text} {token.dep_}")


From ROOT
what det
direction dobj
does aux
the det
sun nsubj
rise pcomp
in prep
the det
morning pobj
? punct


***
### Model function
Use the loaded BERT model to extract an answer from a context text.<br>
**Input**: question, context text<br>
**Return**: answer

In [5]:
def generate_answer(question, answer_text, max_length=512):
    """Answer `question` by extracting a span from `answer_text`.

    Relies on the module-level `tokenizer` and `model`.

    Args:
        question: The question, as a string.
        answer_text: The context text in which the answer is searched.
        max_length: Maximum number of input tokens (question + context +
            special tokens) fed to the model. Longer inputs are cut at the
            end of the context. Defaults to 512.

    Returns:
        The answer as a string, rebuilt from the tokens between the most
        likely start and end positions, with '##' word pieces merged into the
        preceding token. If the predicted end lies before the predicted
        start, only the start token is returned.
    """
    print("I'm looking for an answer, please wait ...")

    # == Tokenize ==
    # Encode the question and the context as a text pair; the special tokens
    # are required so that the separator can be located below.
    print("-Tokenization")
    input_ids = tokenizer.encode(question, answer_text, add_special_tokens=True)
    sep_id = tokenizer.sep_token_id

    # Truncate by hand (independent of the tokenizer's truncation options):
    # keep the head of the sequence and put the closing [SEP] back.
    if len(input_ids) > max_length:
        input_ids = input_ids[:max_length - 1] + [sep_id]

    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # == Set segment IDs ==
    # Segment A (0s) runs up to and including the first [SEP];
    # everything after it is segment B (1s).
    sep_index = input_ids.index(sep_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    # There should be a segment id for every input token.
    assert len(segment_ids) == len(input_ids)

    # == Run model ==
    # Runs on whatever device the model currently lives on (CPU by default;
    # use model.to(device) beforehand to select a GPU).
    print("-Forward pass on the model")
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(torch.tensor([input_ids]),
                        token_type_ids=torch.tensor([segment_ids]))

    # Positional indexing works both when the model returns a plain tuple and
    # when it returns an output object whose iteration would yield field names.
    start_scores, end_scores = outputs[0], outputs[1]

    # Pick the most likely start and end positions.
    answer_start = int(torch.argmax(start_scores))
    answer_end = int(torch.argmax(end_scores))

    # == Rebuild the answer without '##' ==
    answer = tokens[answer_start]
    for token in tokens[answer_start + 1:answer_end + 1]:
        if token[0:2] == '##':
            # Word-piece continuation: glue it to the previous token.
            answer += token[2:]
        else:
            answer += ' ' + token

    return answer

    

***
### Question Processing
Extract the subject _(and more?)_ from the question

In [6]:
# NOTE(review): these two imports and the pipeline load repeat what earlier
# cells of this notebook already do.
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
# Alternative (left disabled): download the pipeline, then load it by name.
#!python -m spacy download en_core_web_sm
#nlp = spacy.load("en_core_web_sm")

def extract_subject_with_spacy(question):
    """Guess the subject of `question` using the module-level spaCy pipeline `nlp`.

    Strategy:
      1. If the question contains named entities with a relevant label, the
         last such entity is the subject, and every remaining noun chunk is
         kept as additional information.
      2. Otherwise the last noun chunk is the subject, and the chunks before
         it are kept as additional information.

    Noun chunks that consist only of an interrogative word ('what', 'who',
    ...) are never considered.

    Args:
        question: The question, as a string.

    Returns:
        A dict ``{'subject': str, 'infos': list}``. 'subject' is the empty
        string when no subject could be found; 'infos' holds the remaining
        noun chunks exactly as yielded by spaCy.
    """
    subject_dict = {'subject': '', 'infos': []}

    # Interrogative words that must not be mistaken for a subject.
    osef_list = ['who', 'why', 'what', 'when', 'which', 'how',
                 'Who', 'Why', 'What', 'When', 'Which', 'How']
    doc = nlp(question)

    # Noun chunks can be larger than a single noun (whole nominal groups).
    noun_chunks = list(doc.noun_chunks)
    # Dependency label of every chunk root (printed for debugging only).
    dep_list = [chunk.root.dep_ for chunk in noun_chunks]
    # Build a filtered copy rather than removing items from the list while
    # iterating over it, which skips the element following each removal.
    nouns_list = [chunk for chunk in noun_chunks if str(chunk) not in osef_list]

    # Only some entity types are useful as a subject.
    relevant_labels = ['PERSON', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT',
                       'EVENT', 'WORK_OF_ART', 'LAW']
    ents_list = []
    labels_list = []
    for ent in doc.ents:
        if ent.label_ in relevant_labels:
            ents_list.append(ent.text)
            labels_list.append(ent.label_)

    if ents_list:
        # A relevant named entity wins over plain noun chunks.
        print(ents_list)
        print(dep_list)
        print('subject found by ent : ', labels_list[-1], ents_list[-1], '\n')
        subject_dict['subject'] = ents_list[-1]
        subject_dict['infos'].extend(nouns_list)
        return subject_dict

    # No named entity: fall back on the noun chunks.
    print(nouns_list)
    print(dep_list)
    if not nouns_list:
        print("subject not found, please try another formulation", '\n')
        # Still return the dict (with an empty subject) so that callers can
        # index the result safely.
        return subject_dict

    # Simple heuristic: the last chunk is the subject. The other chunks are
    # kept so they can be used later when searching the subject's wiki page.
    print("subject found by noun: " + str(nouns_list[-1]), '\n')
    subject_dict['subject'] = str(nouns_list[-1])
    subject_dict['infos'].extend(nouns_list[:-1])
    return subject_dict
        
    
    
        
#test

#doc = nlp("Who wrote The Lords of The Rings")
#for ents in doc.ents : 
    #print(ents.text, ents.label_)


# Smoke tests: run the extractor on a variety of question shapes.
sample_questions = [
    'When was ancient greek used ?',
    'When is Barack Obama born ?',
    'WHAT IS THE LAST NAME OF THE AUTHOR WHO WROTE “ROMEO AND JULIET”?',
    "What is the size of the moon?",
    "Who wrote The Lord of The Rings?",
    'What is the ?',
    'What is the sky?',
    'What is the color of the sky ?',
    'What is the meaning of "bread" ?',  # no problem
    'What is the meaning of "omg" ?',  # lowercase acronyms are not recognized
    'What is the meaning of "OMG" ?',  # ... but uppercase ones are
    # (Very) rare special case still to handle: a question about an interrogative
    # word that is in osef_list. Idea: when there is more than one interrogative
    # word, only drop the first one from the noun list.
    'What is the meaning of "why" ?',
    'What is the third color of the french flag ?',  # clean result
    # Nice that the subject is found here. Big future task: finding the
    # Wikipedia article from this kind of paraphrase.
    "What is the color of Nirvana's second album ?",
    'When are hops added in the brewing process ?',
    'From what direction does the sun rise in the morning ?',
]
for sample_question in sample_questions:
    extract_subject_with_spacy(sample_question)

# Kept as the last expression of the cell so the notebook displays its return value.
extract_subject_with_spacy("What is the height of Nirvana's singer ?")

[ancient greek]
['nsubj']
subject found by noun: ancient greek 

['Barack Obama']
['nsubj']
subject found by ent :  PERSON Barack Obama 

['ROMEO AND JULIET']
['attr', 'nsubj', 'pobj', 'nsubj', 'dobj', 'conj']
subject found by ent :  WORK_OF_ART ROMEO AND JULIET 

[the size, the moon]
['attr', 'nsubj', 'pobj']
subject found by noun: the moon 

['The Lord of The Rings']
['nsubj', 'dobj', 'pobj']
subject found by ent :  WORK_OF_ART The Lord of The Rings 

[]
['attr']
subject not found, please try another formulation 

[the sky]
['attr', 'nsubj']
subject found by noun: the sky 

[the color, the sky]
['attr', 'nsubj', 'pobj']
subject found by noun: the sky 

[the meaning, bread]
['attr', 'nsubj', 'pobj']
subject found by noun: bread 

[the meaning]
['attr', 'nsubj']
subject found by noun: the meaning 

[the meaning, OMG]
['attr', 'nsubj', 'pobj']
subject found by noun: OMG 

[the meaning]
['attr', 'nsubj']
subject found by noun: the meaning 

[the third color, the french flag]
['attr', 'ns

{'subject': 'Nirvana', 'infos': [the height, Nirvana's singer]}

In [14]:
def is_wh(question):
    """Tell whether `question` is a "wh-" question.

    A question counts as "wh-" when its first or second token is an
    interrogative word, e.g. "When is Barack Obama born?" or
    "In which country is Paris ?". Matching ignores case, and a leading
    whitespace token is skipped.

    Args:
        question: The question, as a string.

    Returns:
        True or False. Empty or single-token inputs are handled without
        raising.
    """
    interrogatives = {'where', 'who', 'why', 'what', 'when', 'which', 'how'}
    words = list(nlp(question))
    # Drop a leading whitespace token so it does not count as a word.
    if words and words[0].pos_ == 'SPACE':
        words = words[1:]

    # Slicing (rather than indexing words[1]) is safe on short inputs.
    return any(word.text.lower() in interrogatives for word in words[:2])

def is_polar(question):
    """Tell whether `question` is a yes/no (polar) question.

    A question is considered polar when its first token (after an optional
    leading whitespace token) is tagged as an auxiliary by spaCy.

    Args:
        question: The question, as a string.

    Returns:
        True or False. Empty or whitespace-only input yields False instead of
        raising.
    """
    words = list(nlp(question))
    # Drop a leading whitespace token so it does not count as a word.
    if words and words[0].pos_ == 'SPACE':
        words = words[1:]

    if not words:
        return False
    return words[0].pos_ == 'AUX'
    

def is_pseudocleft(question):
    """Detect pseudo-clefts: sentences that look like questions but are not.

    Example: "Who is the President of Nicaragua doesn't interest me."

    Rule applied: the first token (after an optional leading whitespace
    token) is an interrogative word whose syntactic head carries the
    dependency label 'csubj' (clausal subject) or 'advcl' (adverbial clause),
    as in "What she says is true". Matching of the interrogative word ignores
    case.

    Args:
        question: The sentence to test, as a string.

    Returns:
        True or False. Empty or whitespace-only input yields False instead of
        raising.
    """
    interrogatives = {'where', 'who', 'why', 'what', 'when', 'which', 'how'}
    words = list(nlp(question))
    # Drop a leading whitespace token so it does not count as a word.
    if words and words[0].pos_ == 'SPACE':
        words = words[1:]

    if not words:
        return False

    first = words[0]
    return (first.text.lower() in interrogatives
            and first.head.dep_ in ("csubj", "advcl"))

def is_tutorial(question):
    """Detect tutorial ("how to ...") questions.

    A question is considered tutorial when its first token is "how" and its
    second token is one of 'to', 'can', 'could', 'shall', 'should'. Matching
    ignores case, and a leading whitespace token is skipped, as in the other
    question classifiers.

    Args:
        question: The question, as a string.

    Returns:
        True or False. Inputs with fewer than two tokens yield False instead
        of raising.
    """
    modals = {'to', 'can', 'could', 'shall', 'should'}
    words = list(nlp(question))
    # Drop a leading whitespace token so it does not count as a word.
    if words and words[0].pos_ == 'SPACE':
        words = words[1:]

    if len(words) < 2:
        return False
    return words[0].text.lower() == 'how' and words[1].text.lower() in modals
    

#def is_toapp (question) : #détecte les questions adressées à l'app


    
# Tests: one sample sentence per question type, run through every classifier.
questions = [
    'Where is Barack Obama born ?',
    'Is the moon bigger than Mars ?',
    'What she says is true. ',
    'How to unlock a closed door ?',
]

# (label, classifier) pairs, in the order in which they are reported.
checks = [
    ('Wh ? ', is_wh),
    ('Yes/No ? ', is_polar),
    ('Pseudocleft ? ', is_pseudocleft),
    ('Tutorial ?', is_tutorial),
]

for question in questions:
    # Build the same argument sequence a single long print() call would take:
    # the question, then "\n", label, verdict for each check, then a final "\n".
    fields = [question]
    for label, check in checks:
        fields += ['\n', label, check(question)]
    fields.append('\n')
    print(*fields)

Where is Barack Obama born ? 
 Wh ?  True 
 Yes/No ?  False 
 Pseudocleft ?  False 
 Tutorial ? False 

Is the moon bigger than Mars ? 
 Wh ?  False 
 Yes/No ?  True 
 Pseudocleft ?  False 
 Tutorial ? False 

What she says is true.  
 Wh ?  True 
 Yes/No ?  False 
 Pseudocleft ?  True 
 Tutorial ? False 

How to unlock a closed door ? 
 Wh ?  True 
 Yes/No ?  False 
 Pseudocleft ?  False 
 Tutorial ? True 



In [7]:
# == WIP ==
# TODO: split into paragraphs (the model is limited to 512 input tokens).
def get_wiki_and_split(subject):
    """Fetch the Wikipedia summary of `subject`, print its length and return it.

    Splitting is not implemented yet: the summary is returned as one string.
    """
    summary = wikipedia.summary(subject)
    print(len(summary))
    return summary


In [8]:
#wikipedia.search("salsa")

In [9]:
#text = wikipedia.summary('salsa')
#list = text.split('.')
#print(list)

***
### Visualization

In [10]:
from IPython.display import display
from ipywidgets import widgets

# Widget layout definitions
layout = widgets.Layout(
    width='400px',
    height='0px',
    margin='100px 0 0 100px',
)
bLayout = widgets.Layout(
    width='50px',
    height='28px',
    margin='100px 0 0 0px',
)
outLayoutPropre = widgets.Layout(
    width='480px',
    height='100px',
    margin='50px 0 100px 100px',
)
outLayoutTest = widgets.Layout(
    width='450px',
    height='auto',
    margin='50px 0 100px 100px',
)

# Widget object definitions: question input, "Ask" button and output area.
text = widgets.Text(layout=layout)
button = widgets.Button(description='Ask', layout=bLayout)
out = widgets.Output(layout=outLayoutTest)
 
def button_on_click(self):
    """Click handler for the 'Ask' button.

    Reads the question from the `text` widget, extracts its subject, fetches
    the matching Wikipedia summary and prints the model's answer inside the
    `out` widget.

    Args:
        self: The value passed by the widget framework to the callback
            (unused here).
    """
    with out:
        out.clear_output()
        result = extract_subject_with_spacy(question=text.value)
        # The extractor may return nothing, or a dict with an empty subject.
        subject = result['subject'] if result else None
        if not subject:
            # Nothing to look up; the extractor has already reported the failure.
            return

        try:
            context = wikipedia.summary(subject)
        except wikipedia.exceptions.WikipediaException as error:
            # E.g. no matching page, or an ambiguous title.
            print("Sorry, I could not get a Wikipedia summary for '" + subject + "': " + str(error))
            return

        # Only the first 2000 characters of the summary are used as context.
        answer = generate_answer(text.value, context[:2000])
        print("Here is what i found: \n" + answer)
        
# Register the click handler, then render the input row followed by the output area.
button.on_click(button_on_click)

input_row = widgets.HBox((text, button))
display(input_row)
display(out)

HBox(children=(Text(value='', layout=Layout(height='0px', margin='100px 0 0 100px', width='400px')), Button(de…

Output(layout=Layout(height='auto', margin='50px 0 100px 100px', width='450px'))