### Import

In [1]:
import torch
import transformers
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import nltk
#utile pour le pos tagging
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

### Load Model

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

In [3]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
#move model to device
model = model.to(device)

cuda


***
### Model function
Use loaded BERT model to return answer<br>
**Input** : Question, Context text <br>
**return** : Answer

In [4]:
def generate_answer(question, answer_text):
    print("I'm looking for an aswer, wait please ...")
    # == Tokenize ==
    # use a python dictonary so run on CPU
    # Apply the tokenizer to the input text, treating them as a text-pair.
    print("-Tokenization")
    input_ids = tokenizer.encode(question, answer_text)
    #print('The input has a total of {:} tokens.'.format(len(input_ids)))

    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # == Set Segment IDs ==
    # Search the input_ids for the first instance of the `[SEP]` token.
    sep_index = input_ids.index(tokenizer.sep_token_id)

    # The number of segment A tokens includes the [SEP] token istelf.
    num_seg_a = sep_index + 1

    # The remainder are segment B.
    num_seg_b = len(input_ids) - num_seg_a

    # Construct the list of 0s and 1s.
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # == Run Model ==
    # Run our example through the model.
    # by default on CPU, use model.to(device) to select GPU !?
    print("-Forward pass on the model")
    
    #move tensor to device
    input_ids_tensor = torch.tensor([input_ids]).to(device)
    segment_ids_tensor = torch.tensor([segment_ids]).to(device)
    
    start_scores, end_scores = model(input_ids_tensor, # The tokens representing our input text.
                                 token_type_ids=segment_ids_tensor) # The segment IDs to differentiate question from 
    # ici retourne un score pour chaque token
    
    # donc on applique un argmax pour trouver le plus probable
    # Find the tokens with the highest `start` and `end` scores.
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)
    
    # Récupére le socre
    start_score = float(start_scores[0,answer_start])
    end_score = float(end_scores[0,answer_end])
    
    
    # == Print Answer without ## ==
    # Start with the first token.
    answer = tokens[answer_start]

    # Select the remaining answer tokens and join them with whitespace.
    for i in range(answer_start + 1, answer_end + 1):
    
        # If it's a subword token, then recombine it with the previous token.
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
    
        # Otherwise, add a space then the token.
        else:
            answer += ' ' + tokens[i]

    return answer, start_score+end_score

    

In [6]:
generate_answer("What is IPO?", "Google was founded in 1998 by Larry Page and Sergey Brin while they were Ph.D. students at Stanford University in California. Together they own about 14 percent of its shares and control 56 percent of the stockholder voting power through supervoting stock. They incorporated Google as a privately held company on September 4, 1998. An initial public offering (IPO) took place on August 19, 2004, and Google moved to its headquarters in Mountain View, California, nicknamed the Googleplex. In August 2015, Google announced plans to reorganize its various interests as a conglomerate called Alphabet Inc. Google is Alphabet's leading subsidiary and will continue to be the umbrella company for Alphabet's Internet interests. Sundar Pichai was appointed CEO of Google, replacing Larry Page who became the CEO of Alphabet.")

I'm looking for an aswer, wait please ...
-Tokenization
-Forward pass on the model


('initial public offering', 14.472150325775146)

In [6]:
input_ids = tokenizer.encode("i don't know", "U.K isn't a thing")
print(input_ids)
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(tokens)

[101, 1045, 2123, 1005, 1056, 2113, 102, 1057, 1012, 1047, 3475, 1005, 1056, 1037, 2518, 102]
['[CLS]', 'i', 'don', "'", 't', 'know', '[SEP]', 'u', '.', 'k', 'isn', "'", 't', 'a', 'thing', '[SEP]']


***
### Question Processing
Extract subjet _(and more?)_ from the question

In [6]:
def extract_subj(question):
    subject = None
    token = nltk.word_tokenize(question)
    #print(token)
    pos_token = nltk.pos_tag(token)
    for item in pos_token:
        if item[1] == 'NN':
            subject = item[0]
    if subject is not None:
        print("Subject found: " + subject)
    else:
        print("Subject not found 😔\n Rephrase the question or try another one")
    return  subject



In [7]:
import spacy
import en_core_web_sm

import truecase

nlp = en_core_web_sm.load()
#!python -m spacy download en_core_web_sm
#nlp = spacy.load("en_core_web_sm")

def extract_subject(question):
    #noun to not take in count
    subj_list = []
    osef_list = ['who','why','what','when','which','how']
    doc = nlp(question)
    
    ents = doc.ents
    nouns = doc.noun_chunks
    # == Check for Named Entity ==
    if len(ents) != 0:
        for ent in doc.ents:
            subj_list.append(ent.text)
            print("Named entity: " + ent.text) #, ent.start_char, ent.end_char, ent.label_
    # == Check for Noun Chunk ==
    elif len(list(nouns)) != 0:
        for item in nouns:
            if str(item) not in osef_list:
                subj_list.append(str(item))
                print("Noun chunck: " + str(item))
                
    #print("Subject found: " + str(item))
    #return str(item)
    #print("Subject not found 😔\n Rephrase the question or try another one")
    if len(subj_list) != 0:
        print("Subject selected: " + subj_list[-1])
        return subj_list[-1]
    else:
        print("Subject not found 😔\n Rephrase the question or try another one")
        return None
#test
#extract_subject_with_spacy('which is the most common use of opt-in e-mail marketing ?')

ModuleNotFoundError: No module named 'truecase'

In [None]:
question = "how the dog dies?"

#question = truecase.get_true_case(question)
print(question)

extract_subject(question)

In [12]:
import spacy
import nltk

nlp = spacy.load("en_core_web_sm")
doc = nlp("In")
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
    

### Wikipedia API
Try to found the most relevant context text to give as BERT input. <br>
Get a wikipedia article, and scrap it

In [7]:
import wikipedia #use to search article name
import wikipediaapi #use to acess article page and content
import en_core_web_sm
import spacy

import time

In [13]:
from spacy.tokenizer import Tokenizer
from nltk.tokenize import word_tokenize

text = "Firt senteces. I'dont if U.K is in Europe. But osef .Lol"
doc = nlp(text)
sentences = []
for sent in doc.sents:
    sentences.append(sent.text)
for s in sentences:
    print(s.split(" "))
#word_tokenize(text)

['Firt', 'senteces.']
["I'dont", 'if', 'U.K', 'is', 'in', 'Europe.']
['But', 'osef', '.Lol']


In [45]:
wiki_retrivial = wikipedia.search('lil wayne')
print(wiki_retrivial[0])

wiki = wikipediaapi.Wikipedia(
        language='en',
        #extract_format=wikipediaapi.ExtractFormat.WIKI
        )

page = wiki.page(wiki_retrivial[0])
print(page.exists())
page.title
#page.text
#page.summary
page.fullurl

Lil Wayne
True


'https://en.wikipedia.org/wiki/Lil_Wayne'

In [18]:
def get_sections(sections, level=0):
        for s in sections:
                print("%s: %s - %s" % ("*" * (level + 1), s.title, len(s.text)))
                get_sections(s.sections, level + 1)

                

def cut(text):
    pass
    

page = wiki.page('python')  
get_sections(page.sections)

*: Computing - 150
*: People - 286
*: Roller coasters - 177
*: Vehicles - 105
*: Weaponry - 167
*: Other uses - 266
*: See also - 84


['Barack Obama',
 'Barack Obama Sr.',
 'Family of Barack Obama',
 'Presidency of Barack Obama',
 'Barack Obama citizenship conspiracy theories',
 'Electoral history of Barack Obama',
 'Speeches of Barack Obama',
 'Barack Obama 2008 presidential campaign',
 'Early life and career of Barack Obama',
 'List of federal judges appointed by Barack Obama']

In [10]:
# == Loop == 

timer = time.perf_counter() 

page = wiki.page('donald trump')
text = page.summary


cuted = text.split("\n")

answers = []
scores = []
timer = time.perf_counter() 
for p in cuted:
    answer, score = generate_answer("when is Donal trump born ?", p)
    answers.append(answer)
    scores.append(score)
print(time.perf_counter() - timer)
    
max_value = max(scores)

index = scores.index(max_value)
print(max_value)
print(answers[index])

NameError: name 'wiki' is not defined

In [None]:
timer = time.perf_counter() 
cuted = text.split("\n")
print(time.perf_counter() - timer)

In [16]:
max_value = max(scores)

index = scores.index(max_value)
print(index)

print(answers)

print(answers[index])

0
['june 14 , 1946', '1971', '2016', '2019', '2016', '2020']
june 14 , 1946


In [106]:
print(len(text))
timer = time.perf_counter() 
a1,s1 = generate_answer("when is Donal trump born ?", text[:2000])
a2,s2 = generate_answer("when is Donal trump born ?", text[2000:])
print(time.perf_counter() - timer)

print(a1," || ", s1)
print(a2," || ", s2)

3661
I'm looking for an aswer, wait please ...
-Tokenization
-Forward pass on the model
I'm looking for an aswer, wait please ...
-Tokenization
-Forward pass on the model
14.267158899999686
june 14 , 1946  ||  8.51047420501709
[SEP]  ||  2.7942256927490234


['Firt', 'senteces.']
["I'dont", 'if', 'U.K', 'is', 'in', 'Europe.']
['But', 'osef', '.Lol']


'https://en.wikipedia.org/wiki/Lil_Wayne'

c   e ci est un te.st!
c   e ci est un te st!
c   e ci est un te st!
c   e ci est un te st!
c   e ci est un te st!
c   e ci est un te st 
c   e ci est un te st 
['c', '', '', 'e', 'ci', 'est', 'un', 'te', 'st', '']
*: Computing - 150
*: People - 286
*: Roller coasters - 177
*: Vehicles - 105
*: Weaponry - 167
*: Other uses - 266
*: See also - 84


In [16]:
subj = "Star Wars"
search = wikipedia.search(subj)
print(search)
#wikipedia.suggest("Lil Pump")
#print(wikipedia.suggest("OMG"))
page = wikipedia.page(search[0])
page.title
#'\n'
#page.content
#page.url
#page.title
#page.images[0]

#try:
#    page = wikipedia.page("OMG (disambiguation)")
#except wikipedia.exceptions.DisambiguationError as e:
#    print(e.options)



['Star Wars', 'Star Wars (film)', 'List of Star Wars films', 'Star Wars: The Rise of Skywalker', 'Star Wars Rebels', 'List of Star Wars characters', 'Star Wars: The Last Jedi', 'Star Wars: The Force Awakens', 'Star Wars: Squadrons', 'The Child (Star Wars)']


PageError: Page id "start wars" does not match any pages. Try another id!

In [17]:
text = wikipedia.summary('star wars')
#list = text.split('.')
print(text)

PageError: Page id "start wars" does not match any pages. Try another id!

***
### Visualization

In [12]:
from IPython.display import display
from ipywidgets import widgets

#Widgets layout difinition
layout = widgets.Layout(width='400px', height='0px', margin='100px 0 0 100px')
bLayout = widgets.Layout(width='50px', height='28px', margin='100px 0 0 0px')
outLayoutPropre = widgets.Layout(width='480px', height='100px', margin='50px 0 100px 100px')
outLayoutTest = widgets.Layout(width='450px', height='auto', margin='50px 0 100px 100px')
#titleLayout = widgets.Layout(width='450px', height='auto', margin='0px 0 0px 100px')
 
#Widgets object definition
text = widgets.Text(layout=layout)
button = widgets.Button(description = 'Ask', layout = bLayout)
out = widgets.Output(layout=outLayoutTest)#layout=outLayout
#out = widgets.HTML(layout = outLayout, value= '<style>.text {width: 480px; heigh: 100px;}</style> <p class="text">'+ out_value +' </p>')
 
def button_on_click(self):
    with out:
        out.clear_output()
        
        subject = extract_subject(question=text.value)
        
        if subject is not None:
            
            context = wikipedia.summary(subject)
            
            answer, score = generate_answer(text.value, context[:2000])
            #out.clear_output()
            print()
            print("Here what i found: \n"+ answer)
            print()
            print("Score: " +str(score))
        else:
            pass
        
button.on_click(button_on_click)

display(widgets.HBox((text, button,)))
display(out)

HBox(children=(Text(value='', layout=Layout(height='0px', margin='100px 0 0 100px', width='400px')), Button(de…

Output(layout=Layout(height='auto', margin='50px 0 100px 100px', width='450px'))