### Import

In [1]:
import torch
import transformers
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import nltk

import pickle
#needed for POS tagging (uncomment the downloads below on first run)
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

### Load Model

In [10]:
# Time how long it takes to load the SQuAD-finetuned BERT tokenizer and
# question-answering model from the same pretrained checkpoint name.
# NOTE(review): start()/stop() are defined in a later cell of this notebook
# (the timer helpers); on a fresh kernel that cell has to be run first.
start()
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
stop()

Time:  8.08325330000001


In [9]:
# Cache the tokenizer and the model on disk so they can be reloaded with
# pickle.load in the next cell.
# The context managers make sure each file is flushed and closed once written
# (the bare open(...) calls left both handles open).
with open('tokenizer.ask', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
with open('model.ask', 'wb') as model_file:
    pickle.dump(model, model_file)

In [11]:
# Reload the cached tokenizer and model from disk and time the operation.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load 'tokenizer.ask' / 'model.ask' files written by this notebook.
# NOTE(review): start()/stop() come from the timer-helper cell defined later.
start()
with open('tokenizer.ask', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
with open('model.ask', 'rb') as model_file:
    model = pickle.load(model_file)
stop()

Time:  2.0859068000000036


In [12]:
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)
# Transfer the model onto the selected device.
model = model.to(device)

cuda


***
### Model function
Use the loaded BERT model to return an answer<br>
**Input** : Question, Context text <br>
**return** : Answer

In [13]:
def generate_answer(question, answer_text):
    """Extract the answer to `question` from `answer_text` with the loaded BERT model.

    Relies on the notebook-level `tokenizer`, `model` and `device` variables.

    Args:
        question: The question string (encoded as segment A).
        answer_text: The context string searched for the answer (segment B).

    Returns:
        A tuple `(answer, score)`. `answer` is the text rebuilt from the tokens
        between the best start position and the best end position, with `##`
        sub-word tokens glued to the previous token. `score` is the sum of the
        highest start score and the highest end score. If the best end position
        comes before the best start position, only the start token is returned.
    """
    # == Tokenize ==
    # Encode question and context together as one text pair.
    input_ids = tokenizer.encode(question, answer_text)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # == Set Segment IDs ==
    # Segment A runs up to and including the first [SEP] token; everything
    # after it belongs to segment B.
    num_seg_a = input_ids.index(tokenizer.sep_token_id) + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # == Run Model ==
    # Tensors must live on the same device as the model.
    input_ids_tensor = torch.tensor([input_ids]).to(device)
    segment_ids_tensor = torch.tensor([segment_ids]).to(device)

    # Inference only: no_grad avoids building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids_tensor,  # The tokens representing our input text.
                        token_type_ids=segment_ids_tensor)  # Question vs. context segments.

    # Index instead of tuple-unpacking: this works for a plain tuple and for
    # dict-like output objects, whose iteration would yield keys, not tensors.
    start_scores, end_scores = outputs[0], outputs[1]

    # One score per token: keep the most likely start and end positions,
    # converted to plain ints so they can index Python lists.
    answer_start = int(torch.argmax(start_scores))
    answer_end = int(torch.argmax(end_scores))

    # Retrieve the scores of the selected positions.
    start_score = float(start_scores[0, answer_start])
    end_score = float(end_scores[0, answer_end])

    # == Build the answer without '##' markers ==
    answer = tokens[answer_start]
    for token in tokens[answer_start + 1:answer_end + 1]:
        if token[0:2] == '##':
            # Sub-word token: recombine it with the previous token.
            answer += token[2:]
        else:
            # Otherwise, add a space then the token.
            answer += ' ' + token

    return answer, start_score + end_score

    

In [42]:
# Smoke test: ask a question against a short hand-written context paragraph.
generate_answer("Who is Barack Obama", "Barack Obama is an American politician and attorney who served as the 44th president of the United States from 2009 to 2017. A member of the Democratic Party, Obama was the first African-American president of the United States. He previously served as a U.S. senator from Illinois from 2005 to 2008 and an Illinois state senator from 1997 to 2004.")

I'm looking for an aswer, wait please ...
-Tokenization
-Forward pass on the model


('barack obama', 6.05471658706665)

In [10]:
# Inspect how the tokenizer encodes a text pair: print the ids first,
# then the corresponding tokens (special tokens and '##' sub-words included).
input_ids = tokenizer.encode("i don't know", "U.K isn't a a eee ee thing")
print(input_ids)
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(tokens)

[101, 1045, 2123, 1005, 1056, 2113, 102, 1057, 1012, 1047, 3475, 1005, 1056, 1037, 1037, 25212, 2063, 25212, 2518, 102]
['[CLS]', 'i', 'don', "'", 't', 'know', '[SEP]', 'u', '.', 'k', 'isn', "'", 't', 'a', 'a', 'ee', '##e', 'ee', 'thing', '[SEP]']


***
### Question Processing
Extract the subject _(and more?)_ from the question

In [8]:
def extract_subj(question):
    """Return the last word of `question` that NLTK tags as 'NN', or None.

    The question is tokenized and POS-tagged with NLTK; only the exact tag
    'NN' is considered. A message reporting the outcome is printed.

    Args:
        question: The question string.

    Returns:
        The last 'NN'-tagged word, or None when there is none.
    """
    tagged_words = nltk.pos_tag(nltk.word_tokenize(question))
    nn_words = [entry[0] for entry in tagged_words if entry[1] == 'NN']

    # Guard clause: nothing tagged 'NN' means no subject.
    if not nn_words:
        print("Subject not found 😔\n Rephrase the question or try another one")
        return None

    # When several candidates exist, the last one wins.
    subject = nn_words[-1]
    print("Subject found: " + subject)
    return subject



In [4]:
import spacy
import en_core_web_sm

import truecase

# Load the spaCy pipeline from the en_core_web_sm package imported above.
nlp = en_core_web_sm.load()
# Alternative setup kept for reference (download command, then load by name):
#!python -m spacy download en_core_web_sm
#nlp = spacy.load("en_core_web_sm")

def extract_subject(question):
    """Pick the subject of `question` using the notebook-level spaCy `nlp` pipeline.

    Named entities take priority. Noun chunks are only considered when the
    question contains no named entity, and chunks that are just a question
    word (who, why, what, ...) are skipped. When several candidates are found,
    the last one is returned. Progress messages are printed along the way.

    Args:
        question: The question string.

    Returns:
        The selected subject text, or None when no candidate was found.
    """
    subj_list = []
    # Question words that must never be taken as the subject.
    osef_list = ['who','why','what','when','which','how']
    doc = nlp(question)

    ents = doc.ents
    # Materialize the noun chunks once: testing the length of the raw iterator
    # with len(list(...)) consumed it, so the loop below never saw any chunk.
    nouns = list(doc.noun_chunks)

    # == Check for Named Entity ==
    if len(ents) != 0:
        for ent in ents:
            subj_list.append(ent.text)
            print("Named entity: " + ent.text)
    # == Check for Noun Chunk ==
    else:
        for item in nouns:
            # Compare case-insensitively so "Who"/"What" are filtered as well.
            if str(item).lower() not in osef_list:
                subj_list.append(str(item))
                print("Noun chunck: " + str(item))

    if len(subj_list) != 0:
        print("Subject selected: " + subj_list[-1])
        return subj_list[-1]
    else:
        print("Subject not found 😔\n Rephrase the question or try another one")
        return None
#test
#extract_subject_with_spacy('which is the most common use of opt-in e-mail marketing ?')

ModuleNotFoundError: No module named 'truecase'

In [10]:
# Manual check of extract_subject on a lower-case question.
question = "how the dog dies?"

# Optional true-casing step, left disabled (the `truecase` import failed above).
#question = truecase.get_true_case(question)
print(question)

extract_subject(question)

how the dog dies?


NameError: name 'extract_subject' is not defined

In [11]:
import spacy
import nltk

# Sanity check: list the named entities spaCy reports for a one-word input.
nlp = spacy.load("en_core_web_sm")
doc = nlp("In")
for ent in doc.ents:
    # Entity text, its character offsets in the input, and its label.
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
    

### Wikipedia API
Try to find the most relevant context text to give as BERT input. <br>
Get a Wikipedia article and scrape its content

In [4]:
import wikipedia #use to search article name
import wikipediaapi #use to acess article page and content
import en_core_web_sm
import spacy

import time

import en_core_web_sm
nlp = en_core_web_sm.load()

In [5]:
from spacy.tokenizer import Tokenizer
from nltk.tokenize import word_tokenize

# Experiment: split a deliberately messy text into sentences with spaCy,
# then split every sentence on single spaces to inspect the pieces.
text = "Firt senteces. I'dont if U.K is in Europe. But osef .Lol"
doc = nlp(text)
sentences = []
for sent in doc.sents:
    sentences.append(sent.text)
for s in sentences:
    print(s.split(" "))
# NLTK word tokenization of the same text, left disabled:
#word_tokenize(text)

['Firt', 'senteces.']
["I'dont", 'if', 'U.K', 'is', 'in', 'Europe.']
['But', 'osef', '.Lol']


In [6]:
# Search article titles for the query and show the first hit.
wiki_retrivial = wikipedia.search('computing programming')
print(wiki_retrivial[0])

# wikipediaapi client used to fetch page objects, configured for language 'en'.
# NOTE(review): newer wikipedia-api releases may require a user_agent argument
# here; confirm against the installed version.
wiki = wikipediaapi.Wikipedia(
        language='en',
        #extract_format=wikipediaapi.ExtractFormat.WIKI
        #extract_format=wikipediaapi.ExtractFormat.HTML
        )

# Fetch the page for the first search hit and check that it exists.
page = wiki.page(wiki_retrivial[0])
print(page.exists())
# Only the last expression of the cell (the URL) is displayed; the bare
# page.title line produces no output.
page.title
#page.text
#page.summary
page.fullurl

Concurrent computing
True


'https://en.wikipedia.org/wiki/Concurrent_computing'

In [7]:
# Reference point shared by start() and stop(), from time.perf_counter().
timer = 0


def start():
    """Record the current time.perf_counter() value as the reference point."""
    global timer
    timer = time.perf_counter()


def stop(message='Time: '):
    """Print `message` followed by the time elapsed since the last start().

    Args:
        message: Label printed in front of the elapsed value.
    """
    elapsed = time.perf_counter() - timer
    print(message, elapsed)
    

In [8]:
def get_sections_list(page):
    """Collect the page summary followed by the text of its sections.

    Sections are visited depth-first in document order. A section's text is
    kept when it is non-empty and its title is not one of the ignored titles;
    its sub-sections are visited either way.

    Args:
        page: Object exposing `summary` and `sections`, where each section
            exposes `title`, `text` and `sections`.

    Returns:
        A list of strings: the summary first, then the retained section texts.
    """
    ignored_titles = ['Sources', 'Further reading', 'External links']

    def iter_texts(sections):
        # Yield the usable text of each section, then descend into its children.
        for section in sections:
            if len(section.text) > 0 and section.title not in ignored_titles:
                yield section.text
            yield from iter_texts(section.sections)

    collected = [page.summary]
    collected.extend(iter_texts(page.sections))
    return collected

def get_paragraph(page, max_words=400):
    """Split the page's section texts into paragraphs short enough for the model.

    Every text returned by `get_sections_list(page)` is split on newlines.
    Blank lines are dropped, and a paragraph is kept only when it has fewer
    than `max_words` tokens according to `word_tokenize`.

    Args:
        page: Page object accepted by `get_sections_list`.
        max_words: Exclusive upper bound on the paragraph's word-token count.
            NOTE(review): the 400 default presumably leaves room under the
            model's 512-token input limit; word tokens are not the model's
            sub-word tokens, so the bound is only approximate - confirm.

    Returns:
        A list of paragraph strings, in document order.
    """
    paragraph = []
    for section in get_sections_list(page):
        for item in section.split("\n"):
            # Consecutive newlines produce empty strings; they carry no
            # context and would only trigger useless model runs.
            if not item.strip():
                continue
            if len(word_tokenize(item)) < max_words:
                paragraph.append(item)
    return paragraph


In [10]:
# Peek at the first extracted paragraph.
# NOTE(review): `paragraph` is only assigned by the "Loop" cell further down;
# that cell has to be run before this one.
print(paragraph[0])

NameError: name 'paragraph' is not defined

In [12]:
# List the article titles returned for a short query.
wikipedia.search('hippo')

['Hippopotamus',
 'Augustine of Hippo',
 'Pygmy hippopotamus',
 'House Hippo',
 'Hippo Regius',
 'Hippo Campus',
 'Hippo CMS',
 'Hungry Hungry Hippos',
 'Hippo (disambiguation)',
 'Hippos']

In [15]:
# == Loop == 




# Step 1: fetch the page of the first search hit (timed).
start()
page = wiki.page(wikipedia.search('hippo')[0])
print(page.exists())
stop()

# Step 2: cut the page into candidate context paragraphs (timed).
start()
paragraph = get_paragraph(page)
stop()

# Step 3: run the model on every paragraph and keep each answer with its
# score. Timing uses the same start()/stop() helpers as the steps above.
answers = []
scores = []
start()
for p in paragraph:
    answer, score = generate_answer("how a hippo scratches", p)
    answers.append(answer)
    scores.append(score)
stop("Model run Time: ")

# Step 4: report the best-scoring answer together with its source paragraph.
if scores:
    max_value = max(scores)
    index = scores.index(max_value)
    print(max_value)
    print(answers[index])
    print(paragraph[index])
    print(page.fullurl)
else:
    # max() would raise on an empty list when no paragraph was extracted.
    print("No paragraph found for", page.fullurl)

True
Time:  0.23894789999999944
Time:  0.3071918999999923
Model run Time:  2.9706179000000077
9.52122974395752
hippos mark their territory by defecation
Hippos mark their territory by defecation.  While depositing the faeces, hippos spin their tails to distribute their excrement over a greater area. "Yawning" serves as a threat display. When fighting, males use their incisors to block each other's attacks and their large canines to inflict injuries.  When hippos become over-populated or a habitat is reduced, males sometimes attempt infanticide, but this behaviour is not common under normal conditions. Incidents of hippo cannibalism have been documented, but this is believed to be the behaviour of distressed or sick hippos.Hippos appear to communicate vocally, through grunts and bellows, and they may practice echolocation, but the purpose of these vocalisations is currently unknown. Hippos have the unique ability to hold their heads partially above the water and send out a cry that trav

In [64]:
# Ten highest (score, answer) pairs, best score first.
sorted(zip(scores, answers), reverse=True)[:10]

[(18.04034423828125, 'gdscript'),
 (15.488117694854736, 'object - oriented'),
 (15.314013481140137, 'java'),
 (14.860849380493164, 'julia'),
 (14.193211555480957, 'coffeescript'),
 (13.607266902923584, 'go'),
 (13.453736782073975, 'java'),
 (13.433351516723633, 'common lisp , scheme , or ruby'),
 (13.278484344482422, 'c or pascal'),
 (13.127068996429443, 'ecmascript / javascript')]

In [19]:
# Check which article title comes first for the query 'python'.
wiki_retrivial = wikipedia.search('python')
print(wiki_retrivial[0])

Python (programming language)


In [None]:
# Same lookup using only the `wikipedia` package: search for the subject,
# print the hits, then open the page of the first one.
subj = "Star Wars"
search = wikipedia.search(subj)
print(search)
#wikipedia.suggest("Lil Pump")
#print(wikipedia.suggest("OMG"))
page = wikipedia.page(search[0])
page.title
#'\n'
#page.content
#page.url
#page.title
#page.images[0]

#try:
#    page = wikipedia.page("OMG (disambiguation)")
#except wikipedia.exceptions.DisambiguationError as e:
#    print(e.options)



In [None]:
# Print what wikipedia.summary returns for the query 'star wars'.
text = wikipedia.summary('star wars')
#list = text.split('.')
print(text)

***
### Visualization

In [12]:
from IPython.display import display
from ipywidgets import widgets

#Widgets layout difinition
# Widget layout definitions (sizes and margins as CSS-style strings).
layout = widgets.Layout(width='400px', height='0px', margin='100px 0 0 100px')
bLayout = widgets.Layout(width='50px', height='28px', margin='100px 0 0 0px')
outLayoutPropre = widgets.Layout(width='480px', height='100px', margin='50px 0 100px 100px')
outLayoutTest = widgets.Layout(width='450px', height='auto', margin='50px 0 100px 100px')
#titleLayout = widgets.Layout(width='450px', height='auto', margin='0px 0 0px 100px')

# Widget objects: question input box, 'Ask' button and the output area
# (the output area uses outLayoutTest; outLayoutPropre is defined but unused here).
text = widgets.Text(layout=layout)
button = widgets.Button(description = 'Ask', layout = bLayout)
out = widgets.Output(layout=outLayoutTest)#layout=outLayout
#out = widgets.HTML(layout = outLayout, value= '<style>.text {width: 480px; heigh: 100px;}</style> <p class="text">'+ out_value +' </p>')
 
def button_on_click(self):
    """Click handler of the 'Ask' button: answer the question typed in `text`.

    The subject of the question is extracted with `extract_subject`, the
    Wikipedia summary for that subject is fetched, and the first 2000
    characters of it are given to `generate_answer` as context. Everything is
    printed inside the `out` widget, which is cleared first.

    Args:
        self: Argument passed by `button.on_click` when the handler is called
            (unused; presumably the clicked button).
    """
    with out:
        out.clear_output()

        subject = extract_subject(question=text.value)

        # extract_subject already printed why no subject was found.
        if subject is None:
            return

        try:
            context = wikipedia.summary(subject)
        except wikipedia.exceptions.DisambiguationError as error:
            # Several articles match: show a few candidates instead of a traceback.
            print("'" + subject + "' is ambiguous, try one of: "
                  + ", ".join(error.options[:5]))
            return
        except wikipedia.exceptions.PageError:
            print("No Wikipedia article found for '" + subject + "'")
            return

        answer, score = generate_answer(text.value, context[:2000])
        print()
        print("Here what i found: \n"+ answer)
        print()
        print("Score: " +str(score))
        
# Register button_on_click as the button's click handler.
button.on_click(button_on_click)

# Display the input box and the button in one HBox, then the output area.
display(widgets.HBox((text, button,)))
display(out)

HBox(children=(Text(value='', layout=Layout(height='0px', margin='100px 0 0 100px', width='400px')), Button(de…

Output(layout=Layout(height='auto', margin='50px 0 100px 100px', width='450px'))