In [56]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from streamlit_utils import *
# The model and its tokenizer must come from the same pretrained checkpoint,
# so the checkpoint name is written once and shared by both loads.
SQUAD_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# Tokenizer
tokenizer = BertTokenizer.from_pretrained(SQUAD_CHECKPOINT)

# Model
model = BertForQuestionAnswering.from_pretrained(SQUAD_CHECKPOINT)


In [57]:
# Build the DataFrame of articles for the "Wildfire" subject.
# NOTE(review): get_locations_df_from_subject is not defined in this notebook;
# presumably it is provided by the `from streamlit_utils import *` star import.
# The meaning of the second argument (2) is not visible here - confirm against
# that module.
df_location = get_locations_df_from_subject("Wildfire",2)
# Bare last expression so the notebook renders the DataFrame as the cell output.
df_location

Successfully scraped :  20  links
Done scraping
There are 12 usable articles


Unnamed: 0,Title,Link,Date,Clean_content,Sentences,Location
0,Spreading like Wildfire: The Rising Threat of ...,https://reliefweb.int/report/world/spreading-w...,2022-02-23,WorldNumber of wildfires to rise by 50% by 21...,[WorldNumber of wildfires to rise by 50% by 21...,"[Nairobi, Norway, Stockholm, Sweden, Media]"
1,Wildfires likely to increase by a third by 205...,https://www.theguardian.com/environment/2022/f...,2022-02-23,Even previously unaffected countries likely to...,[Even previously unaffected countries likely t...,"[California, Australia, Canada, the United Sta..."
2,Climate Change Could Increase Risk of Wildfire...,https://www.nytimes.com/2022/02/23/climate/cli...,2022-02-23,Advertisement Supported by Worsening heat and ...,[Advertisement Supported by Worsening heat and...,"[Australia, United States, Canada, Russia, Ind..."
3,British Columbia set to move to year-round BC ...,https://globalnews.ca/news/8635723/bc-wildfire...,2022-02-23,As wildfire seasons in British Columbia contin...,[As wildfire seasons in British Columbia conti...,"[British Columbia, B.C.]"
4,Sheriff's Office performing wildfire mitigatio...,https://www.gjsentinel.com/news/western_colora...,2022-02-23,The Mesa County Sheriff Office and Mesa Count...,[The Mesa County Sheriff Office and Mesa Count...,[]
5,"Book reflects on Vernon, B.C.’s 2021 wildfire ...",https://globalnews.ca/news/8636262/vernon-bc-w...,2022-02-23,"Last summer the Vernon, B.C., area was threate...","[Last summer the Vernon, B.C., area was threat...","[Vernon, B.C., British Columbia, Okanagan, Nor..."
6,Western Kentucky prepares for wildfire season ...,https://www.wkyufm.org/2022-02-22/western-kent...,2022-02-23,Spring is approaching for Kentucky with fears ...,[Spring is approaching for Kentucky with fears...,"[Kentucky, U.S., North Carolina, Florida]"
7,‘Catastrophe’ in Argentina as wildfires burn f...,https://www.aljazeera.com/news/2022/2/21/catas...,2022-02-23,Many towns are post­ing their high­est tem­per...,[Many towns are post­ing their high­est tem­pe...,[]
8,Texas Panhandle wildfire destroys homes in Ama...,https://www.amarillo.com/story/news/local/2022...,2022-02-23,A wildfire in northwest Amarillo has claimed s...,[A wildfire in northwest Amarillo has claimed ...,[]
9,Colorado lawmakers debating multiple wildfire ...,https://www.koaa.com/news/covering-colorado/co...,2022-02-23,DENVER There an awful lot of work up here; the...,[DENVER There an awful lot of work up here; th...,"[DENVER, Colorado]"


In [58]:
# Apply the tokenizer to the input text, treating them as a text-pair.
def tokenize_input_text(question, input_text, max_length=512):
    """Encode a question and its context passage as a single BERT text pair.

    Args:
        question: Question string; encoded as the first segment.
        input_text: Context string in which the answer is searched; encoded
            as the second segment.
        max_length: Maximum number of token ids to return, special tokens
            included. Defaults to 512, the number of positions the BERT
            checkpoint used in this notebook can embed.

    Returns:
        A list of token ids for the pair, special tokens included.
    """
    # Without truncation a long context yields more ids than the model has
    # position embeddings for, and the forward pass fails. Only the context
    # (second segment) is shortened so the question is always kept intact.
    input_ids = tokenizer.encode(
        question,
        input_text,
        truncation="only_second",
        max_length=max_length,
    )

    return input_ids

In [59]:
def get_segment_ids(input_ids):
    """Build the segment (token type) id list matching `input_ids`.

    Every position up to and including the first `[SEP]` token is labelled 0
    (segment A); every position after it is labelled 1 (segment B).

    Args:
        input_ids: List of token ids that contains at least one `[SEP]` id.

    Returns:
        A list of 0s and 1s with the same length as `input_ids`.

    Raises:
        ValueError: If `input_ids` contains no `[SEP]` token id.
    """
    # Segment A ends at the first [SEP]; that [SEP] itself belongs to segment A.
    first_sep = input_ids.index(tokenizer.sep_token_id)
    seg_b_start = first_sep + 1

    segment_ids = [0 if pos < seg_b_start else 1 for pos in range(len(input_ids))]

    # There should be a segment id for every input token.
    assert len(segment_ids) == len(input_ids)

    return segment_ids

In [60]:
def get_scores(input_ids,segment_ids):
    """Run the question-answering model on one encoded text pair.

    Args:
        input_ids: List of token ids representing the input text.
        segment_ids: List of segment ids (same length as `input_ids`) that
            differentiates the question from the answer text.

    Returns:
        A `(start_scores, end_scores)` tuple as returned by the model when
        called with `return_dict=False`.
    """
    # Inference only: disabling autograd avoids building a computation graph
    # and keeping activations alive for a backward pass that never happens.
    with torch.no_grad():
        start_scores, end_scores = model(
            torch.tensor([input_ids]),  # Wrapped in a list: a batch of one sequence.
            token_type_ids=torch.tensor([segment_ids]),  # Segment ids, batched the same way.
            return_dict=False,  # Ask for a plain tuple instead of an output object.
        )
    return start_scores, end_scores

In [61]:
# Find the tokens with the highest `start` and `end` scores.
def get_answer(start_scores,end_scores, input_ids):
    """Return the raw answer span as space-separated tokens.

    The span runs from the position with the highest start score to the
    position with the highest end score, inclusive. Tokens are joined as-is,
    so sub-word pieces keep their `##` prefix.

    Args:
        start_scores: Tensor of start scores produced by the model.
        end_scores: Tensor of end scores produced by the model.
        input_ids: Token ids that were fed to the model.

    Returns:
        The selected tokens joined with single spaces (empty string when the
        best end position precedes the best start position).
    """
    first = int(torch.argmax(start_scores))
    last = int(torch.argmax(end_scores))

    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    span = all_tokens[first:last + 1]

    return ' '.join(span)

In [62]:
# Better version: sub-word pieces are merged back into whole words.

def get_answer_clean(start_scores, end_scores, input_ids):
    """Return the answer span with `##` sub-word tokens recombined.

    The span runs from the position with the highest start score to the
    position with the highest end score, inclusive.

    Args:
        start_scores: Tensor of start scores produced by the model.
        end_scores: Tensor of end scores produced by the model.
        input_ids: Token ids that were fed to the model.

    Returns:
        The answer text. The first token is kept verbatim; each following
        token is either glued to the previous one (when it starts with `##`)
        or appended after a space.
    """
    first = int(torch.argmax(start_scores))
    last = int(torch.argmax(end_scores))
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Collect the text fragments and join once at the end.
    pieces = [tokens[first]]
    for token in tokens[first + 1:last + 1]:
        if token.startswith('##'):
            # Sub-word continuation: drop the marker, no separating space.
            pieces.append(token[2:])
        else:
            # New word: separate it from what came before.
            pieces.append(' ' + token)
    return ''.join(pieces)

In [63]:
def answer_from_question(question, input_text):
    """Answer `question` using `input_text` as the context passage.

    Chains the notebook's helpers: tokenize the pair, derive the segment ids,
    score every token with the model, then extract the cleaned answer span.

    Args:
        question: Question string.
        input_text: Context string in which the answer is searched.

    Returns:
        The answer string produced by `get_answer_clean`.
    """
    token_ids = tokenize_input_text(question, input_text)
    start_logits, end_logits = get_scores(token_ids, get_segment_ids(token_ids))
    return get_answer_clean(start_logits, end_logits, token_ids)

In [92]:
def get_first_senteces_from_sentences_list(sentences_list, nb_sentences, separator=" "):
    """Concatenate the first `nb_sentences` sentences into one string.

    Args:
        sentences_list: Sequence of sentence strings.
        nb_sentences: Number of leading sentences to keep; if the list is
            shorter, all of it is used.
        separator: String inserted between consecutive sentences. Defaults to
            a single space so that sentences do not run into each other
            (e.g. "preparedness.The report").

    Returns:
        The joined text, or an empty string when no sentence is selected.
    """
    return separator.join(sentences_list[:nb_sentences])

In [93]:
# Question put to the model. The spelling matters: this string is tokenized
# and fed to the model, so a misspelled word is not seen as the intended word.
question = "What occurred?"

# Context passage: the first 10 sentences of the article in row 0.
input_text = get_first_senteces_from_sentences_list(df_location["Sentences"][0],10)

print(input_text)

# Bare last expression so the notebook displays the extracted answer.
answer_from_question(question,input_text)

WorldNumber of wildfires to rise by 50% by 2100 and governments are not prepared, experts warn Nairobi, 23 February, 2022    Climate change and land use change are projected to make wildfires more frequent and intense, with a global increase of extreme fires of up to 14 per cent by 2030, 30 per cent by the end of 2050 and 50 per cent by the end of the century, according to a new report by the UN Environment Programme  and GRID Arendal.The paper calls for a radical change in government spending on wildfires, shifting their investments from reaction and response to prevention and preparedness.The report, Spreading like Wildfire: The Rising Threat of Extraordinary Landscape Fires, finds an elevated risk even for the Arctic and other regions previously unaffected by wildfires.The report is released before the resumed 5^th^ session of the UN Environment Assembly  convenes in Nairobi, between 28 February and 2 March, 2022.The publication calls on governments to adopt a new 'Fire Ready Formul