In [1]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from streamlit_utils import *
from datetime import datetime
# Single source of truth for the checkpoint: the model and the tokenizer must
# come from the same pretrained weights/vocabulary, so the name is written once.
QA_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'

#Model
model = BertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

#Tokenizer
tokenizer = BertTokenizer.from_pretrained(QA_CHECKPOINT)


In [2]:
# Apply the tokenizer to the input text, treating them as a text-pair.
def tokenize_input_text(question, input_text, max_length=512):
    """Encode a question/context pair into one list of token ids.

    The pair is handed to the module-level ``tokenizer`` as (text, text_pair).
    Truncation is enabled so that a long context cannot produce more ids than
    the model can consume; only the second sequence (the context) is shortened,
    the question is kept whole.

    Args:
        question: The question string (first sequence of the pair).
        input_text: The context string searched for the answer (second sequence).
        max_length: Upper bound on the number of returned ids, forwarded to
            ``tokenizer.encode``. Defaults to 512, the position limit of the
            BERT checkpoints; inputs that already fit are encoded unchanged.

    Returns:
        Whatever ``tokenizer.encode`` returns for the pair (a list of token ids).
    """
    input_ids = tokenizer.encode(
        question,
        input_text,
        truncation="only_second",  # never cut into the question itself
        max_length=max_length,
    )

    return input_ids

In [3]:
def get_segment_ids(input_ids):
    """Build the list of segment ids (0 = segment A, 1 = segment B) for ``input_ids``.

    Everything up to and including the first ``[SEP]`` token is labelled 0;
    every later position is labelled 1.

    Args:
        input_ids: Sequence of token ids supporting ``.index`` and ``len``.

    Returns:
        A list of 0/1 values with one entry per input token.

    Raises:
        ValueError: If ``tokenizer.sep_token_id`` does not occur in ``input_ids``.
    """
    # Locate the first [SEP]; it is the last position that belongs to segment A.
    first_sep = input_ids.index(tokenizer.sep_token_id)
    boundary = first_sep + 1

    # Positions before the boundary are segment A, the rest are segment B.
    segment_ids = [0 if position < boundary else 1
                   for position in range(len(input_ids))]

    # Sanity check: exactly one segment id per input token.
    assert len(segment_ids) == len(input_ids)

    return segment_ids

In [4]:
def get_scores(input_ids, segment_ids):
    """Run the module-level QA ``model`` on one encoded question/context pair.

    Args:
        input_ids: Token ids representing the input text.
        segment_ids: Segment ids (same length as ``input_ids``) that
            differentiate the question from the answer text.

    Returns:
        The ``(start_scores, end_scores)`` pair produced by the model.
    """
    # Wrapping each list in another list adds the leading batch dimension.
    token_tensor = torch.tensor([input_ids])
    segment_tensor = torch.tensor([segment_ids])

    # Pure inference: disabling autograd avoids building a graph that is never
    # used, which saves memory and returns tensors detached from any graph.
    with torch.no_grad():
        start_scores, end_scores = model(
            token_tensor,
            token_type_ids=segment_tensor,
            return_dict=False,  # get a plain tuple that can be unpacked directly
        )
    return start_scores, end_scores

In [5]:
# Pick the answer span delimited by the best-scoring start and end tokens.
def get_answer(start_scores, end_scores, input_ids):
    """Return the raw answer span as space-separated tokens.

    The span runs from the arg-max of ``start_scores`` to the arg-max of
    ``end_scores`` (inclusive). Tokens are joined with single spaces as-is,
    so subword pieces keep their ``##`` prefix. An end position before the
    start position yields an empty string.

    Args:
        start_scores: Tensor of per-token start scores.
        end_scores: Tensor of per-token end scores.
        input_ids: Token ids the scores were computed for.

    Returns:
        The selected tokens joined by ``' '``.
    """
    span_begin = int(torch.argmax(start_scores))
    span_stop = int(torch.argmax(end_scores)) + 1  # slice end is exclusive

    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    return ' '.join(all_tokens[span_begin:span_stop])

In [6]:
# Variant of get_answer that glues subword pieces back onto their word.

def get_answer_clean(start_scores, end_scores, input_ids):
    """Return the answer span with ``##`` subword tokens merged into words.

    The span starts at the arg-max of ``start_scores`` and ends at the arg-max
    of ``end_scores`` (inclusive). The start token is always emitted; each
    following token is appended directly (prefix stripped) when it begins with
    ``##`` and after a single space otherwise. If the end position is not after
    the start position, only the start token is returned.

    Args:
        start_scores: Tensor of per-token start scores.
        end_scores: Tensor of per-token end scores.
        input_ids: Token ids the scores were computed for.

    Returns:
        The reconstructed answer string.
    """
    first = int(torch.argmax(start_scores))
    last = int(torch.argmax(end_scores))
    wordpieces = tokenizer.convert_ids_to_tokens(input_ids)

    # The start token goes in unchanged; the rest are attached one by one.
    fragments = [wordpieces[first]]
    for piece in wordpieces[first + 1:last + 1]:
        if piece.startswith('##'):
            # Continuation of the previous token: drop the marker, no space.
            fragments.append(piece[2:])
        else:
            # A new word: separate it from what came before.
            fragments.append(' ' + piece)
    return ''.join(fragments)

In [7]:
def answer_from_question(question, input_text):
    """Answer ``question`` from ``input_text`` using the helper pipeline.

    Steps: tokenize the pair, derive segment ids, score every token as a
    possible start/end, then rebuild the answer text from the best span.

    Args:
        question: The question to ask.
        input_text: The text in which the answer is searched.

    Returns:
        The extracted answer string, or ``"No answer found"`` when the
        extracted text contains the literal ``[CLS]`` marker.
    """
    token_ids = tokenize_input_text(question, input_text)
    start_scores, end_scores = get_scores(token_ids, get_segment_ids(token_ids))
    answer = get_answer_clean(start_scores, end_scores, token_ids)

    # A span that includes the [CLS] marker is reported as "no answer".
    return "No answer found" if '[CLS]' in answer else answer

In [8]:
def get_first_senteces_from_sentences_list(sentences_list, nb_sentences):
    """Concatenate the first ``nb_sentences`` items of ``sentences_list``.

    The items are joined with no separator, so any spacing between sentences
    must already be part of the items themselves.

    Args:
        sentences_list: Sequence of strings (presumably one per sentence).
        nb_sentences: Slice bound; items ``[:nb_sentences]`` are kept.

    Returns:
        A single string made of the selected items.
    """
    leading_sentences = sentences_list[:nb_sentences]
    return "".join(leading_sentences)

In [9]:
def add_QA_location_to_df(df_location, subject, nb_sentences=9):
    """Add a ``QA_location`` column answering where ``subject`` occurred.

    For every row, the first ``nb_sentences`` entries of the ``Sentences``
    column are concatenated and used as context for the question
    "Where did the <subject> occur?".

    Args:
        df_location: Frame with a ``Sentences`` column; modified in place.
        subject: Event name inserted into the question (string).
        nb_sentences: Number of leading sentences used as context.

    Returns:
        The same ``df_location`` object, now carrying ``QA_location``.
    """
    question = "Where did the " + subject + " occur?"

    def locate(sentences):
        # Only the opening sentences of the row serve as context.
        context = get_first_senteces_from_sentences_list(sentences, nb_sentences)
        return answer_from_question(question, context)

    # Create the column up front, then fill it with the per-row answers.
    df_location["QA_location"] = None
    df_location["QA_location"] = df_location["Sentences"].apply(locate)
    return df_location

In [10]:
def add_QA_impact_to_df(df_location, subject, nb_sentences=10):
    """Add a ``QA_impact`` column answering what the impact of ``subject`` was.

    For every row, the first ``nb_sentences`` entries of the ``Sentences``
    column are concatenated and used as context for the impact question.

    Args:
        df_location: Frame with a ``Sentences`` column; modified in place.
        subject: Event name inserted into the question (string).
        nb_sentences: Number of leading sentences used as context.

    Returns:
        The same ``df_location`` object, now carrying ``QA_impact``.
    """
    question = "What were the impact of the " + subject + " ?"

    def find_impact(sentences):
        # Only the opening sentences of the row serve as context.
        context = get_first_senteces_from_sentences_list(sentences, nb_sentences)
        return answer_from_question(question, context)

    # Create the column up front, then fill it with the per-row answers.
    df_location["QA_impact"] = None
    df_location["QA_impact"] = df_location["Sentences"].apply(find_impact)
    return df_location

In [11]:
def add_QA_cause_to_df(df_location, subject, nb_sentences=10):
    """Add a ``QA_cause`` column answering what caused ``subject``.

    For every row, the first ``nb_sentences`` entries of the ``Sentences``
    column are concatenated and used as context for the cause question.

    Args:
        df_location: Frame with a ``Sentences`` column; modified in place.
        subject: Event name inserted into the question (string).
        nb_sentences: Number of leading sentences used as context.

    Returns:
        The same ``df_location`` object, now carrying ``QA_cause``.
    """
    question = "What caused the " + subject + " ?"

    def find_cause(sentences):
        # Only the opening sentences of the row serve as context.
        context = get_first_senteces_from_sentences_list(sentences, nb_sentences)
        return answer_from_question(question, context)

    # Create the column up front, then fill it with the per-row answers.
    df_location["QA_cause"] = None
    df_location["QA_cause"] = df_location["Sentences"].apply(find_cause)
    return df_location

In [12]:
# What to search for and how many result pages to request.
subject = "tsunami"
nb_pages = 4

# Date window handed to the scraper as a (min_date, max_date) pair.
min_date = datetime(2011, 1, 1)
max_date = datetime(2012, 1, 1)
date_limits = (min_date, max_date)

df_location = get_locations_df_from_subject(
    subject,
    nb_pages,
    date_limits,
)

df_location



Successfully scraped :  40  links
Done scraping
There are 15 usable articles


Unnamed: 0,Title,Link,Date,Clean_content,Sentences,Location
0,"California harbor 'destroyed' by tsunami, but ...",https://www.csmonitor.com/USA/2011/0311/Califo...,2011-03-11,"Loading... March 11, 2011 The tsunami waves u...","[Loading... March 11, 2011 The tsunami waves ...","[Japan, America, Crescent City, Calif., Oregon..."
1,Tsunami Causes Heavy Damage to Crescent City H...,https://www.nbcbayarea.com/news/local/tsunami-...,2011-03-11,The power of an 8.9 earthquake in Japan genera...,[The power of an 8.9 earthquake in Japan gener...,"[Japan, California, Santa Cruz, Crescent City,..."
2,Tsunamis in the United States have hit Crescen...,https://slate.com/news-and-politics/2011/03/ts...,2011-03-11,The tsunami capital of the continental United ...,[The tsunami capital of the continental United...,"[United States, Crescent City, Calif., Oregon,..."
3,"Hawaii County surveys tsunami damage, Kona hit...",https://www.bigislandvideonews.com/2011/03/11/...,2011-03-11,Media release | Hawaii County Office of the Ma...,[Media release | Hawaii County Office of the M...,"[the Hawaiian Islands, the County of, Japan, W..."
4,Gillard sees tsunami devastation first hand - ...,https://www.abc.net.au/news/2011-04-23/gillard...,2011-04-22,Prime Minister Julia Gillard has visited a coa...,[Prime Minister Julia Gillard has visited a co...,"[Japan, Minami Sanriku, Australia, Sydney, Min..."
5,"Hawaii says tsunami damage cost $30.6 million,...",https://www.bigislandvideonews.com/2011/03/24/...,2011-03-24,Media release | Hawaii Office of the Governor ...,[Media release | Hawaii Office of the Governor...,"[Honolulu, Hawaii Island, the Big Island, Hawa..."
6,\n Proposed GOP budget cuts target tsunami ...,https://www.cbsnews.com/news/proposed-gop-budg...,2011-03-11,"By Lucy Madison March 11,...","[By Lucy Madison March 11, 2011 / 4:...","[Japan, Hawaii]"
7,Midway's Albatrosses Survive the Tsunami | WIRED,https://www.wired.com/2011/03/albatrosses-tsun...,2011-03-15,"Brandon Keim To revist this article, visit My ...","[Brandon Keim To revist this article, visit My...","[Honolulu, Tokyo, U.S., Alaska, Leary]"
8,"Crescent City, Calif., Recovers From Its Own T...",https://www.nytimes.com/2011/03/17/us/17cresce...,2011-03-16,Advertisement Supported by Crescent City Journ...,[Advertisement Supported by Crescent City Jour...,"[CRESCENT CITY, Calif., Crescent City, Tsunami..."
9,Oregon Coast tsunami: Serious damage reports f...,https://www.oregonlive.com/pacific-northwest-n...,2011-03-11,Reports of serious damage are coming from the ...,[Reports of serious damage are coming from the...,"[Japan, Tsunami, Oregon, Curry County, Gold Be..."


In [13]:
# Answer the "where" question using the first `nb_sentences` sentences of each row.
nb_sentences = 9
df_location = add_QA_location_to_df(
    df_location,
    subject,
    nb_sentences=nb_sentences,
)
df_location

Unnamed: 0,Title,Link,Date,Clean_content,Sentences,Location,QA_location
0,"California harbor 'destroyed' by tsunami, but ...",https://www.csmonitor.com/USA/2011/0311/Califo...,2011-03-11,"Loading... March 11, 2011 The tsunami waves u...","[Loading... March 11, 2011 The tsunami waves ...","[Japan, America, Crescent City, Calif., Oregon...",japan coast
1,Tsunami Causes Heavy Damage to Crescent City H...,https://www.nbcbayarea.com/news/local/tsunami-...,2011-03-11,The power of an 8.9 earthquake in Japan genera...,[The power of an 8.9 earthquake in Japan gener...,"[Japan, California, Santa Cruz, Crescent City,...",northern california coast
2,Tsunamis in the United States have hit Crescen...,https://slate.com/news-and-politics/2011/03/ts...,2011-03-11,The tsunami capital of the continental United ...,[The tsunami capital of the continental United...,"[United States, Crescent City, Calif., Oregon,...","crescent city , calif ."
3,"Hawaii County surveys tsunami damage, Kona hit...",https://www.bigislandvideonews.com/2011/03/11/...,2011-03-11,Media release | Hawaii County Office of the Ma...,[Media release | Hawaii County Office of the M...,"[the Hawaiian Islands, the County of, Japan, W...",the hawaiian islands
4,Gillard sees tsunami devastation first hand - ...,https://www.abc.net.au/news/2011-04-23/gillard...,2011-04-22,Prime Minister Julia Gillard has visited a coa...,[Prime Minister Julia Gillard has visited a co...,"[Japan, Minami Sanriku, Australia, Sydney, Min...",send
5,"Hawaii says tsunami damage cost $30.6 million,...",https://www.bigislandvideonews.com/2011/03/24/...,2011-03-24,Media release | Hawaii Office of the Governor ...,[Media release | Hawaii Office of the Governor...,"[Honolulu, Hawaii Island, the Big Island, Hawa...",the big island
6,\n Proposed GOP budget cuts target tsunami ...,https://www.cbsnews.com/news/proposed-gop-budg...,2011-03-11,"By Lucy Madison March 11,...","[By Lucy Madison March 11, 2011 / 4:...","[Japan, Hawaii]",japan
7,Midway's Albatrosses Survive the Tsunami | WIRED,https://www.wired.com/2011/03/albatrosses-tsun...,2011-03-15,"Brandon Keim To revist this article, visit My ...","[Brandon Keim To revist this article, visit My...","[Honolulu, Tokyo, U.S., Alaska, Leary]",midway
8,"Crescent City, Calif., Recovers From Its Own T...",https://www.nytimes.com/2011/03/17/us/17cresce...,2011-03-16,Advertisement Supported by Crescent City Journ...,[Advertisement Supported by Crescent City Jour...,"[CRESCENT CITY, Calif., Crescent City, Tsunami...",japan
9,Oregon Coast tsunami: Serious damage reports f...,https://www.oregonlive.com/pacific-northwest-n...,2011-03-11,Reports of serious damage are coming from the ...,[Reports of serious damage are coming from the...,"[Japan, Tsunami, Oregon, Curry County, Gold Be...",japan


In [14]:
# Answer the "impact" question using the first `nb_sentences` sentences of each row.
nb_sentences = 9
df_location = add_QA_impact_to_df(
    df_location,
    subject,
    nb_sentences=nb_sentences,
)
df_location

Unnamed: 0,Title,Link,Date,Clean_content,Sentences,Location,QA_location,QA_impact
0,"California harbor 'destroyed' by tsunami, but ...",https://www.csmonitor.com/USA/2011/0311/Califo...,2011-03-11,"Loading... March 11, 2011 The tsunami waves u...","[Loading... March 11, 2011 The tsunami waves ...","[Japan, America, Crescent City, Calif., Oregon...",japan coast,destroyed boats and docks
1,Tsunami Causes Heavy Damage to Crescent City H...,https://www.nbcbayarea.com/news/local/tsunami-...,2011-03-11,The power of an 8.9 earthquake in Japan genera...,[The power of an 8.9 earthquake in Japan gener...,"[Japan, California, Santa Cruz, Crescent City,...",northern california coast,"a wave at 8 : 44 a . m . , measured at 8 . 1 f..."
2,Tsunamis in the United States have hit Crescen...,https://slate.com/news-and-politics/2011/03/ts...,2011-03-11,The tsunami capital of the continental United ...,[The tsunami capital of the continental United...,"[United States, Crescent City, Calif., Oregon,...","crescent city , calif .",minimal
3,"Hawaii County surveys tsunami damage, Kona hit...",https://www.bigislandvideonews.com/2011/03/11/...,2011-03-11,Media release | Hawaii County Office of the Ma...,[Media release | Hawaii County Office of the M...,"[the Hawaiian Islands, the County of, Japan, W...",the hawaiian islands,millions of dollars
4,Gillard sees tsunami devastation first hand - ...,https://www.abc.net.au/news/2011-04-23/gillard...,2011-04-22,Prime Minister Julia Gillard has visited a coa...,[Prime Minister Julia Gillard has visited a co...,"[Japan, Minami Sanriku, Australia, Sydney, Min...",send,the tsunami claimed thousands of lives in this...
5,"Hawaii says tsunami damage cost $30.6 million,...",https://www.bigislandvideonews.com/2011/03/24/...,2011-03-24,Media release | Hawaii Office of the Governor ...,[Media release | Hawaii Office of the Governor...,"[Honolulu, Hawaii Island, the Big Island, Hawa...",the big island,No answer found
6,\n Proposed GOP budget cuts target tsunami ...,https://www.cbsnews.com/news/proposed-gop-budg...,2011-03-11,"By Lucy Madison March 11,...","[By Lucy Madison March 11, 2011 / 4:...","[Japan, Hawaii]",japan,massive tsunami along the nation east coast
7,Midway's Albatrosses Survive the Tsunami | WIRED,https://www.wired.com/2011/03/albatrosses-tsun...,2011-03-15,"Brandon Keim To revist this article, visit My ...","[Brandon Keim To revist this article, visit My...","[Honolulu, Tokyo, U.S., Alaska, Leary]",midway,nobody was hurt
8,"Crescent City, Calif., Recovers From Its Own T...",https://www.nytimes.com/2011/03/17/us/17cresce...,2011-03-16,Advertisement Supported by Crescent City Journ...,[Advertisement Supported by Crescent City Jour...,"[CRESCENT CITY, Calif., Crescent City, Tsunami...",japan,damage is believed to be in the tens of millio...
9,Oregon Coast tsunami: Serious damage reports f...,https://www.oregonlive.com/pacific-northwest-n...,2011-03-11,Reports of serious damage are coming from the ...,[Reports of serious damage are coming from the...,"[Japan, Tsunami, Oregon, Curry County, Gold Be...",japan,70 percent of the port commercial basin was de...
