In [28]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from streamlit_utils import *
from datetime import datetime
# Single source of truth for the checkpoint name, so the model and the
# tokenizer are always loaded from the same pretrained weights/vocabulary.
MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# Model: BERT with a question-answering head (used below to get start/end scores).
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

# Tokenizer: loaded from the same checkpoint as the model.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

def tokenize_input_text(question, input_text, max_length=512):
    """Encode a question and its context as one text pair for the QA model.

    Args:
        question: The question string (first segment of the pair).
        input_text: The context string searched for the answer (second segment).
        max_length: Upper bound on the number of token ids returned,
            special tokens included. Defaults to 512, the maximum sequence
            length of the BERT checkpoint loaded in this notebook.

    Returns:
        The list of token ids produced by ``tokenizer.encode`` for the pair.

    Only the context is truncated (``truncation='only_second'``) so the
    question always stays intact. Without truncation, a long context makes
    the model fail on sequences longer than it supports; the trade-off is
    that an answer located past the cut-off can no longer be found.
    """
    return tokenizer.encode(
        question,
        input_text,
        truncation='only_second',
        max_length=max_length,
    )
def get_segment_ids(input_ids):
    """Build the segment (token type) ids for an encoded text pair.

    Every token up to and including the first `[SEP]` token is marked 0
    (segment A); every token after it is marked 1 (segment B).

    Args:
        input_ids: List of token ids containing at least one `[SEP]` id.

    Returns:
        A list of 0s and 1s with the same length as ``input_ids``.

    Raises:
        ValueError: If ``input_ids`` contains no `[SEP]` token id
            (raised by ``list.index``).
    """
    # Length of segment A: position of the first [SEP], plus the [SEP] itself.
    seg_a_len = input_ids.index(tokenizer.sep_token_id) + 1

    segment_ids = [
        0 if position < seg_a_len else 1
        for position in range(len(input_ids))
    ]

    # There should be a segment id for every input token.
    assert len(segment_ids) == len(input_ids)

    return segment_ids
def get_scores(input_ids,segment_ids):
    """Run the QA model on one encoded example and return its span scores.

    Args:
        input_ids: List of token ids representing the question/context pair.
        segment_ids: List of segment ids (same length as ``input_ids``)
            differentiating the question from the context.

    Returns:
        The ``(start_scores, end_scores)`` pair returned by ``model`` when
        called with ``return_dict=False``.
    """
    # Wrapping each list in another list adds a leading batch dimension of 1.
    token_tensor = torch.tensor([input_ids])
    segment_tensor = torch.tensor([segment_ids])

    # Pure inference: disable autograd so no graph is built and the returned
    # scores do not keep references to intermediate activations.
    with torch.no_grad():
        start_scores, end_scores = model(token_tensor,
                                         token_type_ids=segment_tensor,
                                         return_dict=False)
    return start_scores, end_scores
def get_answer(start_scores,end_scores, input_ids):
    """Return the raw answer span as whitespace-joined word-piece tokens.

    The span runs from the position with the highest start score to the
    position with the highest end score (inclusive). Sub-word pieces are
    not merged, so `##` prefixes remain in the output. If the best end
    position precedes the best start position the result is an empty string.

    Args:
        start_scores: Tensor of start scores for the encoded input.
        end_scores: Tensor of end scores for the encoded input.
        input_ids: List of token ids that was fed to the model.

    Returns:
        The selected tokens joined with single spaces.
    """
    first = int(torch.argmax(start_scores))
    last = int(torch.argmax(end_scores))

    selected = tokenizer.convert_ids_to_tokens(input_ids)[first:last + 1]
    return ' '.join(selected)
# Better version: same span selection as get_answer, but sub-word pieces are merged.
def get_answer_clean(start_scores, end_scores, input_ids):
    """Return the answer span with word pieces recombined into whole words.

    The span starts at the position with the highest start score and ends at
    the position with the highest end score (inclusive). The first token is
    taken as-is; each following token is appended directly when it is a
    `##` continuation piece, and after a space otherwise. If the best end
    position is not after the best start position, only the start token is
    returned.

    Args:
        start_scores: Tensor of start scores for the encoded input.
        end_scores: Tensor of end scores for the encoded input.
        input_ids: List of token ids that was fed to the model.

    Returns:
        The reconstructed answer string.
    """
    first = int(torch.argmax(start_scores))
    last = int(torch.argmax(end_scores))
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    pieces = [tokens[first]]
    for position in range(first + 1, last + 1):
        token = tokens[position]
        if token.startswith('##'):
            # Continuation of the previous word: glue it on without the marker.
            pieces.append(token[2:])
        else:
            # New word: separate it from the previous one with a space.
            pieces.append(' ' + token)

    return ''.join(pieces)
def answer_from_question(question, input_text):
    """Answer ``question`` from ``input_text`` with the QA model.

    Chains the helpers of this notebook: tokenize the pair, derive the
    segment ids, score the tokens, then rebuild the best span as text.

    Args:
        question: The question string.
        input_text: The context string to extract the answer from.

    Returns:
        The extracted answer, or the string "No answer found" when the
        selected span contains the `[CLS]` token.
    """
    token_ids = tokenize_input_text(question, input_text)
    type_ids = get_segment_ids(token_ids)
    start_scores, end_scores = get_scores(token_ids, type_ids)

    answer = get_answer_clean(start_scores, end_scores, token_ids)

    # A span that includes [CLS] is treated as "no answer".
    return "No answer found" if '[CLS]' in answer else answer
def get_first_senteces_from_sentences_list(sentences_list, nb_sentences):
    """Concatenate the first ``nb_sentences`` sentences into one string.

    The sentences are joined with no separator, so any spacing between
    them must already be part of the sentence strings.

    Args:
        sentences_list: List of sentence strings.
        nb_sentences: Number of leading sentences to keep.

    Returns:
        The concatenation of ``sentences_list[:nb_sentences]``.
    """
    leading_sentences = sentences_list[:nb_sentences]
    return "".join(leading_sentences)
def add_QA_location_to_df(df_location, subject, nb_sentences=9):
    """Add a "QA_location" column answering where the subject occurred.

    For every row, the first ``nb_sentences`` entries of the "Sentences"
    column are concatenated and used as context for the question
    "Where did the <subject> occur?".

    Args:
        df_location: DataFrame with a "Sentences" column holding, per row,
            a list of sentence strings. It is modified in place.
        subject: Event name inserted into the question (e.g. "wildfire").
        nb_sentences: Number of leading sentences used as context.

    Returns:
        The same ``df_location`` object, with the "QA_location" column set.
    """
    question = "Where did the " + subject + " occur?"

    def locate(sentences):
        context = get_first_senteces_from_sentences_list(sentences, nb_sentences)
        return answer_from_question(question, context)

    # All answers are computed before the frame is touched, so a failure in
    # the QA step does not leave a half-initialised column behind.
    df_location["QA_location"] = df_location["Sentences"].apply(locate)
    return df_location
def add_QA_impact_to_df(df_location, subject, nb_sentences=10):
    """Add a "QA_impact" column answering what the subject's impact was.

    For every row, the first ``nb_sentences`` entries of the "Sentences"
    column are concatenated and used as context for the question
    "What were the impact of the <subject> ?".

    Args:
        df_location: DataFrame with a "Sentences" column holding, per row,
            a list of sentence strings. It is modified in place.
        subject: Event name inserted into the question (e.g. "wildfire").
        nb_sentences: Number of leading sentences used as context.

    Returns:
        The same ``df_location`` object, with the "QA_impact" column set.
    """
    # NOTE(review): the question wording is kept byte-for-byte (including
    # "were") so results stay comparable with earlier runs.
    question = "What were the impact of the " + subject + " ?"

    def find_impact(sentences):
        context = get_first_senteces_from_sentences_list(sentences, nb_sentences)
        return answer_from_question(question, context)

    # All answers are computed before the frame is touched, so a failure in
    # the QA step does not leave a half-initialised column behind.
    df_location["QA_impact"] = df_location["Sentences"].apply(find_impact)
    return df_location
def add_QA_cause_to_df(df_location, subject, nb_sentences=10):
    """Add a "QA_cause" column answering what caused the subject.

    For every row, the first ``nb_sentences`` entries of the "Sentences"
    column are concatenated and used as context for the question
    "What caused the <subject> ?".

    Args:
        df_location: DataFrame with a "Sentences" column holding, per row,
            a list of sentence strings. It is modified in place.
        subject: Event name inserted into the question (e.g. "wildfire").
        nb_sentences: Number of leading sentences used as context.

    Returns:
        The same ``df_location`` object, with the "QA_cause" column set.
    """
    question = "What caused the " + subject + " ?"

    def find_cause(sentences):
        context = get_first_senteces_from_sentences_list(sentences, nb_sentences)
        return answer_from_question(question, context)

    # All answers are computed before the frame is touched, so a failure in
    # the QA step does not leave a half-initialised column behind.
    df_location["QA_cause"] = df_location["Sentences"].apply(find_cause)
    return df_location



In [34]:
def evaluate_QA(df_location_QA):
    """Score the QA location answers against the reference locations.

    For every row, spaCy named-entity recognition is run on the
    "QA_location" answer; each (reference location, `GPE` entity) pair whose
    texts are equal ignoring case adds one point. The total is divided by
    the number of rows.

    Args:
        df_location_QA: DataFrame with a "QA_location" column (answer
            strings) and a "Location" column (per row, a list of reference
            location strings).

    Returns:
        Total number of matches divided by the number of rows, or 0.0 for
        an empty frame.

    NOTE(review): a row can contribute more than one point (several matching
    entities or repeated reference locations), so the value is an average
    match count rather than a strict accuracy — confirm this is intended.
    """
    answers = df_location_QA["QA_location"]
    reference_locations = df_location_QA["Location"]

    # Nothing to evaluate: avoid loading the model and dividing by zero.
    if len(answers) == 0:
        return 0.0

    nlp = spacy.load('en_core_web_sm')  # Loading English NLP model

    score = 0

    # Iterate by position (zip) rather than by label, so frames whose index
    # is not 0..n-1 (e.g. after filtering) are handled correctly.
    for answer, ner_loc in zip(answers, reference_locations):
        if len(ner_loc) == 0:
            continue

        # Parse each answer once; the result does not depend on the
        # reference location being compared.
        doc = nlp(answer)
        gpe_texts = [ent.text.lower() for ent in doc.ents if ent.label_ == 'GPE']

        for location in ner_loc:
            # Both sides are lower-cased so the comparison ignores case.
            score += gpe_texts.count(location.lower())

    return score / len(answers)



In [30]:
# Settings for a single-subject run. date_limits is handed to the scraping
# helper below (presumably as a publication-date window — TODO confirm).
min_date = datetime(2020,1,1)
max_date = datetime(2021,1,1)
date_limits = (min_date, max_date)
subject = "wildfire"
nb_pages = 1


# get_locations_df_from_subject is not defined in this notebook (presumably it
# comes from the `streamlit_utils` star import — TODO confirm). Judging by the
# output stored under this cell, it scrapes articles and returns a DataFrame
# that includes "Sentences" and "Location" columns.
df_location = get_locations_df_from_subject(subject, nb_pages, date_limits)


# Ask the QA model where the event occurred, using the first nb_sentences
# sentences of each article as context; the answers go in "QA_location".
nb_sentences = 9
df_location = add_QA_location_to_df(df_location, subject, nb_sentences)
df_location



Successfully scraped :  10  links
Could not scrap page number 8, try again another time.
Done scraping
There are 5 usable articles


Unnamed: 0,Title,Link,Date,Clean_content,Sentences,Location,QA_location
0,How climate change-driven wildfires are changi...,https://www.cnn.com/2020/08/24/weather/califor...,2020-08-24,"By Ray Sanchez, Brandon Miller and Judson Jone...","[By Ray Sanchez, Brandon Miller and Judson Jon...",[],the golden state
1,California wildfires 2020: Why the current fir...,https://www.vox.com/2020/8/21/21377181/califor...,2020-08-26,"Dry lightning, extreme heat, and Covid 19 are ...","[Dry lightning, extreme heat, and Covid 19 are...","[California, San Francisco, Santa Clara, Alame...",southern san francisco bay area
2,Gender-reveal party culprit in massive Califor...,https://globalnews.ca/news/7320113/gender-reve...,2020-09-07,A firework at a gender reveal party triggered ...,[A firework at a gender reveal party triggered...,"[California, Yucaipa, Los Angeles, Mountain Ho...",southern california
3,Gender reveal party sparks large Californian w...,https://www.abc.net.au/news/2020-09-08/gender-...,2020-09-07,On Saturday morning an expectant couple trekke...,[On Saturday morning an expectant couple trekk...,"[California, Yucaipa, US, Arizona, Dickey, Kno...",el ranch dorado park
4,Australia wildfires: Here's what you need to k...,https://www.cnn.com/2020/01/01/australia/austr...,2020-01-13,"By Jessie Yeung, CNN Updated 0202 GMT Januar...","[By Jessie Yeung, CNN Updated 0202 GMT Janua...",[Australia],australia


In [None]:
# NOTE(review): this cell has no execution count, and the output stored below
# it lists pairs of place names that the current evaluate_QA never prints, so
# it appears to come from an earlier version of the function — re-run to refresh.
print(evaluate_QA(df_location))

indonesia west java
indonesia sumatra
indonesia indonesia
indonesia jakarta
indonesia rome
indonesia atlanta
indonesia georgia
indonesia indonesia
indonesia tarpaulin
indonesia indonesia
indonesia new zealand
indonesia sulawesi
indonesia palang
cigondong jakarta
jakarta jakarta
indonesia jakarta
cigondong indonesia
jakarta indonesia
indonesia indonesia
cigondong jakarta
jakarta jakarta
indonesia jakarta
cigondong the sunda strait
jakarta the sunda strait
indonesia the sunda strait
cigondong japan
jakarta japan
indonesia japan
cigondong krakatoa
jakarta krakatoa
indonesia krakatoa
cigondong philippines
jakarta philippines
indonesia philippines
cigondong the united states
jakarta the united states
indonesia the united states
cigondong alaska
jakarta alaska
indonesia alaska
cigondong hawaii
jakarta hawaii
indonesia hawaii
cigondong washington
jakarta washington
indonesia washington
cigondong cigondong
jakarta cigondong
indonesia cigondong
cigondong krakatau
jakarta krakatau
indonesia krak

In [40]:
# Evaluate the QA location extraction on several event types.
subject_list = ["wildfire", "tsunami", "earthquake"]

# Maps each subject to the value returned by evaluate_QA for it.
score_dict = {}

# Settings for this run; note that they overwrite the notebook-level values
# set by the single-subject cell (different dates, nb_pages and nb_sentences).
min_date = datetime(2021,1,1)
max_date = datetime(2022,1,1)
date_limits = (min_date, max_date)
nb_pages = 10
nb_sentences = 5

for subject in subject_list:
    
    # Scrape the articles for this subject (helper defined outside this notebook).
    df_location = get_locations_df_from_subject(subject, nb_pages, date_limits)
   
    # Add the "QA_location" answers that evaluate_QA compares against "Location".
    df_location = add_QA_location_to_df(df_location, subject, nb_sentences)
    # NOTE(review): `success` is assigned but never read in the visible cells.
    success = True
        

    
    score_dict[subject] = evaluate_QA(df_location)

Successfully scraped :  100  links
Could not scrap page number 23, try again another time.
Could not scrap page number 46, try again another time.
Done scraping
There are 64 usable articles
Successfully scraped :  100  links
Could not scrap page number 2, try again another time.
Could not scrap page number 3, try again another time.
Done scraping
There are 58 usable articles
Successfully scraped :  100  links
Could not scrap page number 26, try again another time.
Could not scrap page number 42, try again another time.
Could not scrap page number 93, try again another time.
Done scraping
There are 57 usable articles


In [41]:
# Display the evaluate_QA score collected for each subject.
score_dict

{'wildfire': 0.515625,
 'tsunami': 0.5344827586206896,
 'earthquake': 0.42105263157894735}