# Most Likely State Extraction

In [2]:
import pandas as pd
from goose3 import Goose
from goose3.configuration import Configuration

import spacy
from spacy import displacy

import geopy 
import matplotlib.pyplot as plt
from geopy.extra.rate_limiter import RateLimiter
import folium
from folium.plugins import FastMarkerCluster   
import numpy as np
from PIL import Image

import geonamescache
import time

from deep_translator import GoogleTranslator
from datetime import datetime, timedelta
import parsedatetime as pdt
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import requests
import pandas as pd
import unicodedata
from selenium import webdriver
import re

In [14]:
def loc_to_state(city, geocode, locator):
    """Resolve a place name to the state (or, failing that, the city) it lies in.

    The name is forward-geocoded with ``geocode``; the resulting coordinates are
    then reverse-geocoded with ``locator.reverse`` and the ``state`` entry of the
    returned address is used.

    Parameters
    ----------
    city : str
        Place name to look up.
    geocode : callable
        Called as ``geocode(city)``. Its result must expose the coordinates as
        ``result[1] == (latitude, longitude)``, or be ``None`` when nothing was found.
    locator : object
        Must provide ``reverse("lat,lon")`` returning an object whose
        ``raw['address']`` is a mapping, or ``None`` when nothing was found.

    Returns
    -------
    str or None
        ``address['state']`` when present, otherwise ``address['city']``,
        otherwise ``None`` (unknown place, or an address with neither key).
    """
    code = geocode(city)
    if code is None:
        # The geocoder found no match for this name.
        return None

    lat, lon = code[1]
    location = locator.reverse(f"{lat},{lon}")
    if location is None:
        return None

    address = location.raw.get('address') or {}
    state = address.get('state')
    if state is not None:
        return state
    # Some addresses carry no 'state' entry; fall back to the city name.
    return address.get('city')

def get_most_likely_state(text, nlp, list_countries, list_cities, list_states, geocode, locator, city2state = None, decay = 0.995, print_scores = False) :
    """Guess which state a text is about from the place names it mentions.

    Every entity labelled ``GPE`` by ``nlp`` votes for a state with weight
    ``decay ** ent.start`` (``ent.start`` being the entity's start offset as
    reported by ``nlp``), so earlier mentions count more when ``decay < 1``.

    * A name found in ``list_states`` votes for itself.
    * A name found in ``list_countries`` (and not in ``list_cities``) is ignored.
    * Any other name is resolved to a state through ``city2state`` or, on a
      cache miss, through ``loc_to_state``; names that cannot be resolved are
      cached as ``None`` and ignored.

    Parameters
    ----------
    text : str
        Text to analyse.
    nlp : callable
        ``nlp(text)`` must return an object with an ``ents`` iterable whose
        items expose ``text``, ``start`` and ``label_``.
    list_countries, list_cities, list_states : container of str
        Known names; only membership tests (``in``) are performed.
    geocode, locator
        Forwarded unchanged to ``loc_to_state``.
    city2state : dict, optional
        Cache mapping place name -> state. It is updated in place and
        returned. A fresh dict is created when omitted.
    decay : float
        Base of the positional weight.
    print_scores : bool
        When true, print the per-state score dictionary.

    Returns
    -------
    (state, city2state)
        ``state`` is the highest-scoring state (first one reached on ties) or
        ``None`` when no place name produced a vote.
    """
    # A mutable default would be shared silently between unrelated calls.
    if city2state is None:
        city2state = {}

    states_scores = {}
    for ent in nlp(text).ents:
        if ent.label_ != 'GPE':
            continue
        name = ent.text
        weight = decay ** ent.start

        if name in list_states:
            state = name
        elif name not in list_cities and name in list_countries:
            # Pure country mention: carries no state-level information.
            continue
        else:
            if name in city2state:
                state = city2state[name]
            else:
                try:
                    state = loc_to_state(name, geocode, locator)
                except (LookupError, TypeError, AttributeError):
                    # The lookup produced nothing usable (no geocoding result
                    # or an address without the expected keys).
                    state = None
                # Failed lookups are cached too, so the same name is not
                # sent to the geocoder again.
                city2state[name] = state
            if state is None:
                continue

        states_scores[state] = states_scores.get(state, 0) + weight

    if print_scores:
        print(states_scores)

    if not states_scores:
        return None, city2state
    return max(states_scores, key=states_scores.__getitem__), city2state

def get_states(articles, decay = 0.99, print_ = False):
    """Run ``get_most_likely_state`` over a collection of article texts.

    The spaCy model, the geonamescache name lists and the Nominatim geocoder
    are created once and reused for every article, as is the place-name ->
    state cache.

    Parameters
    ----------
    articles : iterable of str
        Texts to analyse, in order.
    decay : float
        Forwarded to ``get_most_likely_state``.
    print_ : bool
        When true, print each article, its guessed state and the elapsed time.

    Returns
    -------
    (list, dict)
        The guessed state for each article (same order as ``articles``) and
        the final place-name -> state cache.
    """
    nlp = spacy.load('en_core_web_sm')
    geo_cache = geonamescache.GeonamesCache()

    def _names(records):
        # Each geonamescache record is a dict carrying a 'name' entry.
        return [record['name'] for record in records.values()]

    list_countries = _names(geo_cache.get_countries())
    list_cities = _names(geo_cache.get_cities())
    list_states = _names(geo_cache.get_us_states())

    locator = geopy.geocoders.Nominatim(user_agent='mygeocoder')
    geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

    city2state = {}
    guessed_states = []
    for article in articles:
        started_at = time.time()
        state, city2state = get_most_likely_state(
            article, nlp, list_countries, list_cities, list_states,
            geocode, locator, city2state, decay,
        )
        guessed_states.append(state)
        if not print_:
            continue
        print('Article :', article)
        print('Most Likely State :', state)
        print('Running time (s) :', round(time.time() - started_at, 2))
        print('###')
        print('')
    return guessed_states, city2state

In [4]:
# Hand-written sample articles used to smoke-test get_states.
# NOTE(review): text2 is identical to text1 -- presumably on purpose, to show the
# effect of the shared city2state cache on the second run; confirm.
text1 = 'This text mentions Portland, Washington-DC and Boston. The action takes place in Salem. It also mentions California and France.'
text2 = 'This text mentions Portland, Washington-DC and Boston. The action takes place in Salem. It also mentions California and France.'
# Empty text and text without any place name.
text3 = ''
text4 = 'This article does not mention any particular locations. It is a technical financial report.'
# Place names outside the US.
text5 = 'This article mentions a storm close to Sochaux and Besançon. The article was written in Paris'
# A city, its state and an unrelated city in the same sentence.
text6 = "A fire occured in Los Angeles last night, it is the biggest of California's history according to the Boston police"
# Same two cities in swapped order: only the position of each mention differs.
text7 = "The police spotted a fire in Reno. A team of experts, from Phoenix, was sent to inspect the location."
text8 = "The police spotted a fire in Phoenix. A team of experts, from Reno, was sent to inspect the location."

list_texts = [text1, text2, text3, text4, text5, text6, text7, text8]
# print_ = True makes get_states print each article, its guessed state and the run time.
states, city2state = get_states(list_texts, print_ = True)

Article : This text mentions Portland, Washington-DC and Boston. The action takes place in Salem. It also mentions California and France.
Most Likely State : Oregon
Running time (s) : 3.93
###

Article : This text mentions Portland, Washington-DC and Boston. The action takes place in Salem. It also mentions California and France.
Most Likely State : Oregon
Running time (s) : 0.01
###

Article : 
Most Likely State : None
Running time (s) : 0.0
###

Article : This article does not mention any particular locations. It is a technical financial report.
Most Likely State : None
Running time (s) : 0.01
###

Article : This article mentions a storm close to Sochaux and Besançon. The article was written in Paris
Most Likely State : Bourgogne-Franche-Comté
Running time (s) : 2.95
###

Article : A fire occured in Los Angeles last night, it is the biggest of California's history according to the Boston police
Most Likely State : California
Running time (s) : 1.05
###

Article : The police spotted a

## Measure accuracy

In [5]:
def get_lists_from_subject(subject, num_pages,date_limits = None):
    """Collect article links and their dates from Google News result pages.

    A Chrome session is driven through Selenium: the search page is opened,
    the consent button is clicked, and up to ``num_pages`` result pages are
    parsed. For each ``g-card`` element the first anchor's ``href`` and the
    text of the last ``span`` (the date label) are read; the label is
    translated from French to English and parsed with ``parsedatetime``
    relative to the current time.

    Parameters
    ----------
    subject : str
        Search query. It is inserted into the URL as-is, so it should already
        be URL-safe.
    num_pages : int
        Maximum number of result pages to visit. Paging stops early when the
        "next" button cannot be clicked.
    date_limits : tuple, optional
        ``(lower_date, higher_date)``; both must expose ``day``, ``month`` and
        ``year``. When given, the search is restricted to that range.

    Returns
    -------
    (list, list)
        ``link_list`` and ``date_list``, index-aligned. Cards that have no
        link or no date label are skipped so that the two lists stay aligned.
    """
    # Imported here so the block carries its own dependency; selenium is
    # already a dependency of this notebook.
    from selenium.common.exceptions import WebDriverException

    translator = GoogleTranslator(source='fr', target='en')

    cal = pdt.Calendar()
    now = datetime.now()

    PATH = "./chromedriver.exe"

    s=Service(PATH)
    driver = webdriver.Chrome(service=s)

    link_list = []
    date_list = []
    # try/finally: the browser must be closed even when scraping fails midway.
    try:
        if date_limits is not None:
            lower_date, higher_date = date_limits
            ld, lm, ly = str(lower_date.day), str(lower_date.month), str(lower_date.year)
            hd, hm, hy = str(higher_date.day), str(higher_date.month), str(higher_date.year)
            driver.get("https://www.google.com/search?q="+subject+"&rlz=1C1CHBF_frFR863FR863&biw=1920&bih=880&sxsrf=APq-WBuYthkpiHNrhk_0YwH1w70zP27Xgg%3A1643812260630&source=lnt&tbs=cdr%3A1%2Ccd_min%3A"+lm+"%2F"+ld+"%2F"+ly+"%2Ccd_max%3A"+hm+"%2F"+hd+"%2F"+hy+"&tbm=nws")
        else:
            driver.get("https://www.google.com/search?q="+subject+"&rlz=1C1CHBF_frFR863FR863&biw=1920&bih=880&sxsrf=AOaemvI0XcPZB9YWw9GUVGwWTEXPDVqRxQ:1638967714934&source=lnms&tbm=nws&sa=X&ved=2ahUKEwjGlsnDntT0AhWTTcAKHeyuDk4Q_AUoAXoECAEQAw")

        driver.find_element(By.XPATH, "//button[@class='VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc qfvgSe']").click() #accept google policy

        for i in range(num_pages):
            if i != 0:
                try:
                    driver.find_element(By.ID, "pnnext").click()
                except WebDriverException:
                    # No (clickable) "next" button: last page reached.
                    break

            soup = BeautifulSoup(driver.page_source, 'lxml')

            # Each news result is rendered as a g-card element.
            for g_card in soup.find_all("g-card"):
                a = g_card.find("a")
                link = a.get('href') if a is not None else None
                spans = g_card.find_all("span")
                if link is None or not spans:
                    continue

                # Resolve the date before appending anything, so a failure
                # cannot leave the two lists with different lengths.
                translated_date = translator.translate(spans[-1].text)
                parsed_date = cal.parseDT(translated_date, now)[0].date()

                link_list.append(link)
                date_list.append(parsed_date)
    finally:
        driver.quit()

    print("Successfully scraped : ", len(link_list), " links")

    return link_list, date_list

def get_df_from_link_list(link_list, date_list):
    """Download each link and build a DataFrame of title, link, date and content.

    For every link the page is fetched (10 s timeout) and parsed. The row gets:

    * ``Title``   - text of the ``<title>`` tag, when present;
    * ``Link``    - the link itself;
    * ``Date``    - ``date_list[i]`` for the link at position ``i``;
    * ``Content`` - the text of all ``<p>`` tags inside the first ``<article>``
      tag, space-joined, NFKD-normalised and right-stripped, when non-empty.

    A page that fails (network error, parsing error, missing date, ...) is
    reported on stdout and still yields a row holding whatever was collected
    before the failure, so the result always has one row per link.

    Parameters
    ----------
    link_list : list of str
    date_list : list
        Index-aligned with ``link_list``.

    Returns
    -------
    pandas.DataFrame
    """
    my_timeout = 10

    data = []

    for i, link in enumerate(link_list):
        d = {}

        try:
            html_text = requests.get(link, timeout=my_timeout).text

            soup = BeautifulSoup(html_text, 'lxml')

            title = soup.find('title')
            if title is not None:
                d["Title"] = title.text

            d["Link"] = link
            d["Date"] = date_list[i]

            article = soup.find('article')
            if article is not None:
                big_p = "".join(p.text + " " for p in article.find_all('p'))

                if big_p != "":
                    d["Content"] = unicodedata.normalize("NFKD", big_p).rstrip()
        except Exception:
            # Best effort per page. `Exception` (not a bare except) so that
            # Ctrl-C / kernel interrupt still stops a long scrape.
            print('Could not scrap page number ' + str(i) + ', try again another time.')

        data.append(d)

    return pd.DataFrame(data)

def clean(text):
    """Normalise a scraped article text.

    The input is converted with ``str()`` and the following edits are applied
    in order:

    1. remove paragraph numbers: one or more digits, any single character,
       then a tab;
    2. remove a newline followed by a space, then turn every remaining
       newline into a space;
    3. remove every ``'s`` (possessives, but also the ``'s`` of contractions);
    4. turn hyphens into spaces and remove an em dash followed by a space;
    5. remove double quotes;
    6. drop the period after ``Mr.`` and ``Mrs.``;
    7. remove anything enclosed in parentheses or square brackets (shortest
       match, opening and closing bracket types may differ).

    Parameters
    ----------
    text : object
        Anything; it is converted with ``str()`` first.

    Returns
    -------
    str
    """
    text = str(text)

    # Paragraph numbers. NOTE(review): the '.' is unescaped, so any single
    # character between the digits and the tab matches, not only a literal dot.
    text = re.sub(r'[0-9]+.\t', '', text)

    # New line characters.
    text = text.replace('\n ', '')
    text = text.replace('\n', ' ')

    # Possessive / contraction "'s".
    text = text.replace("'s", '')

    # Hyphens and em dashes.
    text = text.replace("-", ' ')
    text = text.replace("— ", '')

    # Double quotation marks.
    text = text.replace('"', '')

    # Periods after salutations ("Mr." -> "Mr", "Mrs." -> "Mrs").
    text = text.replace("Mr.", 'Mr')
    text = text.replace("Mrs.", 'Mrs')

    # Bracketed asides and references. Raw string: the original non-raw
    # literal relied on invalid escape sequences such as "\(".
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)

    return text

def get_df_from_subject(subject,num_pages, date_limits = None):
    """Scrape the search results for ``subject`` and return them as a DataFrame.

    Chains ``get_lists_from_subject`` (links and dates) into
    ``get_df_from_link_list`` (one row per link); the arguments are forwarded
    unchanged to the former.
    """
    scraped = get_lists_from_subject(subject, num_pages, date_limits)
    links, dates = scraped
    return get_df_from_link_list(links, dates)

In [6]:
# Scrape 4 result pages for 'Wildfire', keep only complete rows, clean the
# article text and expose it under the name "Clean_content".
df = (
    get_df_from_subject('Wildfire', 4)
    .dropna()  # Drop missing values
    .assign(Content=lambda frame: frame["Content"].apply(clean))  # cleaning contents
    .rename(columns={"Content": "Clean_content"})
)
print(f'There are {len(df.index)} usable articles')

Successfully scraped :  40  links
There are 24 usable articles


In [7]:
# Preview the first rows of the cleaned article table.
df.head()

Unnamed: 0,Title,Link,Date,Clean_content
0,Spreading like Wildfire: The Rising Threat of ...,https://reliefweb.int/report/world/spreading-w...,2022-02-23,WorldNumber of wildfires to rise by 50% by 21...
1,Wildfires likely to increase by a third by 205...,https://www.theguardian.com/environment/2022/f...,2022-02-23,Even previously unaffected countries likely to...
2,Climate Change Could Increase Risk of Wildfire...,https://www.nytimes.com/2022/02/23/climate/cli...,2022-02-23,Advertisement Supported by Worsening heat and ...
5,British Columbia set to move to year-round BC ...,https://globalnews.ca/news/8635723/bc-wildfire...,2022-02-22,As wildfire seasons in British Columbia contin...
9,Sheriff's Office performing wildfire mitigatio...,https://www.gjsentinel.com/news/western_colora...,2022-02-23,The Mesa County Sheriff Office and Mesa Count...


In [15]:
# Manual accuracy check: for every cleaned article, show the text and the guessed
# state, then ask the reviewer whether the guess is right (1) or wrong (0).
scores = []

# Models and lookup tables are built once and reused for all articles.
nlp = spacy.load('en_core_web_sm')
gc = geonamescache.GeonamesCache()
countries = gc.get_countries()
list_countries = [x['name'] for x in countries.values()]
cities = gc.get_cities()
list_cities = [x['name'] for x in cities.values()]
us_states = gc.get_us_states()
list_states = [x['name'] for x in us_states.values()]
locator = geopy.geocoders.Nominatim(user_agent='mygeocoder')
geocode = RateLimiter(locator.geocode, min_delay_seconds=1) 
city2state = {}
states = []  # guessed state per article, aligned with `scores`
decay = 0.995

for text in df["Clean_content"]:
    print(text)
    state, city2state = get_most_likely_state(text, nlp, list_countries, list_cities, list_states, geocode, locator, city2state, decay)
    print()
    print('Guessed State :', state)

    # Re-ask until the answer is exactly 0 or 1: a stray keystroke must neither
    # crash the labelling session nor distort the accuracy.
    answer = input('Is it correct ? (1 or 0)').strip()
    while answer not in ('0', '1'):
        answer = input('Is it correct ? (1 or 0)').strip()

    states.append(state)
    scores.append(int(answer))
    print()
    print("###########")
    
print()
if scores:
    print('Accuracy :', round(np.mean(scores), 2))
else:
    # np.mean([]) would emit a warning and print nan.
    print('Accuracy : n/a (no article was labelled)')

 WorldNumber of wildfires to rise by 50% by 2100 and governments are not prepared, experts warn Nairobi, 23 February, 2022    Climate change and land use change are projected to make wildfires more frequent and intense, with a global increase of extreme fires of up to 14 per cent by 2030, 30 per cent by the end of 2050 and 50 per cent by the end of the century, according to a new report by the UN Environment Programme  and GRID Arendal. The paper calls for a radical change in government spending on wildfires, shifting their investments from reaction and response to prevention and preparedness. The report, Spreading like Wildfire: The Rising Threat of Extraordinary Landscape Fires, finds an elevated risk even for the Arctic and other regions previously unaffected by wildfires. The report is released before the resumed 5th session of the UN Environment Assembly  convenes in Nairobi, between 28 February and 2 March, 2022. The publication calls on governments to adopt a new 'Fire Ready For

Is it correct ? (1 or 0) 0



###########
Even previously unaffected countries likely to see uncontrollable blazes, says study, which calls for shift to spending on prevention Last modified on Wed 23 Feb 2022 13.50 GMT Wildfires that have devastated California, Australia and Siberia will become 50% more common by the end of the century, according to a new report that warns of uncontrollable blazes ravaging previously unaffected parts of the planet. The escalating climate crisis and land use change are driving a global increase in extreme wildfires, with a 14% increase predicted by 2030 and a 30% increase by 2050, according to a UN report involving more than 50 international researchers.  The findings suggest there should be a radical change in public spending on wildfires. The report said governments were putting their money in the wrong place by focusing on the work of emergency services when preventing fires would be a more effective approach. Wildfires are becoming an expected part of life on every continent, e

Is it correct ? (1 or 0) 0



###########

Guessed State : 西藏自治区


Is it correct ? (1 or 0) 0



###########
As wildfire seasons in British Columbia continue to start sooner and last longer the province is significantly altering the BC Wildfire Service. Finance Minister Selina Robinson announced on Monday the provincial budget would include the significant funding shift. Robinson is set to present the 2022 BC Budget on Tuesday afternoon.  Read more:   				2021 fire season ‘tremendously challenging,’ but not worst on record: BC Wildfire Service report				Additional details, including how much funding will be earmarked for the move and how many staff will be hired, will be in the budget. “It is a fundamental change, a real shift, that will see B.C. move from a reactive to a proactive approach,” Robinson said. “It will ensure that B.C. Wildfire Service has trained staff able to both prepare for and respond to wildfires.” The BC General Employees Union  has been calling on the province to make more permanent positions within the BC Wildfire Service. The jobs would allow for more time

KeyboardInterrupt: Interrupted by user

In [1]:
# Show the collected labels. `scores` is created by the labelling cell above, which
# must have been run in the current kernel first (the recorded NameError comes from
# running this cell in a fresh kernel).
scores

NameError: name 'scores' is not defined