In [13]:
from __future__ import unicode_literals, print_function

import re
import time
import unicodedata
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import folium
import geonamescache
import geopy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import parsedatetime as pdt
import plotly.express as px
import requests
import spacy
import visualise_spacy_tree
from bs4 import BeautifulSoup
from deep_translator import GoogleTranslator
from folium.plugins import FastMarkerCluster
from folium.plugins import HeatMap
from geopy.extra.rate_limiter import RateLimiter
from IPython.display import Image, display
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from spacy import displacy
from spacy.lang.en import English # updated
from spacy.matcher import Matcher

In [14]:
#Scraping Google News search results for a subject with Selenium and Beautifulsoup
def get_lists_from_subject(subject, num_pages, date_limits = None,
                           driver_path = "./chromedriver_win32/chromedriver.exe"):
    """Collect article links and their displayed dates from Google News search results.

    Parameters
    ----------
    subject : str
        Search query. It is URL-quoted before being placed in the search URL;
        ``+`` and ``%`` are left untouched so an already-encoded query keeps working.
    num_pages : int
        Maximum number of result pages to read. Paging stops early as soon as
        the "next page" element (id ``pnnext``) cannot be clicked.
    date_limits : tuple, optional
        ``(lower_date, higher_date)``; both objects must expose ``.day``,
        ``.month`` and ``.year``. When given, the search is restricted to that
        custom date range.
    driver_path : str, optional
        Path of the chromedriver executable used to start Chrome.

    Returns
    -------
    tuple of (list, list)
        ``link_list`` holds the ``href`` of the first anchor of every result
        card, ``date_list`` the matching ``datetime.date`` parsed from the
        card's last ``<span>``. Both lists have the same length and order.

    Raises
    ------
    selenium.common.exceptions.WebDriverException
        If the page cannot be loaded or the consent button is not found.
    """
    translator = GoogleTranslator(source='fr', target='en')
    cal = pdt.Calendar()
    now = datetime.now()

    # Escape characters such as '&', '#' or spaces that would otherwise corrupt the query string.
    query = quote_plus(subject, safe="+%")

    if date_limits is not None:
        lower_date, higher_date = date_limits
        ld, lm, ly = str(lower_date.day), str(lower_date.month), str(lower_date.year)
        hd, hm, hy = str(higher_date.day), str(higher_date.month), str(higher_date.year)
        # The custom date range is sent as month/day/year in the cd_min / cd_max fields.
        url = "https://www.google.com/search?q="+query+"&rlz=1C1CHBF_frFR863FR863&biw=1920&bih=880&sxsrf=APq-WBuYthkpiHNrhk_0YwH1w70zP27Xgg%3A1643812260630&source=lnt&tbs=cdr%3A1%2Ccd_min%3A"+lm+"%2F"+ld+"%2F"+ly+"%2Ccd_max%3A"+hm+"%2F"+hd+"%2F"+hy+"&tbm=nws&hl=en"
    else:
        url = "https://www.google.com/search?q="+query+"&rlz=1C1CHBF_frFR863FR863&biw=1920&bih=880&sxsrf=AOaemvI0XcPZB9YWw9GUVGwWTEXPDVqRxQ:1638967714934&source=lnms&tbm=nws&sa=X&ved=2ahUKEwjGlsnDntT0AhWTTcAKHeyuDk4Q_AUoAXoECAEQAw&hl=en"

    link_list = []
    date_list = []

    driver = webdriver.Chrome(service=Service(driver_path))
    try:
        driver.get(url)

        driver.find_element(By.XPATH, "//button[@class='VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc qfvgSe']").click() #accept google policy

        for i in range(num_pages):
            time.sleep(0.5)  # give the page a moment to render before reading it
            if i != 0:
                try:
                    driver.find_element(By.ID, "pnnext").click()
                except WebDriverException:
                    # No clickable "next" link: the last result page has been reached.
                    break

            soup = BeautifulSoup(driver.page_source, 'lxml')

            #Getting all g-card
            for g_card in soup.find_all("g-card"):
                a = g_card.find("a")
                spans = g_card.find_all("span")
                link = a.get('href') if a is not None else None
                if link is None or not spans:
                    # Skip malformed cards instead of aborting the whole scrape;
                    # link_list and date_list stay aligned because both are skipped.
                    continue

                # The last <span> of a card carries the date text; it is translated
                # from French to English before being parsed relative to `now`.
                translated_date = translator.translate(spans[-1].text)
                link_list.append(link)
                date_list.append(cal.parseDT(translated_date, now)[0].date())
    finally:
        # Always close the browser, even when scraping fails midway.
        driver.quit()

    print("Successfully scraped : ", len(link_list), " links")

    return link_list, date_list



In [15]:
#Now that we have the link list, store each link together with its date as one row.
def get_df_from_link_list(link_list, date_list):
    """Build a DataFrame with one row per scraped link.

    Fetching the article title and body is currently disabled, so only the
    link and its date are recorded.

    Parameters
    ----------
    link_list : list
        Article URLs.
    date_list : sequence
        Dates, looked up by position: ``date_list[i]`` belongs to ``link_list[i]``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Link`` and ``Date``, in that order. The columns exist even
        when ``link_list`` is empty, so selecting ``df['Date']`` never fails.
        A row whose date could not be looked up has a missing ``Date``.
    """
    data = []

    for i, link in enumerate(link_list):
        d = {"Link": link}

        try:
            d["Date"] = date_list[i]
        except Exception:
            # Best effort: keep the link even if its date is unavailable.
            # `Exception` (not a bare except) lets KeyboardInterrupt through.
            print('Could not scrap page number ' + str(i) + ', try again another time.')

        data.append(d)

    return pd.DataFrame(data, columns=["Link", "Date"])


def get_df_from_subject(subject,num_pages, date_limits = None):
    """Scrape search results for `subject` and return them as a DataFrame.

    Runs `get_lists_from_subject` with the given arguments and hands the two
    lists it returns (links, dates) to `get_df_from_link_list`.
    """
    scraped_lists = get_lists_from_subject(subject, num_pages, date_limits)
    links, dates = scraped_lists
    return get_df_from_link_list(links, dates)

def plot_articles_per_date(df_location):
    """Plot how many scraped articles fall on each calendar day.

    Parameters
    ----------
    df_location : pandas.DataFrame
        Must contain a ``Date`` column. Missing dates are ignored.

    Returns
    -------
    The figure produced by ``px.line`` with one point per day from the
    earliest to the latest date, both included. Days without any article
    are plotted with a count of 0. An empty chart is returned when the
    frame holds no dates.
    """
    title = "Number of scraped articles per date"

    # value_counts drops missing dates; the dict gives constant-time lookups below.
    counts_by_date = df_location['Date'].value_counts().to_dict()

    if not counts_by_date:
        df_temp = pd.DataFrame({"Date" : [], "Count": []})
        return px.line(df_temp, x= "Date", y = "Count", title = title)

    first_day = min(counts_by_date)
    last_day = max(counts_by_date)

    # +1 so the latest date is part of the axis; leaving it out dropped the
    # most recent day and crashed when all articles shared a single date.
    span_days = (last_day - first_day).days + 1
    dateList = [first_day + timedelta(days = offset) for offset in range(span_days)]
    countList = [counts_by_date.get(day, 0) for day in dateList]

    df_temp = pd.DataFrame({"Date" : dateList, "Count": countList})
    return px.line(df_temp, x= "Date", y = "Count", title = title)
    

In [16]:
# Scrape the news results one day at a time: 2021-01-01 first, then every day
# from date1 to date2.
date1 = '2021-01-02'
date2 = '2021-01-31'
mydates = pd.date_range(date1, date2).tolist()

min_date = datetime(2021,1,1)
max_date = datetime(2021,1,1)
subject = "wildfire"
num_pages = 100

# One (lower, upper) window per day, so the first day needs no separate call.
daily_limits = [(min_date, max_date)] + [(day, day) for day in mydates]

# Collect the per-day frames and concatenate once at the end instead of
# re-copying a growing frame on every iteration. If the cell is interrupted,
# the frames scraped so far are still available in `daily_frames`.
daily_frames = []
for date_limits in daily_limits:
    daily_frames.append(get_df_from_subject(subject,num_pages, date_limits))

df_location = pd.concat(daily_frames, ignore_index = True)
df_location

Successfully scraped :  0  links
Successfully scraped :  0  links


KeyboardInterrupt: 

In [None]:
# Build the per-date article count line chart from the scraped frame and display it.
fig = plot_articles_per_date(df_location)
fig.show()