In [1]:
import json

In [2]:
import base64
import trafilatura

In [3]:
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse
from urllib.parse import urlparse

from requests_html import HTML
from requests_html import HTMLSession
import pandas as pd

In [4]:
from htmldate import find_date

In [68]:
from bs4 import BeautifulSoup
import requests, urllib.parse
import lxml

def get_extracted_data_from_url(url):

    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
    }
    response = requests.get(url, headers=headers).text

    soup = BeautifulSoup(response, 'lxml')
    articles = []
    for container in soup.findAll('div', class_='tF2Cxc'):
        article = {}
        article['title'] = container.find('h3', class_='LC20lb DKV0Md').text
        try:
            article['datesum'] = container.find('span', class_='f').text
        except AttributeError:
            pass
        article['link'] = container.a['href']
        articles.append(article)

    return articles, soup.select_one('a#pnnext')


def scrape(query, next_page_node):
    
    if next_page_node is None:
        articles, next_page_node = get_extracted_data_from_url(
            'https://google.com/search?hl=en-US&q={}'.format(query))
        data = articles
    else:
        data = []
    
    for i in range(15):
        # scraping breaks beyond 15 pages at a time
        try:
            next_page_url = urllib.parse.urljoin('https://www.google.com', next_page_node['href'])
            articles, next_page_node = get_extracted_data_from_url(next_page_url)
            data = data+articles
        except TypeError:
            print('scraping stopped at {}th page'.format(i))
            break
    return data, next_page_node

In [37]:
def get_source(url):
    """Return the source code for the provided URL. 

    Args: 
        url (string): URL of the page to scrape.

    Returns:
        response (object): HTTP response object from requests_html. 
    """

    try:
        session = HTMLSession()
        response = session.get(url)
        return response

    except requests.exceptions.RequestException as e:
        print(e)

In [38]:
# this seems to work for all, which is nice but need to get metadata as well

def beautifulsoup_extract_text_fallback(response_content):
    
    '''
    This is a fallback function, so that we can always return a value for text content.
    Even for when both Trafilatura and BeautifulSoup are unable to extract the text from a 
    single URL.
    '''
    
    # Create the beautifulsoup object:
    soup = BeautifulSoup(response_content, 'html.parser')
    
    # Finding the text:
    text = soup.find_all(text=True)
    
    # Remove unwanted tag elements:
    cleaned_text = ''
    blacklist = [
        '[document]',
        'noscript',
        'header',
        'html',
        'meta',
        'head', 
        'input',
        'script',
        'style',]

    # Then we will loop over every item in the extract text and make sure that the beautifulsoup4 tag
    # is NOT in the blacklist
    for item in text:
        if item.parent.name not in blacklist:
            cleaned_text += '{} '.format(item)
            
    # Remove any tab separation and strip the text:
    cleaned_text = cleaned_text.replace('\t', '')
    return cleaned_text.strip()
    

def extract_text_from_single_web_page(url):
    
    downloaded_url = trafilatura.fetch_url(url)
    try:
        a = trafilatura.extract(downloaded_url, output_format='json', with_metadata=True, include_comments = False,
                            date_extraction_params={'extensive_search': True, 'original_date': True})
    except AttributeError:
        a = trafilatura.extract(downloaded_url, output_format='json', with_metadata=True,
                            date_extraction_params={'extensive_search': True, 'original_date': True})
    if a:
        json_output = json.loads(a)
        return json_output['text']
        #return json_output
    else:
        try:
            resp = requests.get(url)
            # We will only extract the text from successful requests:
            if resp.status_code == 200:
                return beautifulsoup_extract_text_fallback(resp.content)
            else:
                # This line will handle for any failures in both the Trafilature and BeautifulSoup4 functions:
                return np.nan
        # Handling for any URLs that don't have the correct protocol
        except MissingSchema:
            return np.nan

In [135]:
def get_text_frm_url(x):
    #ans = get_source(x)
    #page_html = ans.content.decode("utf-8")
    #soup = BeautifulSoup(page_html, 'html.parser')
    try:
        text = extract_text_from_single_web_page(url=x)
    except e:
        text = None
        print(x)
        print(e)
    return text

In [40]:
def get_html_date(x):
    try:
        date = find_date(x)
        return date
    except:
        return None

In [41]:
#if its an important media cite it is likely to return these
def get_more_meta(df):
    
    soup = BeautifulSoup(df['link'])
    try:
        data = json.loads(soup.find('script', type='application/ld+json').text)
        datePublished = data['datePublished']
        publisher = data['publisher']
        author = data['author']
        
    except:
        datePublished = 'unknown'
        publisher = 'unknown'
        author = 'unknown'
    df['datePublished'] = datePublished
    df['publisher'] = publisher
    df['author'] = author
    
    return df

# Get Google Search URLs

We dont want to scrape too many at the same time, so what we will do is 10 pages at a time and process them inbetween (fetch the article content) before we scrape google again for the next 10 pages

In [88]:
scraped, next_page_node = scrape('Benin Bronzes Restitution', None)

In [96]:
scraped2, next_page_node = scrape('Benin Bronzes Restitution', next_page_node)

In [97]:
scraped = scraped + scraped2

In [81]:
df = pd.DataFrame(scraped)

## Check for duplicate article links in the scrape and remove them

In [82]:
df.shape

(1760, 2)

In [83]:
deduped = df.drop_duplicates()

In [84]:
deduped.shape

(172, 2)

We scraped lots of pages but a lot of the search results were the same article. If it were not the same article, the google search title and link to article would not be the same

In [87]:
#write to file so we dont lose them
deduped.to_csv("bb_google_titlelink172.csv")

# Get Article content and dates and extra info if possible

So when I tried to run the 140 result df, it broke and I gor nothing. So I am being extra careful running everything piece by piece so that it will work. First I divided the df into 9 bits (groups of 20). The first run did not take long at all for the zero df, will see how the others go

In [101]:
import numpy as np

In [48]:
df = deduped

In [103]:
n = 20
dfs = []
for g, df in deduped.groupby(np.arange(len(deduped)) // n):
    dfs.append(df)

In [113]:
len(dfs)

9

In [136]:
zero = dfs[6]
zero['text'] = zero['link'].apply(lambda x: get_text_frm_url(x))
zero['html_date'] = zero['link'].apply(lambda x: get_html_date(x))

In [138]:
one = dfs[7]
one['text'] = one['link'].apply(lambda x: get_text_frm_url(x))
one['html_date'] = one['link'].apply(lambda x: get_html_date(x))

In [139]:
two = dfs[8]
two['text'] = two['link'].apply(lambda x: get_text_frm_url(x))
two['html_date'] = two['link'].apply(lambda x: get_html_date(x))

In [141]:
final_df = pd.concat([zero, one, two])

In [144]:
final_df.to_csv('bb_google_content172.csv')

In [145]:
final_df

Unnamed: 0,title,link,text,date,html_date
0,Benin Bronzes: Germany to return looted artefa...,https://www.bbc.com/news/world-africa-56949003,Benin Bronzes: Germany to return looted artefa...,2021-04-30,2021-04-30
1,Fate of looted Benin Bronzes intensifies debat...,https://www.ft.com/content/ee754d39-e171-4c36-...,Subscribe to read | Financial Times Accessibil...,2021-03-26,2021-03-26
2,Nigeria welcomes Germany′s decision to return ...,https://www.dw.com/en/nigeria-welcomes-germany...,German cultural and political leaders have rea...,2017-07-11,2017-07-11
3,Effort to Return Benin Bronzes to Africa Remai...,https://www.voanews.com/europe/effort-return-b...,LONDON - The promise to return several Benin B...,2020-01-17,2020-01-17
4,Germany first to hand back Benin bronzes loote...,https://www.theguardian.com/world/2021/apr/30/...,Germany is to become the first country to hand...,2021-04-30,2021-04-30
...,...,...,...,...,...
1135,Review: 'Loot' Tells of the Benin Bronzes from...,https://www.artnews.com/art-news/news/loot-bar...,"In 1974, the Nigerian government asked the Bri...",,2021-05-03
1171,The Benin Bronzes: Towards the Resolution of a...,https://morningside-alliance.org/event/the-ben...,,,
1173,The week in art news – staff accuse Barbican C...,https://www.apollo-magazine.com/art-news-barbi...,Current and former staff at the Barbican Centr...,,2021-06-11
1178,Columbia University Italian Academy: The Benin...,https://medren.columbia.edu/news/columbia-univ...,The Benin Bronzes were looted in 1897 from the...,,2021-04-09


# Clean Article text and load to dataframe

In [4]:
import nltk
import pandas as pd
import CleanTweets as ct

In [9]:
df = pd.read_csv('bb_google_content172.csv', index_col=0)

In [12]:
df.head()

Unnamed: 0,title,link,text,date,html_date
0,Benin Bronzes: Germany to return looted artefa...,https://www.bbc.com/news/world-africa-56949003,Benin Bronzes: Germany to return looted artefa...,2021-04-30,2021-04-30
1,Fate of looted Benin Bronzes intensifies debat...,https://www.ft.com/content/ee754d39-e171-4c36-...,Subscribe to read | Financial Times Accessibil...,2021-03-26,2021-03-26
2,Nigeria welcomes Germany′s decision to return ...,https://www.dw.com/en/nigeria-welcomes-germany...,German cultural and political leaders have rea...,2017-07-11,2017-07-11
3,Effort to Return Benin Bronzes to Africa Remai...,https://www.voanews.com/europe/effort-return-b...,LONDON - The promise to return several Benin B...,2020-01-17,2020-01-17
4,Germany first to hand back Benin bronzes loote...,https://www.theguardian.com/world/2021/apr/30/...,Germany is to become the first country to hand...,2021-04-30,2021-04-30


In [15]:
df = ct.clean_tweets(df,'text','lemma_text',
                general_clean=True,lemma=True,stem=False,remove_tag=False,remove_mention=False,
                remove_emoji=False, remove_stopword=True,min_length=1, untokenized_return=True)

time taken to clean tweets: 20.657857179641724s. Use the [lemma_text] column to perform your analysis/modeling on


In [16]:
df = ct.clean_tweets(df,'text','stem_text',
                general_clean=True,lemma=False,stem=True,remove_tag=False,remove_mention=False,
                remove_emoji=False, remove_stopword=True,min_length=1, untokenized_return=True)

time taken to clean tweets: 3.1677749156951904s. Use the [stem_text] column to perform your analysis/modeling on


In [18]:
df.to_csv('bb_google_content172_cleaned.csv')

# Playground

In [87]:
def ordinary(soup):

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.body.get_text(separator=' ')

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    return text

In [131]:
def bcc_get_p(soup):
    # for BBC articles
    # retrieve all of the paragraph tags
    #paragraphs = soup.find('article').find_all('p') 
    paragraphs = soup.find_all('p') 
    text = ' '.join(paragraph.text for paragraph in paragraphs)
    headings = soup.find_all('h1')
    print(headings)
    text2 = ' '.join(heading.text for heading in headings)
    full_text = text + text2
    return full_text

In [120]:
# BCC and CNN
article = {}
article['publisher'] = data['publisher']
article['author'] = data['author']
article['url'] = data['url']
article['datePublished'] = data['datePublished']
article['text'] = ordinary(soup)

In [142]:
bcc_get_p(soup)

[<h1 class="page-header__title"><span>Effort to Return Benin Bronzes to Africa Remains Ongoing Challenge</span>
</h1>]


" LONDON - The promise to return several Benin Bronzes from three Western institutions to the former Kingdom of Benin in Nigeria was celebrated by many. But returning all the artifacts looted by British soldiers 125 years ago will continue to be a challenge. In recent weeks, a university in Scotland and museums in Germany and Britain pledged to repatriate the Benin Bronzes they own.\xa0\xa0\xa0 The restitution is hugely symbolic to Timothy Awoyemi, a British-born Nigerian who helped repatriate two Benin Bronzes in 2014 from a private collector whose grandfather had been part of the 19th century looting. Awoyemi says he was elated when he heard about the latest returns.\xa0 “They stole it so it makes me happy, because the stolen artifacts are going to be returned back to where they rightly belong,” he said.\xa0\xa0 British soldiers looted the Kingdom of Benin in what is currently Edo state in Nigeria, during a punitive military expedition in 1897. The high valued plaques, masks and scul

In [143]:
ordinary(soup)

"Skip to main content\nOpen main navigation\nLive TV\nFull Schedule\nAll Programs\nLive Radio\nFull Schedule\nAll Programs\nUnited States\nUS Politics\nImmigration\nAll About America\nScience & Health\nSilicon Valley & Technology\nWorld\nAfrica\nThe Americas\nEast Asia Pacific\nEurope\nExtremism Watch\nMiddle East\nSouth & Central Asia\nVOA News on China\nVOA News on Iran\nSections\nArts & Culture\nDay in Photos\nEconomy & Business\nPress Freedom\nPlugged in with Greta Van Susteren\nVOA StudentU\nVOA Connect\nRefugees\nFEATURED\nUS News\nPress Freedom\nVOA News on Iran\nCOVID-19 Pandemic\nSearch\nSearch VOA News\nSearch\nLang\nLive TV\nLive TV\nFull Schedule\nAll Programs\nLive Radio\nLive Radio\nFull Schedule\nAll Programs\nEnglish\nEnglish\nvoanews.com\nLearning English\nlearningenglish.voanews.com\nEastern & Central Europe\nShqip\nzeriamerikes.com\nBosanski\nba.voanews.com\nΕλληνικά\ngr.voanews.com\nМакедонски\nmk.voanews.com\nSrpski\nglasamerike.net\nУкраїнська\nukrainian.voanews.c

In [112]:
article

{'publisher': {'@type': 'Organization',
  'name': 'CNN',
  'logo': {'@type': 'ImageObject',
   'url': 'https://dynaimage.cdn.cnn.com/cnn/q_auto,h_60/https%3A%2F%2Fi2.cdn.turner.com%2Fcnn%2F2017%2Fimages%2F09%2F14%2Flogo-cnnstyle.png'}},
 'author': {'@type': 'Person', 'name': 'Kieron Monks, CNN'},
 'url': 'https://www.cnn.com/style/article/benin-bronzes-germany-restitution/index.html',
 'datePublished': '2021-04-30T03:03:12Z',
 'text': "Skip to main content\nOpen main navigation\nLive TV\nFull Schedule\nAll Programs\nLive Radio\nFull Schedule\nAll Programs\nUnited States\nUS Politics\nImmigration\nAll About America\nScience & Health\nSilicon Valley & Technology\nWorld\nAfrica\nThe Americas\nEast Asia Pacific\nEurope\nExtremism Watch\nMiddle East\nSouth & Central Asia\nVOA News on China\nVOA News on Iran\nSections\nArts & Culture\nDay in Photos\nEconomy & Business\nPress Freedom\nPlugged in with Greta Van Susteren\nVOA StudentU\nVOA Connect\nRefugees\nFEATURED\nUS News\nPress Freedom\nVO

My update:
    