## This notebook is for the testing of the notions from the book **Web Scraping with Python, 2nd Edition** by **Ryan Mitchell**

# Chapter 1: Your first web scrawler

In [None]:
# Importing necessary libraries for web scraping
from urllib.request import urlopen
from urllib.parse import urlparse
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup
import re
import datetime
import random
import requests

In [None]:
# Fetching and printing the HTML content of a webpage
html = urlopen('http://pythonscraping.com/pages/page1.html')
print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


In [None]:
# Parsing HTML content using BeautifulSoup
html = urlopen('http://www.pythonscraping.com/pages/page1.html')
bs = BeautifulSoup(html.read(), 'html.parser')
print(bs.h1)

<h1>An Interesting Title</h1>


In [None]:
# Function to fetch HTML content with error handling
def fetch_html(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        raise HTTPError('The server could not fulfill the request!')
    except URLError as e:
        raise URLError('The server could not be found!')
    else:
        return html
    
# Fetching and parsing HTML content
html = fetch_html('http://www.pythonscraping.com/pages/page1.html')
bs = BeautifulSoup(html.read(), 'html.parser')
print(bs.h1)

<h1>An Interesting Title</h1>


In [None]:
# Handling AttributeError when accessing non-existent tags
try:
    bs.nonExistentTag.anotherTag
except AttributeError as e:
    print('Tag was not found')
else:
    print('Tag was found')

# Beneficial for tag fetching case by case

Tag was not found


  bs.nonExistentTag.anotherTag


# Chapter 2: Advanced HTML Parsing

In [None]:
# Using findAll() method to extract specific elements
html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')
bs = BeautifulSoup(html.read(), 'html.parser')
nameList = bs.findAll('span', {'class':'green'})
for name in nameList:
    print(name.get_text())

print('-------------------')
print(bs.span)

Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna
-------------------
<span class="red">Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don't tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by
that Antichrist- I really believe he is Antichrist- I will have
nothing more to do with you and you are no longer my friend, no longer
my 'faithful slave,' as you call yourself! But how 

  nameList = bs.findAll('span', {'class':'green'})


In [None]:
# Navigating the tree hierarchy of HTML elements
html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html.read(), 'html.parser')
for child in bs.find('table', {'id':'giftList'}).children:
    print(child)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


In [None]:
# Navigating sibling elements in the HTML tree
# Finding the first <tr> element and iterating over its next siblings
html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html.read(), 'html.parser')
for sibling in bs.find('table', {'id':'giftList'}).tr.next_siblings:
    print(sibling)



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parr

In [None]:
# Navigating parent and sibling elements
# Finding an image and navigating to its parent and sibling elements
html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'html.parser')
print(bs.find('img',{'src':'../img/gifts/img1.jpg'}).parent.previous_sibling.get_text())


$15.00



## Regular expressions

In [None]:
# Using regular expressions with BeautifulSoup
html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'html.parser')
images = bs.find_all('img',
{'src':re.compile('\.\.\/img\/gifts/img.*\.jpg')})
for image in images: 
    print(image['src'])

../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg


## Other useful regular expressions:
- **email**: [A-Za-z0-9\._+]+@[A-Za-z]+\.(com|org|edu|net)

## Lambda expression

Example:  bs.find_all(lambda tag: len(tag.attrs) == 2)

# Chapter 3: Writing Web Crawlers

In [12]:
# The first step is to retrieve all the links in a page

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')
for link in bs.find_all('a'):
    if 'href' in link.attrs:
        print(link.attrs['href']) 

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Special:SpecialPages
/wiki/Main_Page
/wiki/Special:Search
https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=Kevin+Bacon
/w/index.php?title=Special:UserLogin&returnto=Kevin+Bacon
https://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en
/w/index.php?title=Special:CreateAccount&returnto=Kevin+Bacon
/w/index.php?title=Special:UserLogin&returnto=Kevin+Bacon
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
#
#Early_life_and_education
#Acting_career
#Early_work
#1980s
#1990s
#2000s
#2010s
#Other_ventu

In [None]:
# In order to determine if a link is an article link or not we need to define 
# the specifics of the link then use regular/lambda expressions to collect it

html = urlopen('http://en.wikipedia.org/wiki/Kevin_Bacon')
bs = BeautifulSoup(html, 'html.parser')

# The article links are in the div with the id bodyContent -- find('div', {'id':'bodyContent'})
# The links are in the href attribute of the a tag -- find_all('a', href=re.compile())
# The link doesn't contain a colon -- ((?!:).)*
# The links start with /wiki/ -- ^(/wiki/)
for link in bs.find('div', {'id':'bodyContent'}).find_all('a', href=re.compile('^(/wiki/)((?!:).)*$')):
    if 'href' in link.attrs:
        print(link.attrs['href'])

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Philadelphia
/wiki/Kevin_Bacon_filmography
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Leading_man
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/National_Lampoon%27s_Animal_House
/wiki/Diner_(1982_film)
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Frost/Nixon_(film)
/wiki/Friday_the_13th_(1980_film)
/wiki/Tremors_(1990_film)
/wiki/The_River_Wild
/wiki/Balto_(film)
/wiki/The_Woodsman_(2004_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Patriots_Day_(film)
/wiki/Losing_Chase
/wiki/Loverboy_(2005_film)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Miniseries_or_Television_Film
/wiki/Screen_Actors_Guild_Award_for_Outstanding_Performance_by_a_Male_Actor_in_a_Miniseries_or_Television_Movie
/wiki/Michael_Strobl
/wiki/HBO
/wiki/Taking_Chance
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/Amazon_P

In [14]:
# Now to be more robust we need to add a function that will collect all the links in a page
def getLinks(articleUrl):
    html = urlopen('http://en.wikipedia.org{}'.format(articleUrl))
    bs = BeautifulSoup(html, 'html.parser')
    return bs.find('div', {'id':'bodyContent'}).find_all('a',href=re.compile('^(/wiki/)((?!:).)*$'))

In [15]:
# Setting up the random walk
random.seed(datetime.datetime.now().timestamp())

# Collecting the links from the Kevin Bacon page
links = getLinks('/wiki/Kevin_Bacon')

# Iterating over the links and printing them
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs['href']
    print(newArticle)
    links = getLinks(newArticle)

/wiki/Fox_Broadcasting_Company
/wiki/We_Are_Family_(TV_series)
/wiki/The_Shirelles
/wiki/RCA_Records
/wiki/The_Orchard_(company)
/wiki/RCA_Red_Seal_Records
/wiki/Opera


KeyboardInterrupt: 

In [16]:
# To avoid entering a loop of links we can store all the accessed link in a set
pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('')

/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
/wiki/Help:Contents
/wiki/Help:Introduction


KeyboardInterrupt: 

## Getting every element together and working the first crawler

In [17]:
pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen('http://en.wikipedia.org{}'.format(pageUrl))
    bs = BeautifulSoup(html, 'html.parser')
    try:
        print(bs.h1.get_text())
        print(bs.find(id ='mw-content-text').find_all('p')[0])
        print(bs.find(id='ca-edit').find('span')
             .find('a').attrs['href'])
    except AttributeError:
        print('This page is missing something! Continuing.')
    
    for link in bs.find_all('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                #We have encountered a new page
                newPage = link.attrs['href']
                print('-'*20)
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks('') 

Main Page
<p><i><b><a href="/wiki/What_a_Merry-Go-Round" title="What a Merry-Go-Round">What a Merry-Go-Round</a></b></i> is the eighteenth collection by British fashion designer <a href="/wiki/Alexander_McQueen" title="Alexander McQueen">Alexander McQueen</a>, made for the Autumn/Winter 2001 season of his <a href="/wiki/Alexander_McQueen_(fashion_house)" title="Alexander McQueen (fashion house)">eponymous fashion house</a>. The collection drew on imagery of clowns and carnivals, inspired by McQueen's feelings about childhood and his experiences in the fashion industry. The designs were influenced by <a href="/wiki/List_of_chics#Military_chic" title="List of chics">military chic</a>, cinema such as <i><a href="/wiki/Nosferatu" title="Nosferatu">Nosferatu</a></i> (1922) and <i><a href="/wiki/Cabaret_(1972_film)" title="Cabaret (1972 film)">Cabaret</a></i> (1972), 1920s <a href="/wiki/Flapper" title="Flapper">flapper fashion</a> and the <a href="/wiki/French_Revolution" title="French Revo

KeyboardInterrupt: 

In [18]:
# To handle redirects: r = requests.get('http://github.com', allow_redirects=True)

#Retrieves a list of all Internal links found on a page
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme,
        urlparse(includeUrl).netloc)
    internalLinks = []
    #Finds all links that begin with a "/"
    for link in bs.find_all('a',
        href=re.compile('^(/|.*'+includeUrl+')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if(link.attrs['href'].startswith('/')):
                    internalLinks.append(
                        includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

In [19]:
#Retrieves a list of all external links found on a page
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    #Finds all links that start with "http" that do
    #not contain the current URL
    for link in bs.find_all('a',
        href=re.compile('^(http|www)((?!'+excludeUrl+').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

In [20]:
def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bs = BeautifulSoup(html, 'html.parser')
    externalLinks = getExternalLinks(bs,
        urlparse(startingPage).netloc)
    if len(externalLinks) == 0:
        print('No external links, looking around the site for one')
        domain = '{}://{}'.format(urlparse(startingPage).scheme,
            urlparse(startingPage).netloc)
        internalLinks = getInternalLinks(bs, domain)
        return getRandomExternalLink(internalLinks[random.randint(0,
                                    len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0, len(externalLinks)-1)]

In [21]:
def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    print('Random external link is: {}'.format(externalLink))
    followExternalOnly(externalLink)

## Those function create the foundation for any crawler application and strategy

In [22]:
 # Collects a list of all external URLs found on the site
allExtLinks = set()
allIntLinks = set()
def getAllExternalLinks(siteUrl):
    html = urlopen(siteUrl)
    domain = '{}://{}'.format(urlparse(siteUrl).scheme,
        urlparse(siteUrl).netloc)
    bs = BeautifulSoup(html, 'html.parser')
    internalLinks = getInternalLinks(bs, domain)
    externalLinks = getExternalLinks(bs, domain)
    for link in externalLinks:
        if link not in allExtLinks:
            allExtLinks.add(link)
            print(link)
    for link in internalLinks:
        if link not in allIntLinks:
            allIntLinks.add(link)
            getAllExternalLinks(link)
allIntLinks.add('http://oreilly.com')
getAllExternalLinks('http://oreilly.com')

https://www.oreilly.com
https://www.oreilly.com/member/login/
https://www.oreilly.com/online-learning/try-now.html
https://www.oreilly.com/online-learning/teams.html
https://www.oreilly.com/online-learning/government.html
https://www.oreilly.com/online-learning/academic.html
https://www.oreilly.com/online-learning/individuals.html
https://www.oreilly.com/online-learning/features.html
https://www.oreilly.com/online-learning/courses.html
https://www.oreilly.com/online-learning/feature-certification.html
https://www.oreilly.com/online-learning/intro-interactive-learning.html
https://www.oreilly.com/online-learning/live-events.html
https://www.oreilly.com/online-learning/feature-answers.html
https://www.oreilly.com/online-learning/insights-dashboard.html
https://www.oreilly.com/online-learning/pricing.html
https://www.oreilly.com/radar/
https://www.oreilly.com/content-marketing-solutions.html
https://www.oreilly.com/diversity/scholarship-program.html
https://learning.oreilly.com/start-tria

KeyboardInterrupt: 

# Chapter 4:  Web Crawling Models

In [37]:
class Content:
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body
def getPage(url):
    req = requests.get(url)
    return BeautifulSoup(req.text, 'html.parser')
def scrapeNYTimes(url):
    bs = getPage(url)
    title = bs.find("title").get_text()
    lines = bs.find_all("p", {"class":"story-content"})
    body = '\n'.join([line.get_text() for line in lines])
    return Content(url, title, body)
def scrapeBrookings(url):
    bs = getPage(url)
    title = bs.find("h1").get_text()
    body = bs.find("div",{"class","article-meta"}).get_text()
    return Content(url, title, body)

In [32]:
url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'
content = scrapeBrookings(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

Title: Delivering inclusive urban access: 3 uncomfortable truths
URL: https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/




Jeffrey Gutman and                












Jeffrey Gutman


Former Nonresident Fellow, Global Economy and Development 






Adie Tomer 













Adie Tomer



Senior Fellow 
- Brookings Metro











January 26, 2018




In [38]:
url = 'https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html'
content = scrapeNYTimes(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

Title: nytimes.com
URL: https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html




In [40]:
class Content:
    """
    Common base class for all articles/pages
    """
    
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body
    def print(self):
        """
        Flexible printing function controls output
        """
        print("URL: {}".format(self.url))
        print("TITLE: {}".format(self.title))
        print("BODY:\n{}".format(self.body))
        
class Website:
    """ 
    Contains information about website structure
    """
    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [39]:
class Crawler:
    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None        
        return BeautifulSoup(req.text, 'html.parser')
    def safeGet(self, pageObj, selector):
        """
        Utility function used to get a content string from a
        Beautiful Soup object and a selector. Returns an empty
        string if no object is found for the given selector
        """
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join(
            [elem.get_text() for elem in selectedElems])
        return ''
    def parse(self, site, url):
        """
        Extract content from a given page URL
        """
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()


In [42]:
crawler = Crawler()
siteData = [
['O\'Reilly Media', 'http://oreilly.com',
'h1', 'section#product-description'],
['Reuters', 'http://reuters.com', 'h1',
'div.StandardArticleBody_body_1gnLA'],
['Brookings', 'http://www.brookings.edu',
'h1', 'div.post-body'],
['New York Times', 'http://nytimes.com',
'h1', 'p.story-content']
]

websites = []

for row in siteData:
    websites.append(Website(row[0], row[1], row[2], row[3]))
    
crawler.parse(websites[0], 'http://shop.oreilly.com/product/'\
'0636920028154.do')
crawler.parse(websites[1], 'http://www.reuters.com/article/'\
'us-usa-epa-pruitt-idUSKBN19W2D0')
crawler.parse(websites[2], 'https://www.brookings.edu/blog/'\
'techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')
crawler.parse(websites[3], 'https://www.nytimes.com/2018/01/'\
'28/business/energy-environment/oil-boom.html')