In [4]:
import requests
from bs4 import BeautifulSoup

In [2]:
def getPage(url):
    """
    Utilty function used to get a Beautiful Soup object from a given URL
    """

    session = requests.Session()
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
               "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}
    try:
        req = session.get(url, headers=headers)
    except requests.exceptions.RequestException:
        return None
    bs = BeautifulSoup(req.text, "html.parser")
    return bs

## Dealing with different website layouts

In [37]:
import requests


class Content:
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body


def getPage(url):
    req = requests.get(url)
    return BeautifulSoup(req.text, 'html.parser')


def scrapeNYTimes(url):
    bs = getPage(url)
    title = bs.find("h1").text
    lines = bs.find_all("p", {"class": "story-content"})
    body = '\n'.join([line.text for line in lines])
    return Content(url, title, body)


def scrapeBrookings(url):
    bs = getPage(url)
    title = bs.find("h1").text
    body = bs.find("div", {"class", "post-body"}).text
    return Content(url, title, body)


url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'
content = scrapeBrookings(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

url = 'https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html'
content = scrapeNYTimes(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

Title: Delivering inclusive urban access: 3 uncomfortable truths
URL: https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/


The past few decades have been filled with a deep optimism about the role of cities and suburbs across the world. These engines of economic growth host a majority of world population, are major drivers of economic innovation, and have created pathways to opportunities for untold amounts of people.	
Authors






Jeffrey Gutman
Senior Fellow - Global Economy and Development







Adie Tomer
Fellow - Metropolitan Policy Program

 Twitter
AdieTomer






But all is not well within our so-called Urban Century. Rapid urbanization, rising gentrification, concentrated poverty, and shortages of basic infrastructure have combined to create spatial inequity in cities and suburbs across the globe. The challenges of housing, moving, and employing so many people have led to longer travel times, rising housing 

Title: The Men Who Want to Live Forever
URL: https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html

Would you like to live forever? Some billionaires, already invincible in every other way, have decided that they also deserve not to die. Today several biotech companies, fueled by Silicon Valley fortunes, are devoted to “life extension” — or as some put it, to solving “the problem of death.”
It’s a cause championed by the tech billionaire Peter Thiel, the TED Talk darling Aubrey de Gray, Google’s billion-dollar Calico longevity lab and investment by Amazon’s Jeff Bezos. The National Academy of Medicine, an independent group, recently dedicated funding to “end aging forever.”
As the longevity entrepreneur Arram Sabeti told The New Yorker: “The proposition that we can live forever is obvious. It doesn’t violate the laws of physics, so we can achieve it.” Of all the slightly creepy aspects to this trend, the strangest is the least noticed: The people publicly ch

In [40]:
class Content:
    """
    Common base class for all articles/pages
    """

    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        """
        Flexible printing function controls output
        """
        print("URL: {}".format(self.url))
        print("TITLE: {}".format(self.title))
        print("BODY:\n{}".format(self.body))


class Website:
    """ 
    Contains information about website structure
    """

    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [41]:
import requests
from bs4 import BeautifulSoup


class Crawler:

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """
        Utilty function used to get a content string from a Beautiful Soup
        object and a selector. Returns an empty string if no object
        is found for the given selector
        """
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, site, url):
        """
        Extract content from a given page URL
        """
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

In [42]:
crawler = Crawler()

siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'h1', 'div.post-body'],
    ['New York Times', 'http://nytimes.com', 'h1', 'p.story-content']
]
websites = []
for row in siteData:
    websites.append(Website(row[0], row[1], row[2], row[3]))

crawler.parse(websites[0], 'http://shop.oreilly.com/product/0636920028154.do')
crawler.parse(
    websites[1], 'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')
crawler.parse(
    websites[2],
    'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')
crawler.parse(
    websites[3], 
    'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html')

title is:
Learning Python, 5th Edition 
Body is:

Get a comprehensive, in-depth introduction to the core Python language with this hands-on book. Based on author Mark Lutz’s popular training course, this updated fifth edition will help you quickly write efficient, high-quality code with Python. It’s an ideal way to begin, whether you’re new to programming or a professional developer versed in other languages. 

Complete with quizzes, exercises, and helpful illustrations,  this easy-to-follow, self-paced tutorial gets you started with both Python 2.7 and 3.3— the latest releases in the 3.X  and 2.X lines—plus all other releases in common use today. You’ll also learn some advanced language features that recently have become more common in Python code.

Explore Python’s major built-in object types such as numbers, lists, and dictionaries 
Create and process objects with Python statements, and learn Python’s general syntax model
Use functions to avoid code redundancy and package code for r

title is:
Idea to Retire: Old methods of policy education
Idea to Retire: Old methods of policy education
Body is:

Public policy and public affairs schools aim to train competent creators and implementers of government policy. While drawing on the principles that gird our economic and political systems to provide a well-rounded education, like law schools and business schools, policy schools provide professional training. They are quite distinct from graduate programs in political science or economics which aim to train the next generation of academics. As professional training programs, they add value by imparting both the skills which are relevant to current employers, and skills which we know will be relevant as organizations and societies evolve. 
The relevance of the skills that policy programs impart to address problems of today and tomorrow bears further discussion. We are living through an era in which societies are increasingly interconnected. The wide-scale adoption of devic

title is:
Oil Boom Gives the U.S. a New Edge in Energy and Diplomacy
Body is:
HOUSTON — A substantial rise in oil prices in recent months has led to a resurgence in American oil production, enabling the country to challenge the dominance of Saudi Arabia and dampen price pressures at the pump.
The success has come in the face of efforts by Saudi Arabia and its oil allies to undercut the shale drilling spree in the United States. Those strategies backfired and ultimately ended up benefiting the oil industry.
Overcoming three years of slumping prices proved the resiliency of the shale boom. Energy companies and their financial backers were able to weather market turmoil — and the maneuvers of the global oil cartel — by adjusting exploration and extraction techniques.
After a painful shakeout in the industry that included scores of bankruptcies and a significant loss of jobs, a steadier shale-drilling industry is arising, anchored by better-financed companies.
With the price of West Texas 

## Crawling through sites with search

In [43]:
class Content:
    """Common base class for all articles/pages"""

    def __init__(self, topic, url, title, body):
        self.topic = topic
        self.title = title
        self.body = body
        self.url = url

    def print(self):
        """
        Flexible printing function controls output
        """
        print("New article found for topic: {}".format(self.topic))
        print("URL: {}".format(self.url))
        print("TITLE: {}".format(self.title))
        print("BODY:\n{}".format(self.body))

In [44]:
class Website:
    """Contains information about website structure"""

    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.searchUrl = searchUrl
        self.resultListing = resultListing
        self.resultUrl = resultUrl
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [48]:
import requests
from bs4 import BeautifulSoup


class Crawler:

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            return childObj[0].get_text()
        return ""

    def search(self, topic, site):
        """
        Searches a given website for a given topic and records all pages found
        """
        bs = self.getPage(site.searchUrl + topic)
        searchResults = bs.select(site.resultListing)
        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs["href"]
            # Check to see whether it's a relative or an absolute URL
            if(site.absoluteUrl):
                bs = self.getPage(url)
            else:
                bs = self.getPage(site.url + url)
            if bs is None:
                print("Something was wrong with that page or URL. Skipping!")
                return
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(topic, title, body, url)
                content.print()


crawler = Crawler()

siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=',
        'article.product-result', 'p.title a', True, 'h1', 'section#product-description'],
    ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content',
        'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body_1gnLA'],
    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',
        'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body']
]
sites = []
for row in siteData:
    sites.append(Website(row[0], row[1], row[2],
                         row[3], row[4], row[5], row[6], row[7]))

topics = ['python', 'data science']
for topic in topics:
    print("GETTING INFO ABOUT: " + topic)
    for targetSite in sites:
        crawler.search(topic, targetSite)

GETTING INFO ABOUT: python
New article found for topic: python
TITLE: JAKARTA (Reuters) - An Indonesian farmer has been found dead inside a 7-meter (23-foot) python after being reported missing on his failure to return home from work on a palm plantation. Village officials cut open the swollen body of the snake in a graphic video taken by a resident of Mamuju, on the eastern island of Sulawesi, the site of the incident. The victim’s legs,  encased in rubber boots, emerge as the snake is pulled apart. Family members and neighbors of the 26-year-old victim, Akbar, had launched a search when he failed to come home for more than 24 hours, a resident of the area told Reuters. A 7-metre (23-foot) python is being cut open to reveal a young Indonesian man in the village of Salobiru, in a remote part of the West Sulawesi province, Indonesia in this still image taken from video on March 26, 2017. Courtesy of Andi Fathir/via REUTERS TV “We saw a python that couldn’t move properly and it’s belly w

New article found for topic: python
TITLE: Forest department officials on Saturday (September 24) promised strict action after a python attacked a man when a group was trying to click a photo with it. The incident took place on Friday (September 23) after forest department officials removed the python from the premises of a school in northwestern Rajasthan state. They were posing for pictures with it after successfully capturing it. Deputy Conservator of Forests, K.G.Shrivastav, said the incident was unexpected as all officers were highly trained. “I wanted the snake to be released in my presence, but the officials released it before I could reach the spot. I had no clue that civilians were also present at the spot and something could go wrong as all the officials are highly trained. Clicking selfies with the snake and releasing the photos publicly is an offense under the Wildlife Act and we will issue notice to all involved in this incident,” Shrivastav said. The victim, Ashok Bishnoi

New article found for topic: python
TITLE: MIAMI (Reuters) - Engineers in the Everglades stumbled upon a near-record-breaking Burmese python measuring more than 18 feet long during a routine inspection of levees on Tuesday, a water management district spokesman said. The snake, measuring at 18 feet 2 inches, fell short of the state record by 6 inches, according to the Florida Fish and Wildlife Conservation Commission. Last year, a snake collector in the state discovered the largest python on record there, measuring 18 feet 8 inches, commission spokeswoman Katie Johnson said. The pythons, which can grow to more than 20 feet in their native habitat in Southeast Asia, are one of the most problematic invaders of Florida’s sprawling Everglades wetlands. A near record-breaking Burmese Python measuring more than 18-feet long (5.5 meters) is shown in this January 4, 2014 handout photo provided by South Florida Water Management District January 5, 2014 in Everglades National Park near Miami, Fl

KeyboardInterrupt: 

## Crawling Sites through Links

In [68]:
class Website:

    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.targetPattern = targetPattern
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag


class Content:

    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        print("URL: {}".format(self.url))
        print("TITLE: {}".format(self.title))
        print("BODY:\n{}".format(self.body))

In [69]:
import re


class Crawler:
    def __init__(self, site):
        self.site = site
        self.visited = []

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, url):
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, self.site.titleTag)
            body = self.safeGet(bs, self.site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

    def crawl(self):
        """
        Get pages from website home page
        """
        bs = self.getPage(self.site.url)
        targetPages = bs.findAll('a', href=re.compile(self.site.targetPattern))
        for targetPage in targetPages:
            targetPage = targetPage.attrs['href']
            if targetPage not in self.visited:
                self.visited.append(targetPage)
                if not self.site.absoluteUrl:
                    targetPage = '{}{}'.format(self.site.url, targetPage)
                self.parse(targetPage)


reuters = Website('Reuters', 'https://www.reuters.com', '^(/article/)',
                  False, 'h1', 'div.StandardArticleBody_body_1gnLA')
crawler = Crawler(reuters)
crawler.crawl()

GETTING https://www.reuters.com
GETTING https://www.reuters.com/article/us-usa-trump-5g/trump-national-security-team-sees-building-5g-network-as-option-idUSKBN1FH103
URL: https://www.reuters.com/article/us-usa-trump-5g/trump-national-security-team-sees-building-5g-network-as-option-idUSKBN1FH103
TITLE: Trump security team sees building U.S. 5G network as option
BODY:
WASHINGTON (Reuters) - President Donald Trump’s national security team is looking at options to counter the threat of China spying on U.S. phone calls that include the government building a super-fast 5G wireless network, a senior administration official said on Sunday. The official, confirming the gist of a report from Axios.com, said the option was being debated at a low level in the administration and was six to eight months away from being considered by the president himself. The 5G network concept is aimed at addressing what officials see as China’s threat to U.S. cyber security and economic security. The Trump admini

URL: https://www.reuters.com/article/us-usa-immigration-manchin/democratic-senator-criticizes-pelosis-immigration-comment-idUSKBN1FH0RC
TITLE: Democratic senator criticizes Pelosi's immigration comment
BODY:
WASHINGTON (Reuters) - U.S. Senator Joe Manchin, a moderate Democrat, said on Sunday he thought a new White House immigration plan was a good starting point, and he criticized House Democratic leader Nancy Pelosi for dismissing it as a way to “make America white again.” “We don’t need that type of rhetoric on either side, from Nancy, (Republican House Speaker) Paul Ryan or anybody else,” said Manchin, a West Virginian and a leader of a bipartisan Senate group working on immigration. He spoke on CNN’s “State of the Union” program. Manchin’s comments highlighted differences among Democrats ahead of a Feb. 8 deadline for the U.S. Congress to pass another spending bill and try to reach an immigration agreement that would also protect up to 1.8 million illegal immigrants brought to the 

URL: https://www.reuters.com/article/us-afghanistan-blast/militants-attack-afghan-army-post-near-military-academy-in-capital-idUSKBN1FI07M?il=0
TITLE: Militants attack Afghan army post near military academy in capital
BODY:
KABUL (Reuters) - At least four militants attacked an army outpost near one of Afghanistan’s main military academies on Monday and at least one soldier was killed and three wounded, a defense ministry official said. The attack in the western outskirts of the capital, Kabul, came two days after an ambulance bomb in the center of the city killed more than 100 people and just over a week after another attack on the Hotel Intercontinental killed more than 20. Both of those attacks were claimed by the Taliban. Ministry of Defence officials said the militants attacked the outpost near the well-defended Marshal Fahim military academy just before dawn. One of the attackers blew himself up, one had been killed and two were still fighting. One soldier had been killed and thre

KeyboardInterrupt: 

## Crawling multiple page types

In [1]:
class Website:
    """Common base class for all articles/pages"""

    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        

In [2]:
class Product(Website):
    """Contains information for scraping a product page"""

    def __init__(self, name, url, titleTag, productNumber, price):
        Website.__init__(self, name, url, TitleTag)
        self.productNumberTag = productNumberTag
        self.priceTag = priceTag


class Article(Website):
    """Contains information for scraping an article page"""

    def __init__(self, name, url, titleTag, bodyTag, dateTag):
        Website.__init__(self, name, url, titleTag)
        self.bodyTag = bodyTag
        self.dateTag = dateTag

In [None]:


def parsePage(url):
    
    if '/ideas/' in url:
        

oreilly = Website('O\'Reilly', 'https://oreilly.com', 'h1' '')        