In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
def getPage(url):
    """
    Utilty function used to get a Beautiful Soup object from a given URL
    """

    session = requests.Session()
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
    try:
        req = session.get(url, headers=headers)
    except requests.exceptions.RequestException:
        return None
    bs = BeautifulSoup(req.text, 'html.parser')
    return bs

## Dealing with different website layouts

In [3]:
import requests

class Content:
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body


def getPage(url):
    req = requests.get(url)
    return BeautifulSoup(req.text, 'html.parser')


def scrapeNYTimes(url):
    bs = getPage(url)
    title = bs.find('h1').text
    lines = bs.select('div.StoryBodyCompanionColumn div p')
    body = '\n'.join([line.text for line in lines])
    return Content(url, title, body)

def scrapeBrookings(url):
    bs = getPage(url)
    title = bs.find('h1').text
    body = bs.find('div', {'class', 'post-body'}).text
    return Content(url, title, body)


url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'
content = scrapeBrookings(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

url = 'https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html'
content = scrapeNYTimes(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)

Title: Delivering inclusive urban access: 3 uncomfortable truths
URL: https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/


The past few decades have been filled with a deep optimism about the role of cities and suburbs across the world. These engines of economic growth host a majority of world population, are major drivers of economic innovation, and have created pathways to opportunities for untold amounts of people.	






Jeffrey Gutman
Nonresident Senior Fellow - Global Economy and Development







Adie Tomer
Fellow - Metropolitan Policy Program

 Twitter
AdieTomer






But all is not well within our so-called Urban Century. Rapid urbanization, rising gentrification, concentrated poverty, and shortages of basic infrastructure have combined to create spatial inequity in cities and suburbs across the globe. The challenges of housing, moving, and employing so many people have led to longer travel times, rising hous

Title: The Men Who Want to Live Forever
URL: https://www.nytimes.com/2018/01/25/opinion/sunday/silicon-valley-immortality.html

Would you like to live forever? Some billionaires, already invincible in every other way, have decided that they also deserve not to die. Today several biotech companies, fueled by Silicon Valley fortunes, are devoted to “life extension” — or as some put it, to solving “the problem of death.”
It’s a cause championed by the tech billionaire Peter Thiel, the TED Talk darling Aubrey de Gray, Google’s billion-dollar Calico longevity lab and investment by Amazon’s Jeff Bezos. The National Academy of Medicine, an independent group, recently dedicated funding to “end aging forever.”
As the longevity entrepreneur Arram Sabeti told The New Yorker: “The proposition that we can live forever is obvious. It doesn’t violate the laws of physics, so we can achieve it.” Of all the slightly creepy aspects to this trend, the strangest is the least noticed: The people publicly ch

In [4]:
class Content:
    """
    Common base class for all articles/pages
    """
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        """
        Flexible printing function controls output
        """
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))

class Website:
    """ 
    Contains information about website structure
    """

    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [5]:
import requests
from bs4 import BeautifulSoup


class Crawler:

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')

    def safeGet(self, pageObj, selector):
        """
        Utilty function used to get a content string from a Beautiful Soup
        object and a selector. Returns an empty string if no object
        is found for the given selector
        """
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, site, url):
        """
        Extract content from a given page URL
        """
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

In [6]:
crawler = Crawler()

siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'h1', 'div.content > span'],
    ['Reuters', 'http://reuters.com', 'h1', 'div.StandardArticleBody_body'],
    ['Brookings', 'http://www.brookings.edu', 'h1', 'div.post-body'],
    ['New York Times', 'http://nytimes.com', 'h1', 'div.StoryBodyCompanionColumn div p']
]
websites = []
for row in siteData:
    websites.append(Website(row[0], row[1], row[2], row[3]))

crawler.parse(websites[0], 'http://shop.oreilly.com/product/0636920028154.do')
crawler.parse(
    websites[1], 'http://www.reuters.com/article/us-usa-epa-pruitt-idUSKBN19W2D0')
crawler.parse(
    websites[2],
    'https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')
crawler.parse(
    websites[3], 
    'https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html')

URL: http://shop.oreilly.com/product/0636920028154.do
TITLE: Learning Python, 5th Edition
BODY:
Get a comprehensive, in-depth introduction to the core Python language with this hands-on book. Based on author Mark Lutz’s popular training course, this updated fifth edition will help you quickly write efficient, high-quality code with Python. It’s an ideal way to begin, whether you’re new to programming or a professional developer versed in other languages.Complete with quizzes, exercises, and helpful illustrations,  this easy-to-follow, self-paced tutorial gets you started with both Python 2.7 and 3.3— the latest releases in the 3.X  and 2.X lines—plus all other releases in common use today. You’ll also learn some advanced language features that recently have become more common in Python code.Explore Python’s major built-in object types such as numbers, lists, and dictionariesCreate and process objects with Python statements, and learn Python’s general syntax modelUse functions to avoid 

URL: https://www.nytimes.com/2018/01/28/business/energy-environment/oil-boom.html
TITLE: Oil Boom Gives the U.S. a New Edge in Energy and Diplomacy
BODY:
HOUSTON — A substantial rise in oil prices in recent months has led to a resurgence in American oil production, enabling the country to challenge the dominance of Saudi Arabia and dampen price pressures at the pump.
The success has come in the face of efforts by Saudi Arabia and its oil allies to undercut the shale drilling spree in the United States. Those strategies backfired and ultimately ended up benefiting the oil industry.
Overcoming three years of slumping prices proved the resiliency of the shale boom. Energy companies and their financial backers were able to weather market turmoil — and the maneuvers of the global oil cartel — by adjusting exploration and extraction techniques.
After a painful shakeout in the industry that included scores of bankruptcies and a significant loss of jobs, a steadier shale-drilling industry is a

## Crawling through sites with search

In [7]:
class Content:
    """Common base class for all articles/pages"""

    def __init__(self, topic, url, title, body):
        self.topic = topic
        self.title = title
        self.body = body
        self.url = url

    def print(self):
        """
        Flexible printing function controls output
        """
        print('New article found for topic: {}'.format(self.topic))
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))

In [8]:
class Website:
    """Contains information about website structure"""

    def __init__(self, name, url, searchUrl, resultListing, resultUrl, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.searchUrl = searchUrl
        self.resultListing = resultListing
        self.resultUrl = resultUrl
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag

In [9]:
import requests
from bs4 import BeautifulSoup

class Crawler:

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html5lib')

    def safeGet(self, pageObj, selector):
        childObj = pageObj.select(selector)
        if childObj is not None and len(childObj) > 0:
            elems = []
            for elem in childObj:
                elems.append(elem.get_text()) 
            return '\n'.join(elems)
        return ''

    def search(self, topic, site):
        """
        Searches a given website for a given topic and records all pages found
        """
        bs = self.getPage(site.searchUrl + topic)
        searchResults = bs.select(site.resultListing)
        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs['href']
            # Check to see whether it's a relative or an absolute URL
            if(site.absoluteUrl):
                bs = self.getPage(url)
            else:
                bs = self.getPage(site.url + url)
            if bs is None:
                print('Something was wrong with that page or URL. Skipping!')
                return
            title = self.safeGet(bs, site.titleTag)
            body = self.safeGet(bs, site.bodyTag)
            if title != '' and body != '':
                content = Content(topic, title, body, url)
                content.print()


crawler = Crawler()

siteData = [
    ['O\'Reilly Media', 'http://oreilly.com', 'https://ssearch.oreilly.com/?q=',
        'article.product-result', 'p.title a', True, 'h1', 'div.product-description'],
    ['Reuters', 'http://reuters.com', 'http://www.reuters.com/search/news?blob=', 'div.search-result-content',
        'h3.search-result-title a', False, 'h1', 'div.StandardArticleBody_body'],
    ['Brookings', 'http://www.brookings.edu', 'https://www.brookings.edu/search/?s=',
        'div.list-content article', 'h4.title a', True, 'h1', 'div.post-body, div.techstream--content']
]
sites = []
for row in siteData:
    sites.append(Website(row[0], row[1], row[2],
                         row[3], row[4], row[5], row[6], row[7]))

topics = ['python', 'data science']
for topic in topics:
    print('GETTING INFO ABOUT: ' + topic)
    for targetSite in sites:
        crawler.search(topic, targetSite)

GETTING INFO ABOUT: python
New article found for topic: python
URL: Python for Scientists
TITLE: 
        
          Get the free ebook 
        
        More and more, scientists are seeing tech seep into their work. From data collection to team management, various tools exist to make your lives easier. But, where to start? Python is growing in popularity in scientific circles, due to its simple syntax and seemingly endless libraries. This free ebook gets you started on the path to a more streamlined process. With a collection of chapters from our top scientific books, you'll learn about the various options that await you as you strengthen your computational thinking.This free ebook includes chapters from:Python for Data AnalysisEffective Computation in PhysicsBioinformatics Data SkillsPython Data Science Handbook
      
BODY:
https://www.oreilly.com/programming/free/python-for-scientists.csp
New article found for topic: python
URL: Python Data for Developers
TITLE: 
        
        

New article found for topic: python
URL: Monty Python star Terry Jones dies aged 77
TITLE: LONDON (Reuters) - Terry Jones, one of the British Monty Python comedy team and director of religious satire “Life of Brian”, has died at the age of 77 after a long battle with dementia, his family said on Wednesday. Born in Wales in 1942, Jones was also an author, historian and poet. He had been diagnosed in 2015 with a rare form of dementia, FTD. Jones was one of the creators of Monty Python’s Flying Circus, the British TV show that rewrote the rules of comedy with surreal sketches, characters and catchphrases, in 1969. He co-directed the team’s first film “Monty Python and the Holy Grail” with fellow Python Terry Gilliam, and directed the subsequent Life of Brian and “The Meaning of Life.” Python Michael Palin, who met Jones at Oxford University, said he was “kind, generous, supportive and passionate about living life to the full”. “He was far more than one of the funniest writer-performers of

New article found for topic: python
URL: Pine no more! Monty Python celebrates 50 years of silliness
TITLE: FILE PHOTO: People attend the Silly Walk Parade, emulating a sketch from British comedy group Monty Python's television series to mark April Fool's day in Budapest, Hungary, April 1, 2019. REUTERS/Bernadett Szabo -/File PhotoLONDON (Reuters) - In what is billed as an “extremely silly” event, hordes of Monty Python fans will gather in full Gumby attire in London on Saturday to celebrate the British comedy troupe’s 50th anniversary. Kitted out in rubber boots, sleeveless sweaters, rolled-up trousers and with knotted handkerchiefs on their heads, they will attempt to set a Guinness World Record for the Largest Gathering of People Dressed as Gumbys. “It’s all so excitingly pointless,” said Python Terry Gilliam, who will host the event. The Gumbys - also noted for their ape-like posture, habit of speaking loudly and slowly, and the catchphrase “my brain hurts” - were recurring charact

New article found for topic: python
URL: The Silicon Valley Wage Premium
The Silicon Valley Wage Premium
TITLE: 
				
	
Software application developers earn large salaries in the United States, $96,260 a year on average. But in metropolitan San Jose they earn $131,270, the highest in the country. There are many partial explanations for this—local cost of living, differences in education levels, experience, and industry—but none of them quite account for it. It turns out that developers living in San Jose have acquired the specific skills most valued by employers.

As the map below shows, there is a huge amount of variation in earnings for software application developers across regional labor markets. In large metropolitan areas like New York, they earn $105,000, but in Louisville, they earn just $72,000.


  Average Salary of Software Application Developers by Metropolitan Area, 2013



  

Similar patterns could be shown for other occupations, of course; for even within the same job t

New article found for topic: python
URL: An Atlanta organization’s mission to bring racial equity to the tech ecosystem
An Atlanta organization’s mission to bring racial equity to the tech ecosystem
TITLE: 
				
Summary
Between the COVID-19 pandemic and the tragic death of George Floyd, the country’s ongoing crisis of racism has come into stark relief. Black Americans are disproportionately diagnosed with or dying from COVID-19 due to structural conditions, while also facing major economic risks as the racial unemployment gap between white and Black populations is the widest it’s been in five years. At the same time, Black people are still vulnerable to police violence that too often occurs without consequences. While there is a great deal of work to be done to dismantle structural racism, it is imperative to use this moment to remove racial barriers and invest in long-term prosperity for Black people, enterprises, and communities.
Closing the racial wealth divide can create better hea

New article found for topic: python
URL: Skills, success, and why your choice of college matters
Skills, success, and why your choice of college matters
TITLE: 
				
	
Amidst growing frustration with the cost of higher education, complaints also abound about its quality. One critique, launched in the book Academically Adrift by two sociologists, finds little evidence that college students score better on measures of critical thinking, writing, and reasoning after attending college. This is something of a paradox, since strong evidence shows that attending college tends to raise earnings power, even for students who start with mediocre preparation. 
Our recent study uses a different approach to assess the value of a college education. We find that the particular skills listed by a college’s alumni on their resumes predict how well graduates from those schools perform in terms of earning a living, meeting debt obligations, and working for high-paying or innovative companies. Since jobs r

New article found for topic: python
URL: Modeling with Data: Tools and Techniques for Scientific Computing
Modeling with Data: Tools and Techniques for Scientific Computing
TITLE: 
				
		PREFACE



				Should you use the book? This book is intended to be a complement to the standard stats textbook, in three ways.

First, descriptive and inferential statistics are kept separate beginning with the first sentence of the first chapter. I believe that the fusing of the two is the number one cause of confusion among statistics students.

Once descriptive modeling is given its own space, and models do not necessarily have to be just preparation for a test, the options blossom. There are myriad ways to convert a subjective understanding of the world into a mathematical model, including simulations, models like the Bernoulli/Poisson distributions from traditional probability theory, ordinary least squares, and who knows what else.

If those options aren’t enough, simple models can be combined 

New article found for topic: python
URL: Forum: Debating Bush’s Wars
Forum: Debating Bush’s Wars
TITLE: 
				
		In the 
		
				Winter 2007–08 issue 
		of Survival, Philip Gordon argued that America’s strategy against terror is failing ‘because the Bush administration chose to wage the wrong war’. Survival invited former Bush speechwriter and Deputy Assistant to the President Peter Wehner and Kishore Mahbubani, Dean and Professor at the Lee Kuan Yew School of Public Policy in Singapore, to reflect on Gordon’s arguments. Their 
		comments are available in the above PDF and Philip Gordon’s response is below.

I am grateful to Peter Wehner and Kishore Mahbubani for taking the time to comment on my essay, ‘Winning the Right War’. Their comments are valuable not only because both are prominent and influential thinkers but because their divergent views on the subject help to frame the debate: Mahbubani essentially agrees with me but wishes I had ‘gone even further’ in my analysis, while Wehne

New article found for topic: python
URL: Appointments Apocalypse
Appointments Apocalypse
TITLE: 
				
Anyone who doubts that the presidential appointments process is on the verge of collapse need only look at three recent events.

On April 30, President Bush’s 101st afternoon in office, the White House dumped 61 names into the Senate confirmation process in a desperate effort to beat the Clinton administration’s dismal mark after its 100th day. Despite smashing the single-day nomination record, Bush had nominated less than 30 percent of the candidates for sub-Cabinet posts by the end of that week.

On May 2, Senate Democrats announced that they were delaying a vote on two Justice Department nominees to express their anger over a change in the process that gives home-state Senators a say about federal judicial nominees. Not to be outdone, Republicans followed suit by placing holds on four Defense nominees to remind Secretary Donald Rumsfeld that he should communicate more frequently wit

New article found for topic: data science
URL: Building Data Science Teams
TITLE: 
        
          Get the free ebook 
        
        As data science evolves to become a business necessity, the importance of assembling a strong and innovative data teams grows. In this in-depth report, data scientist DJ Patil explains the skills,perspectives, tools and processes that position data science teams for success.
      
BODY:
https://www.oreilly.com/data/free/building-data-science-teams.csp
New article found for topic: data science
URL: 2014 Data Science Salary Survey
TITLE: 
        
          Get the free ebook 
        
        As a data professional, you are invited to share your valuable insights. Take the short, anonymous salary survey here: http://www.oreilly.com/data/salarysurvey​2015.csp. It only takes about 5-10 minutes to complete. Thank you.For the second year, O'Reilly Media conducted an anonymous survey to expose the tools successful data analysts and engineers use, and how

New article found for topic: data science
URL: BRIEF-Urovant Sciences Presents Positive Clinical Efficacy & Safety Data On Lead Drug Candidate Vibegron
TITLE: May 14 (Reuters) - Urovant Sciences Ltd: * UROVANT SCIENCES PRESENTS POSITIVE CLINICAL EFFICACY & SAFETY DATA ON LEAD DRUG CANDIDATE VIBEGRON AT VIRTUAL AMERICAN UROLOGICAL ASSOCIATION ANNUAL MEETING Source text for Eikon: Further company coverage: (Reuters.Briefs@thomsonreuters.com)Our Standards:The Thomson Reuters Trust Principles.
BODY:
/article/idUSFWN2CW0IL
New article found for topic: data science
URL: BRIEF-Gilead Sciences Shows Commitment To Scientific Innovation In HIV With New Prevention, Treatment & Cure Research Data
TITLE: July 1 (Reuters) - Gilead Sciences Inc: * GILEAD SCIENCES DEMONSTRATES COMMITMENT TO SCIENTIFIC INNOVATION IN HIV WITH NEW PREVENTION, TREATMENT AND CURE RESEARCH DATA PRESENTED AT AIDS 2020: VIRTUAL * GILEAD SCIENCES - HIV TREATMENT DATA TO BE PRESENTED INCLUDES POOLED ANALYSIS OF FOUR INTERNATION

New article found for topic: data science
URL: What all policy analysts need to know about data science
What all policy analysts need to know about data science
TITLE: 
					
		
				

	
			
		

	
							Alex Engler
		
							Rubenstein Fellow - Governance Studies
					
						Twitter
		@AlexCEngler
			
		
			
		
			

			
	
	
Conversations around data science typically contain a lot of buzzwords and broad generalizations that make it difficult to understand its pertinence to governance and policy. Even when well-articulated, the private sector applications of data science can sound quite alien to public servants. This is understandable, as the problems that Netflix and Google strive to solve are very different than those government agencies, think tanks, and nonprofit service providers are focused on. This does not mean, however, that there is no public sector value in the modern field of data science. With qualifications, data science offers a powerful framework to expand our evidence-b

New article found for topic: data science
URL: Big data, meet behavioral science
Big data, meet behavioral science
TITLE: 
				America’s community colleges offer the promise of a more affordable pathway to a bachelor’s degree. Students can pay substantially less for the first two years of college, transfer to a four-year college or university, and still earn their diploma in the same amount of time. At least in theory. Most community college students—80 percent of them—enter with the intention to transfer, but only 20 percent actually do so within five years of entering college. This divide represents a classic case of what behavioralists call an intention-action gap. 
Why would so many students who enter community colleges intending to transfer fail to actually do so? Put yourself in the shoes of a 20-something community college student. You’ve worked hard for the past couple years, earning credits and paying a lot less in tuition than you would have if you had enrolled immediately in

New article found for topic: data science
URL: Artificial intelligence and data analytics in India
Artificial intelligence and data analytics in India
TITLE: 
				Advances in artificial intelligence and data analytics are propelling innovation in many parts of the world.[1] China, for example, has committed $150 billion towards its goal of becoming a world leader by 2030.[2] And while the United States government is investing only $1.1 billion in non-classified AI research, its private sector is spending billions in fields from finance and healthcare to retail and defense.[3] This is transforming a number of different sectors.[4]
	
		
				

	
			
		

	
							Shamika Ravi
		
							Non-Resident Senior Fellow - Governance Studies
					
						Twitter
		@ShamikaRavi
			
		
			
		
			

	

	
			
		

	
							Darrell M. West
		
							Vice President and Director - Governance Studies					Senior Fellow - Center for Technology Innovation
					
						Twitter
		@DarrWest
			
		
			
		
			

			
	


New article found for topic: data science
URL: Charts of the week: Advancing women and girls in science
Charts of the week: Advancing women and girls in science
TITLE: 
				“On this International Day, I urge commitment to end bias, greater investments in science, technology, engineering and math education for all women and girls as well as opportunities for their careers and longer-term professional advancement so that all can benefit from their ground-breaking future contributions.” — UN Secretary-General António Guterres
Three years ago, the UN proclaimed February 11 the International Day of Women and Girls in Science. This new designation was part of a larger effort toward closing gender gaps around the globe, as outline in the 2030 Sustainable Development Goals. Though more women are pursuing careers in science, technology, engineering, and mathematics (STEM), it is clear that gender gaps in these fields—and harmful biases– persist today.
Highlighted below are charts and commentary

New article found for topic: data science
URL: The opportunities and challenges of data analytics in health care
The opportunities and challenges of data analytics in health care
TITLE: 
				Data analytics tools have the potential to transform health care in many different ways. In the near future, routine doctor’s visits may be replaced by regularly monitoring one’s health status and remote consultations. The inpatient setting will be improved by more sophisticated quality metrics drawn from an ecosystem of interconnected digital health tools. The care patients receive may be decided in consultation with decision support software that is informed not only by expert judgments but also by algorithms that draw on information from patients around the world, some of whom will differ from the “typical” patient. Support may be customized for an individual’s personal genetic information, and doctors and nurses will be skilled interpreters of advanced ways to diagnose, track, and treat illness

New article found for topic: data science
URL: Wars of none: AI, big data, and the future of insurgency
Wars of none: AI, big data, and the future of insurgency
TITLE: 
				When U.S. Special Forces entered Afghanistan in 2001, Facebook didn’t exist, the iPhone had yet to be invented, and “A.I.” often referred to an NBA star. Seventeen years later, American special operations forces continue to ride horseback in rural Afghanistan, but information technology has advanced rapidly. Recent breakthroughs in robotics and artificial intelligence (AI) have captured the popular imagination and prompted sober talk of an impending AI revolution. Yet surprisingly little of that talk has touched on the small wars and insurgencies that have dominated U.S. foreign policy in the 21st century.	
		
				

	
			
		

	
							Chris Meserole
		
							Deputy Director - Artificial Intelligence and Emerging Technology Initiative					Fellow - Foreign Policy
					
						Twitter
		chrismeserole
			
		
			
		
			


New article found for topic: data science
URL: Using big data to link poor farmers to finance
Using big data to link poor farmers to finance
TITLE: 
				Two billion adults in the world are excluded from credit. The situation is especially bad for small farmers in rural areas who are unable to access loans to invest in their farms, trapped in a vicious circle of low productivity, low yields, and poor income. The Initiative for Smallholder Finance estimates that smallholders globally access just $50 billion of the $200 billion of lending that they require to grow their operations and improve their lives.	
		
				

	
			
		

	
							Roy Parizat
		
							Fund Manager, BioCarbon Fund Initiative for Sustainable Forest Landscapes - World Bank
		
			
		
			

	

	
			
			H
		
	

	
							Heinz-Wilhelm Strubenhoff
		
							Agribusiness Program Manager, World Bank Group
		
			
		
			

			
	
	
The global growth of microfinance banks has created new opportunities for financial inclusion, with ou

New article found for topic: data science
URL: Drawing from improvement science to bridge education research and practice
Drawing from improvement science to bridge education research and practice
TITLE: 
				A three-legged stool can be tough to balance if the legs are uneven. We’ve been thinking a lot about how to do just that with the launch of the  Millions Learning Real-time Scaling Labs starting in Brazil, Jordan, Tanzania, Côte d’Ivoire, and the U.S. city of Philadelphia. How do we balance and give equal weight to our three primary objectives: learn from, document, and support education interventions in the process of scaling? We think that drawing from principles behind adaptive and iterative learning methodologies, such as improvement science, can help.	
		
				

	
			
		

	
							Jenny Perlman Robinson
		
							Senior Fellow - Global Economy and Development, Center for Universal Education
					
						Twitter
		@JennyPerlman
			
		
			
		
			

			
	
	
Our starting point with 

New article found for topic: data science
URL: The “smart society” of the future doesn’t look like science fiction
The “smart society” of the future doesn’t look like science fiction
TITLE: 
					
		
				

	
			
		

	
							Bhaskar Chakravorti
		
							Non-Resident Senior Fellow - Brookings India
		
			
		
			

	

	
			
			R
		
	

	
							Ravi Shankar Chaturvedi
		
							Associate Director for Research - Fletcher’s Institute for Business in the Global Context, Tufts University					Doctoral Research Fellow for Innovation - Fletcher’s Institute for Business in the Global Context, Tufts University
		
			
		
			

			
	
	
What is a “smart” society? While flights of imagination from science-fiction writers, filmmakers, and techno-futurists involve things like flying cars and teleportation, in practice smart technology is making inroads in a piecemeal fashion, often in rather banal circumstances. In Chicago, for example, predictive analytics is improving health inspections schedules in re

New article found for topic: data science
URL: A new alphabet for Europe: Algorithms, big data, and the computer chip
A new alphabet for Europe: Algorithms, big data, and the computer chip
TITLE: 
				If the biggest disrupter of the last few decades was Deng Xiaoping—the father of modern China—the big disrupter of the next few decades may well be John McCarthy. McCarthy, an American professor of Computer Science, is believed by many to be the father of artificial intelligence. Interestingly, the two have an epiphany in common. In 1979, Deng, a lifelong communist, visited the United States and came back a believer in market capitalism. In 1968, after a two-day visit to Czechoslovakia, McCarthy, who was raised as a communist by his immigrant parents, became a free-market Republican.
The ideas of computer scientists and mathematicians like McCarthy are radically transforming the way we communicate, and the way we make, buy, and sell goods and services. The changes will likely be so great 

## Crawling Sites through Links

In [10]:
class Website:

    def __init__(self, name, url, targetPattern, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.targetPattern = targetPattern
        self.absoluteUrl = absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag


class Content:

    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

    def print(self):
        print('URL: {}'.format(self.url))
        print('TITLE: {}'.format(self.title))
        print('BODY:\n{}'.format(self.body))

In [11]:
import re


class Crawler:
    def __init__(self, site):
        self.site = site
        self.visited = []

    def getPage(self, url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html5lib')

    def safeGet(self, pageObj, selector):
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''

    def parse(self, url):
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs, self.site.titleTag)
            body = self.safeGet(bs, self.site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()

    def crawl(self):
        """
        Get pages from website home page
        """
        bs = self.getPage(self.site.url)
        targetPages = bs.findAll('a', href=re.compile(self.site.targetPattern))
        for targetPage in targetPages:
            targetPage = targetPage.attrs['href']
            if targetPage not in self.visited:
                self.visited.append(targetPage)
                if not self.site.absoluteUrl:
                    targetPage = '{}{}'.format(self.site.url, targetPage)
                self.parse(targetPage)


reuters = Website('Reuters', 'https://www.reuters.com', '^(/article/)',
                  False, 'h1', 'div.StandardArticleBody_body')
crawler = Crawler(reuters)
crawler.crawl()

## Crawling multiple page types

In [12]:
class Website:
    """Common base class for all articles/pages"""

    def __init__(self, name, url, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        

In [13]:
class Product(Website):
    """Contains information for scraping a product page"""

    def __init__(self, name, url, titleTag, productNumber, price):
        Website.__init__(self, name, url, TitleTag)
        self.productNumberTag = productNumberTag
        self.priceTag = priceTag

class Article(Website):
    """Contains information for scraping an article page"""

    def __init__(self, name, url, titleTag, bodyTag, dateTag):
        Website.__init__(self, name, url, titleTag)
        self.bodyTag = bodyTag
        self.dateTag = dateTag

In [14]:

def parsePage(url):
    
    if '/ideas/' in url:
        

oreilly = Website('O\'Reilly', 'https://oreilly.com', 'h1' '')        

IndentationError: expected an indented block (<ipython-input-14-684525c5e541>, line 6)