# 1

In [70]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('http://www.pythonscraping.com/pages/page1.html')
bs = BeautifulSoup(html, 'html.parser')
print(bs.h1)

<h1>An Interesting Title</h1>


In [71]:
bs

<html>
<head>
<title>A Useful Page</title>
</head>
<body>
<h1>An Interesting Title</h1>
<div>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</div>
</body>
</html>

In [69]:
bs.body.div.contents

['\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n']

### 异常处理

In [78]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
try:
    html = urlopen('http://pythonscraping.com/pages/page1.html')
except HTTPError as e:
    print(e)
    # 返回空值，中断程序，或者执行另一个方案
except URLError as e:
    print('The server can not be found!')
else:
    # 程序继续。注意:如果你已经在上面异常捕捉那一段代码里返回或中断(break)， 
    # 那么就不需要使用else语句了，这段代码也不会执行
    print('It worked!')

It worked!


In [80]:
bs

<html>
<head>
<title>A Useful Page</title>
</head>
<body>
<h1>An Interesting Title</h1>
<div>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</div>
</body>
</html>

In [82]:
bs.nonExistingTag ## return None
# bs.nonExistingTag.someTag ## return AttributeError

In [83]:
try:
    badContent = bs.nonExistingTag.someTag
except AttributeError as e:
    print('Tag is not found!')
else:
    if badContent == None:
        print('Tag is not found!')
    else:
        print(badContent)

Tag is not found!




**封装函数**

In [85]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    
    try:
        bs = BeautifulSoup(html.read(),'html.parser')
        title = bs.body.h1
    except AttributeError:
        return None
    else:
        return title
    
url = 'http://www.pythonscraping.com/pages/page1.html'

getTitle(url)

<h1>An Interesting Title</h1>

# 2

In [98]:
url = 'https://www.pythonscraping.com/pages/warandpeace.html'

html = urlopen(url)
bs = BeautifulSoup(html, 'html.parser')

nameList = bs.find_all('span', {'class':{'green','red'}}) ## type: bs4.element.ResultSet
for name in nameList:
    print(name.get_text()) ## get_text() return string.

Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don't tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by
that Antichrist- I really believe he is Antichrist- I will have
nothing more to do with you and you are no longer my friend, no longer
my 'faithful slave,' as you call yourself! But how do you do? I see
I have frightened you- sit down and tell me all the news.
Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
If you have nothing better to do, Count [or Prince], and if the
prospect of spending an evening with a poor invalid is not too
terrible, I shall be very charmed to see you tonight between 7 and 10-
Annette Scherer.
Heavens! what a virulent attack!
the prince
Anna Pavlovna
First of all, dear friend, tell me how you are. Set your friend's
mind at rest,
Can one be well while suffering morally? Can one be calm in times
like these if one

In [133]:
html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'html.parser')

In [115]:
for child in bs.find('table',{'id':'giftList'}):
         print(child)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


In [124]:
for child in bs.find('img',{'src':'../img/gifts/img1.jpg'}):
         print(child)

In [128]:
bs.find('img',{'src':'../img/gifts/img1.jpg'}).parent.previous_sibling.get_text()

'\n$15.00\n'

正则表达式的应用

In [129]:
import re

In [138]:
imgs = bs.find_all('img',{'src':re.compile('\.\.\/img\/gifts\/img.*\.jpg')})
for img in imgs:
    print(img['src'])

../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg


In [145]:
imgs[1].attrs['src']

'../img/gifts/img2.jpg'

In [146]:
bs.find_all(lambda tag:len(tag.attrs) == 2)

[<img src="../img/gifts/logo.jpg" style="float:left;"/>,
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005.00
 </td><td>
 <img src="../img/gifts/img3.jpg"/>
 </td>

# 3

In [176]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from selenium import webdriver
import os

In [241]:
site = 'https://en.wikipedia.org/wiki/Andrea_Martin'

driver = webdriver.Chrome(executable_path = '/Users/tushimin/Desktop/chromedriver')

driver.get(site)
soup = BeautifulSoup(driver.page_source, 'html.parser')

driver.close()

  driver = webdriver.Chrome(executable_path = '/Users/tushimin/Desktop/chromedriver')


In [197]:
links = soup.find('div',{'id':'bodyContent'}).find_all('a',href = re.compile('^(/wiki/)((?!:).)*$'))

for link in links:
    print(link.attrs['href'])

/wiki/Kevin_Bacon_(disambiguation)
/wiki/Philadelphia
/wiki/Kevin_Bacon_filmography
/wiki/Kyra_Sedgwick
/wiki/Sosie_Bacon
/wiki/Edmund_Bacon_(architect)
/wiki/Michael_Bacon_(musician)
/wiki/Holly_Near
/wiki/Footloose_(1984_film)
/wiki/JFK_(film)
/wiki/A_Few_Good_Men
/wiki/Apollo_13_(film)
/wiki/Mystic_River_(film)
/wiki/Balto_(film)
/wiki/Sleepers
/wiki/The_Woodsman_(2004_film)
/wiki/Animal_House
/wiki/Diner_(1982_film)
/wiki/Tremors_(1990_film)
/wiki/Crazy,_Stupid,_Love
/wiki/Friday_the_13th_(1980_film)
/wiki/Flatliners
/wiki/The_River_Wild
/wiki/Wild_Things_(film)
/wiki/Stir_of_Echoes
/wiki/Hollow_Man
/wiki/Frost/Nixon_(film)
/wiki/Black_Mass_(film)
/wiki/Patriots_Day_(film)
/wiki/Fox_Broadcasting_Company
/wiki/The_Following
/wiki/HBO
/wiki/Taking_Chance
/wiki/Golden_Globe_Award
/wiki/Screen_Actors_Guild_Award
/wiki/Primetime_Emmy_Award
/wiki/I_Love_Dick_(TV_series)
/wiki/Golden_Globe_Award_for_Best_Actor_%E2%80%93_Television_Series_Musical_or_Comedy
/wiki/The_Guardian
/wiki/Academy_

In [215]:
import datetime
import random 
from urllib.request import urlopen
import re
from selenium import webdriver

random.seed(datetime.datetime.now())

def getLinks(articleUrl):
    site = f'http://en.wikipedia.org{articleUrl}'
    driver = webdriver.Chrome(executable_path = '/Users/tushimin/Desktop/chromedriver')
    driver.get(site)
    bs = BeautifulSoup(driver.page_source, 'html.parser')
    driver.close()
    
    links = bs.find('div',{'id':'bodyContent'}).find_all('a',href = re.compile('^(/wiki/)((?!:).)*$'))
    
    return links

links = getLinks('/wiki/Kevin_Bacon')

while len(links) > 0:
    newActicleUrl = links[random.randint(2,len(links)-1)].attrs['href']
    print(newActicleUrl)
    links = getLinks(newActicleUrl)

  driver = webdriver.Chrome(executable_path = '/Users/tushimin/Desktop/chromedriver')


/wiki/Bob_Odenkirk
/wiki/Andrea_Martin
/wiki/Bobby%27s_World
/wiki/Xploration_Station
/wiki/46th_Daytime_Emmy_Awards
/wiki/List_of_Daytime_Emmy_Award_winners
/wiki/Daytime_Emmy_Award_for_Outstanding_Younger_Performer_in_a_Drama_Series
/wiki/Spencer_Cassadine
/wiki/Emma_Drake
/wiki/General_Hospital
/wiki/Children_of_General_Hospital
/wiki/Josslyn_Jacks
/wiki/General_Hospital_characters_(2010s)#Ewen_Keenan
/wiki/Windy_City_Times
/wiki/San_Francisco_Sentinel
/wiki/Los_Angeles_Times
/wiki/Times_Mirror_Co.
/wiki/Cable_television
/wiki/AMIS_(ISP)


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=108.0.5359.94)
Stacktrace:
0   chromedriver                        0x000000010eb07f38 chromedriver + 4910904
1   chromedriver                        0x000000010ea87a03 chromedriver + 4385283
2   chromedriver                        0x000000010e6cc747 chromedriver + 472903
3   chromedriver                        0x000000010e6a2ff5 chromedriver + 303093
4   chromedriver                        0x000000010e73bb0f chromedriver + 928527
5   chromedriver                        0x000000010e751763 chromedriver + 1017699
6   chromedriver                        0x000000010e736ee3 chromedriver + 909027
7   chromedriver                        0x000000010e70130c chromedriver + 688908
8   chromedriver                        0x000000010e70288e chromedriver + 694414
9   chromedriver                        0x000000010ead51de chromedriver + 4702686
10  chromedriver                        0x000000010ead9b19 chromedriver + 4721433
11  chromedriver                        0x000000010eae128e chromedriver + 4752014
12  chromedriver                        0x000000010eada91a chromedriver + 4725018
13  chromedriver                        0x000000010eaaeb02 chromedriver + 4545282
14  chromedriver                        0x000000010eaf9888 chromedriver + 4851848
15  chromedriver                        0x000000010eaf9a05 chromedriver + 4852229
16  chromedriver                        0x000000010eb0fe5f chromedriver + 4943455
17  libsystem_pthread.dylib             0x00007fff20317950 _pthread_start + 224
18  libsystem_pthread.dylib             0x00007fff2031347b thread_start + 15


In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
from selenium import webdriver

pages = set()

def getLinks(pageUrl):
    global pages
    
    site = f'http://en.wikipedia.org{pageUrl}'
    driver = webdriver.Chrome(executable_path = '/Users/tushimin/Desktop/chromedriver')
    driver.get(site)
    bs = BeautifulSoup(driver.page_source,'html_parser')
    driver.close()
    
    links = bs.find('div',{'id':'bodyContent'}).find_all('a',href = re.compile('^(/wiki/)((?!:).)*$'))
    
    for link in links:
        if link.attrs['href'] not in pages:
            newPage = link.attrs['href']
            pages.add(newPage)
            getLinks(newPage)

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
from selenium import webdriver

pages = set()

def getLinks(pageUrl):
    global pages
    
    site = f'http://en.wikipedia.org{pageUrl}'
    driver = webdriver.Chrome(executable_path = '/Users/tushimin/Desktop/chromedriver')
    driver.get(site)
    bs = BeautifulSoup(driver.page_source,'html_parser')
    driver.close()
    
    try:
        print(bs.find('h1').find('span').get_text())
        print(bs.find(id ='mw-content-text').find_all('p')[1])
        print(bs.find(id='ca-edit').find('a').attrs['href'])
    except AttributeError:
        print('lost some tags!')
    
    links = bs.find('div',{'id':'bodyContent'}).find_all('a',href = re.compile('^(/wiki/)((?!:).)*$'))
    
    for link in links:
        if link.attrs['href'] not in pages:
            newPage = link.attrs['href']
            pages.add(newPage)
            getLinks(newPage)

getLinks('')

In [281]:
import requests

class Content:
    def __init__(self, url, title, body):
        self.url = url
        self.title = title
        self.body = body

def getPage(url):
    req = requests.get(url)
    return BeautifulSoup(req.text, 'html.parser')

def scrapeMedium(url):
    bs = getPage(url)
    title = bs.find("h1",id='69eb').text
    lines = bs.find_all("p")
    body = '\n'.join([line.text for line in lines])
    return Content(url, title, body)

def scrapeBrookings(url):
    bs = getPage(url)
    title = bs.find("h1").text
    body = bs.find("div",{"class","post-body"}).text
    return Content(url, title, body)

In [283]:
url = 'https://www.brookings.edu/blog/future-development/2018/01/26/delivering-inclusive-urban-access-3-uncomfortable-truths/'
content = scrapeBrookings(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.ubrl))
print(content.body)
url = 'https://medium.com/analytics-vidhya/basics-of-forecast-accuracy-db704b0b001b'
content = scrapeMedium(url)
print('Title: {}'.format(content.title))
print('URL: {}\n'.format(content.url))
print(content.body)


In [305]:
import requests
from bs4 import BeautifulSoup

class Website:
    def __init__(self,name,url,titleTag,bodyTag):
        self.name = name
        self.url = url
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        
class Content:
    def __init__(self,url,title,body):
        self.url = url
        self.title = title
        self.body = body
        
    def print(self):
        print('url:{}'.format(self.url))
        print('title:{}'.format(self.title))
        print('body:{}'.format(self.body))
        
class Crawler:
    def getPage(self,url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')
    
    def safeGet(self, pageObj, selector):
        selectedItems = pageObj.select(selector)
        if selectedItems is not None and len(selectedItems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedItems])
        return ''
    
    def parse(self,site,url):
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs,site.titleTag)
            body = self.safeGet(bs,site.bodyTag)
        if title != '' and body != '':
            content = Content(url,title,body)
            content.print()

In [306]:
crawler = Crawler()
siteData = [
         ['O\'Reilly Media', 'http://oreilly.com',
         'h1', 'section#product-description'],
         ['Reuters', 'http://reuters.com', 'h1',
         'div.StandardArticleBody_body_1gnLA'],
         ['Brookings', 'http://www.brookings.edu',
         'h1', 'div.post-body'],
         ['New York Times', 'http://nytimes.com',
         'h1', 'p.story-content']
     ]
websites = []
for row in siteData:
    websites.append(Website(row[0],row[1],row[2],row[3]))
crawler.parse(websites[0], 'http://shop.oreilly.com/product/0636920028154.do')
crawler.parse(websites[1], 'http://www.reuters.com/article/'\
 'us-usa-epa-pruitt-idUSKBN19W2D0')
crawler.parse(websites[2], 'https://www.brookings.edu/blog/'\
 'techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/')
crawler.parse(websites[3], 'https://www.nytimes.com/2018/01/'\
 '28/business/energy-environment/oil-boom.html')

url:https://www.brookings.edu/blog/techtank/2016/03/01/idea-to-retire-old-methods-of-policy-education/
title:Idea to Retire: Old methods of policy education
Idea to Retire: Old methods of policy education
body:
Public policy and public affairs schools aim to train competent creators and implementers of government policy. While drawing on the principles that gird our economic and political systems to provide a well-rounded education, like law schools and business schools, policy schools provide professional training. They are quite distinct from graduate programs in political science or economics which aim to train the next generation of academics. As professional training programs, they add value by imparting both the skills which are relevant to current employers, and skills which we know will be relevant as organizations and societies evolve. 
The relevance of the skills that policy programs impart to address problems of today and tomorrow bears further discussion. We are living thro

### 搜索抓去网站

In [326]:
import requests
from bs4 import BeautifulSoup

class Website:
    def __init__(self, name, url, searchUrl, resultListing,
             resultUrl, absoluteUrl, titleTag, bodyTag):
        self.name = name
        self.url = url
        self.searchUrl = searchUrl
        self.resultListing = resultListing
        self.resultUrl = resultUrl
        self.absoluteUrl=absoluteUrl
        self.titleTag = titleTag
        self.bodyTag = bodyTag

class Content:
    def __init__(self,topic,url,title,body):
        self.topic = topic
        self.url = url
        self.title = title
        self.body = body
        
    def print(self):
        print("New article found for topic: {}".format(self.topic))
        print("TITLE: {}".format(self.title))
        print("BODY:\n{}".format(self.body))
        print("URL: {}".format(self.url))
        
class Crawler:
    def getPage(self,url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')
    
    def safeGet(self, pageObj, selector):
        selectedItems = pageObj.select(selector)
        if selectedItems is not None and len(selectedItems) > 0:
            return selectedItems[0].get_text()
        return ''
    
    def search(self,topic,site):
        bs = self.getPage(site.searchUrl + topic)
        searchResults = bs.select(site.resultListing)
        for result in searchResults:
            url = result.select(site.resultUrl)[0].attrs['href']
            if(site.absoluteUrl):
                bs = self.getPage(url)
            else:
                bs = self.getPage(site.url + url)
            if bs is None:
                print("Something was wrong with that page or URL. Skipping!")
                return None
            title = self.safeGet(bs,site.titleTag)
            body = self.safeGet(bs,site.bodyTag)
            if title != '' and body != '':
                content = Content(topic,url,title,body)
                content.print()    

In [327]:
crawler = Crawler()
siteData = [
 ['O\'Reilly Media', 'http://oreilly.com',
     'https://ssearch.oreilly.com/?q=','article.product-result',
     'p.title a', True, 'h1', 'section#product-description'],
 ['Reuters', 'http://reuters.com',
     'http://www.reuters.com/search/news?blob=',
     'div.search-result-content','h3.search-result-title a',
     False, 'h1', 'div.StandardArticleBody_body_1gnLA'],
 ['Brookings', 'http://www.brookings.edu',
     'https://www.brookings.edu/search/?s=',
     'div.list-content article', 'h4.title a', True, 'h1',
     'div.post-body']
]
sites = []
for row in siteData:
    sites.append(Website(row[0], row[1], row[2],
                          row[3], row[4], row[5], row[6], row[7]))
    
topics = ['python', 'data science']
for topic in topics:
    print("GETTING INFO ABOUT: " + topic)
for targetSite in sites:
    crawler.search(topic, targetSite)

GETTING INFO ABOUT: python
GETTING INFO ABOUT: data science
New article found for topic: data science
TITLE: Reckoning with science, medicine, and scapegoating
BODY:

On Oct. 8, California became the first state to require Ethnic Studies courses for students in order to graduate from high school. All California high schools must offer Ethnic Studies beginning in the fall of 2025, and all students must complete one semester starting with the graduating class of 2030. In signing the bill, Gov. Gavin Newsom acknowledged that “America is shaped by our shared history, much of it painful and etched with woeful injustice.” Students “must understand our nation’s full history if we expect them to one day build a more just society.” 







Jennifer Lee

					Julian Clarence Levi Professor of Social Sciences - Columbia University 

					Robbert Dijkgraaf Member - Institute for Advanced Study 

 Twitter
JLeeSoc





This is long overdue. Ethnic Studies will help Californians understand how our pa

### 通过链接抓取网站

从每个网站的主页开始，定位内链，并解析在每个内链页面发现的内容。

In [353]:
import requests
from bs4 import BeautifulSoup

class Website:
    def __init__(self,name,url,targetPattern,absoluteUrl,titleTag,bodyTag):
        self.name = name
        self.url = url
        self.absoluteUrl = absoluteUrl
        self.targetPattern = targetPattern
        self.titleTag = titleTag
        self.bodyTag = bodyTag
        
class Content:
    def __init__(self,url,title,body):
        self.url = url
        self.title = title
        self.body = body
        
    def print(self):
        print('url:{}'.format(self.url))
        print('title:{}'.format(self.title))
        print('body:{}'.format(self.body))

class Crawler:
    '''
    进入主页，根据target pattern找内链，for循环进内链爬内容。
    '''
    def __init__(self, site):
        self.site = site
        self.visited = []
    
    def getPage(self,url):
        try:
            req = requests.get(url)
        except requests.exceptions.RequestException:
            return None
        return BeautifulSoup(req.text, 'html.parser')
    
    def safeGet(self, pageObj, selector):
        selectedElems = pageObj.select(selector)
        if selectedElems is not None and len(selectedElems) > 0:
            return '\n'.join([elem.get_text() for elem in selectedElems])
        return ''
        
    def parse(self, url):
        bs = self.getPage(url)
        if bs is not None:
            title = self.safeGet(bs,self.site.titleTag)
            body = self.safeGet(bs,self.site.bodyTag)
            if title != '' and body != '':
                content = Content(url, title, body)
                content.print()
                
    def crawl(self):
        bs = getPage(self.site.url)
        targetPages = bs.find_all('a', href=re.compile(self.site.targetPattern))
        for targetPage in targetPages:
            targetPage = targetPages.attrs['href']
            if targetPage not in self.visited:
                self.visited.append(targetPage)
                if self.site.absoluteUrl == False:
                    targetPage = '{}{}'.format(self.site.url, targetPage)
                self.parse(targetPage)

In [352]:
reuters = Website('Reuters', 'https://www.reuters.com', '^(/article/)', False,
         'h1', 'div.StandardArticleBody_body_1gnLA')
crawler = Crawler(reuters)
crawler.crawl()

类的学习

In [246]:
class GameMan:
    def __init__(self, name, gender, age, index):
        self.name = name
        self.gender = gender
        self.age = age
        self.index = index

    def grassland(self):
        self.index -= 200

    def practice(self):
        self.index += 100

    def incest(self):
        self.index -= 500

    def detail(self):
        """注释：当前对象的详细情况"""

        temp = "姓名:%s ; 性别:%s ; 年龄:%s ; 战斗力:%s" % (self.name, self.gender, self.age, self.index)
        print(temp)


In [247]:
Cang = GameMan('苍井井','女',18,1000)
Dong = GameMan('东尼木木','男',20,1800)
Bo = GameMan('波多多','女',19,2500)

In [248]:
Cang.grassland()
Cang.detail()

姓名:苍井井 ; 性别:女 ; 年龄:18 ; 战斗力:800


In [249]:
Dong.incest()
Dong.detail()

姓名:东尼木木 ; 性别:男 ; 年龄:20 ; 战斗力:1300


'set-top box'