In [1]:
from urllib.request import urlopen

# Download the demo page. read() returns bytes, so decode them as UTF-8 to get
# readable text (the page contains Chinese characters).
# The `with` block closes the HTTP response once the body has been read.
with urlopen("https://mofanpy.com/static/scraping/basic-structure.html") as response:
    html = response.read().decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>Scraping tutorial 1 | 莫烦Python</title>
	<link rel="icon" href="{{ site_url }}/static/img/description/tab_icon.png">
</head>
<body>
	<h1>爬虫测试1</h1>
	<p>
		这是一个在 <a href="{{ site_url }}/">莫烦Python</a>
		<a href="{{ site_url }}/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>

</body>
</html>


In [2]:
import re
# Non-greedy capture of whatever sits between <title> and </title>;
# findall returns every match, and the first one is the page title.
title_pattern = re.compile(r"<title>(.+?)</title>")
res = title_pattern.findall(html)
print("\nPage title is: ", res[0])


Page title is:  Scraping tutorial 1 | 莫烦Python


In [3]:
# re.DOTALL lets "." match newlines too, so a <p> element that spans
# several lines is captured as a single match.
paragraph_pattern = re.compile(r"<p>(.*?)</p>", flags=re.DOTALL)
res = paragraph_pattern.findall(html)
# The first match is the raw inner HTML of the paragraph, including its
# nested <a> tags and the surrounding whitespace.
print("\nPage paragraph is: ", res[0])


Page paragraph is:  
		这是一个在 <a href="{{ site_url }}/">莫烦Python</a>
		<a href="{{ site_url }}/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	


In [5]:
# Capture the value of every href="..." attribute in the page, whichever tag
# carries it (so the <link> icon is listed alongside the <a> anchors).
href_pattern = re.compile(r'href="(.*?)"')
res = href_pattern.findall(html)
print("\nAll links: ", res)


All links:  ['{{ site_url }}/static/img/description/tab_icon.png', '{{ site_url }}/', '{{ site_url }}/tutorials/data-manipulation/scraping/']


In [7]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Download the demo page. read() returns bytes, so decode them as UTF-8 to get
# readable text (the page contains Chinese characters).
# The `with` block closes the HTTP response once the body has been read.
with urlopen("https://mofanpy.com/static/scraping/basic-structure.html") as response:
    html = response.read().decode('utf-8')
print(html)

<!DOCTYPE html>
<html lang="cn">
<head>
	<meta charset="UTF-8">
	<title>Scraping tutorial 1 | 莫烦Python</title>
	<link rel="icon" href="{{ site_url }}/static/img/description/tab_icon.png">
</head>
<body>
	<h1>爬虫测试1</h1>
	<p>
		这是一个在 <a href="{{ site_url }}/">莫烦Python</a>
		<a href="{{ site_url }}/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>

</body>
</html>


In [8]:
# Parse the page with the lxml backend, then look up the first <h1> and the
# first <p> in the document.
soup = BeautifulSoup(html, 'lxml')
first_heading = soup.find('h1')
first_paragraph = soup.find('p')
print(first_heading)
print('\n', first_paragraph)

<h1>爬虫测试1</h1>

 <p>
		这是一个在 <a href="{{ site_url }}/">莫烦Python</a>
<a href="{{ site_url }}/tutorials/data-manipulation/scraping/">爬虫教程</a> 中的简单测试.
	</p>


In [9]:
# Collect every <a> tag first, then read each tag's href attribute.
# Anchors that carry no href are skipped instead of raising KeyError.
anchors = soup.find_all('a')
print(anchors)
all_href = [anchor['href'] for anchor in anchors if anchor.has_attr('href')]
print('\n', all_href)

[<a href="{{ site_url }}/">莫烦Python</a>, <a href="{{ site_url }}/tutorials/data-manipulation/scraping/">爬虫教程</a>]

 ['{{ site_url }}/', '{{ site_url }}/tutorials/data-manipulation/scraping/']


In [10]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Download the list demo page. read() returns bytes, so decode them as UTF-8
# to get readable text.
# The `with` block closes the HTTP response once the body has been read.
with urlopen("https://mofanpy.com/static/scraping/list.html") as response:
    html = response.read().decode('utf-8')


In [11]:
soup = BeautifulSoup(html, 'lxml')

# Narrow the search to <li> elements whose class is "month".
month_filter = {"class": "month"}
month = soup.find_all('li', attrs=month_filter)
for month_item in month:
    # .text is the property form of get_text(): the tag's text content.
    print(month_item.text)


一月
二月
三月
四月
五月


In [12]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Download the table demo page. read() returns bytes, so decode them as UTF-8
# to get readable text.
# The `with` block closes the HTTP response once the body has been read.
with urlopen("https://mofanpy.com/static/scraping/table.html") as response:
    html = response.read().decode('utf-8')

soup = BeautifulSoup(html, 'lxml')

# Walk the direct children of the course table; the header row is included.
course_table = soup.find("table", attrs={"id": "course-list"})
for item in course_table.children:
    print(item)



<tr>
<th>
			分类
		</th><th>
			名字
		</th><th>
			时长
		</th><th>
			预览
		</th>
</tr>


<tr class="ml" id="course1">
<td>
			机器学习
		</td><td>
<a href="/tutorials/machine-learning/tensorflow/">
				Tensorflow 神经网络</a>
</td><td>
			2:00
		</td><td>
<img src="/static/img/course_cover/tf.jpg"/>
</td>
</tr>


<tr class="ml" id="course2">
<td>
			机器学习
		</td><td>
<a href="/tutorials/machine-learning/reinforcement-learning/">
				强化学习</a>
</td><td>
			5:00
		</td><td>
<img src="/static/img/course_cover/rl.jpg"/>
</td>
</tr>


<tr class="data" id="course3">
<td>
			数据处理
		</td><td>
<a href="/tutorials/data-manipulation/scraping/">
				爬虫</a>
</td><td>
			3:00
		</td><td>
<img src="/static/img/course_cover/scraping.jpg"/>
</td>
</tr>




In [13]:
print("-------------------------")
# Skip the header row: start at the table's first <tr> and iterate only over
# the nodes that follow it.
course_table = soup.find("table", attrs={"id": "course-list"})
header_row = course_table.tr
for item in header_row.next_siblings:
    print(item)


-------------------------


<tr class="ml" id="course1">
<td>
			机器学习
		</td><td>
<a href="/tutorials/machine-learning/tensorflow/">
				Tensorflow 神经网络</a>
</td><td>
			2:00
		</td><td>
<img src="/static/img/course_cover/tf.jpg"/>
</td>
</tr>


<tr class="ml" id="course2">
<td>
			机器学习
		</td><td>
<a href="/tutorials/machine-learning/reinforcement-learning/">
				强化学习</a>
</td><td>
			5:00
		</td><td>
<img src="/static/img/course_cover/rl.jpg"/>
</td>
</tr>


<tr class="data" id="course3">
<td>
			数据处理
		</td><td>
<a href="/tutorials/data-manipulation/scraping/">
				爬虫</a>
</td><td>
			3:00
		</td><td>
<img src="/static/img/course_cover/scraping.jpg"/>
</td>
</tr>




In [16]:
print("-------------------------")
# Locate the cover image by its src, climb to the element that encloses it,
# then step back one sibling and print that sibling's text.
cover_img = soup.find("img", attrs={"src": "/static/img/course_cover/scraping.jpg"})
image_cell = cover_img.parent
print(image_cell.previous_sibling.get_text())

-------------------------

			3:00
		
