# Web Scraping Multiple Pages

1. Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page: url ='https://en.wikipedia.org/wiki/Python'

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# 2. url: we start with the 'second' page. Show that you can start whenever you want
url = "https://en.wikipedia.org/wiki/Python"

In [3]:
# 3. download html with a get request
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [4]:
soup = BeautifulSoup(response.content, "html.parser")

In [5]:
# 4.2. check that the html code looks like it should
soup

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-typography-survey-disabled vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Python - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1

In [6]:
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-typography-survey-disabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Python - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-w

In [12]:
soup.select("a")

[<a class="mw-jump-link" href="#bodyContent">Jump to content</a>,
 <a accesskey="z" href="/wiki/Main_Page" title="Visit the main page [z]"><span>Main page</span></a>,
 <a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a>,
 <a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a>,
 <a accesskey="x" href="/wiki/Special:Random" title="Visit a randomly selected article [x]"><span>Random article</span></a>,
 <a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a>,
 <a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a>,
 <a href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&amp;utm_medium=sidebar&amp;utm_campaign=C13_en.wikipedia.org&amp;uselang=en" title="Support us by donating to the Wikimedia Foundation"><span>Donate</span></a>,
 <a href=

In [14]:
soup.select("a")[0]["href"]

'#bodyContent'

In [15]:
# get links:
for link in soup.find_all('a'):
    print(link['href'])

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Main_Page
/wiki/Special:Search
/w/index.php?title=Special:CreateAccount&returnto=Python
/w/index.php?title=Special:UserLogin&returnto=Python
/w/index.php?title=Special:CreateAccount&returnto=Python
/w/index.php?title=Special:UserLogin&returnto=Python
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
#
#Snakes
#Computing
#People
#Roller_coasters
#Vehicles
#Weaponry
#Other_uses
#See_also
https://af.wikipedia.org/wiki/Python
https://als.wikipedia.org/wiki/Python
https://ar.wikipedia.org/wiki/%D8%A8%D8%A7%D9%8A%D

In [22]:
# https://stackoverflow.com/questions/3075550/how-can-i-get-href-links-from-html-using-python

for link in soup.find_all('a'):
    href = link.get('href')
    if href and (href.startswith('http') or href.startswith('www')):
        print(href)

https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
https://af.wikipedia.org/wiki/Python
https://als.wikipedia.org/wiki/Python
https://ar.wikipedia.org/wiki/%D8%A8%D8%A7%D9%8A%D8%AB%D9%88%D9%86_(%D8%AA%D9%88%D8%B6%D9%8A%D8%AD)
https://az.wikipedia.org/wiki/Python_(d%C9%99qiql%C9%99%C5%9Fdirm%C9%99)
https://bn.wikipedia.org/wiki/%E0%A6%AA%E0%A6%BE%E0%A6%87%E0%A6%A5%E0%A6%A8_(%E0%A6%A6%E0%A7%8D%E0%A6%AC%E0%A7%8D%E0%A6%AF%E0%A6%B0%E0%A7%8D%E0%A6%A5%E0%A6%A4%E0%A6%BE_%E0%A6%A8%E0%A6%BF%E0%A6%B0%E0%A6%B8%E0%A6%A8)
https://be.wikipedia.org/wiki/Python
https://bg.wikipedia.org/wiki/%D0%9F%D0%B8%D1%82%D0%BE%D0%BD_(%D0%BF%D0%BE%D1%8F%D1%81%D0%BD%D0%B5%D0%BD%D0%B8%D0%B5)
https://cs.wikipedia.org/wiki/Python_(rozcestn%C3%ADk)
https://da.wikipedia.org/wiki/Python
https://de.wikipedia.org/wiki/Python
https://eo.wikipedia.org/wiki/Pitono_(apartigilo)
https://eu.wikipedia.org/wiki/Python_(argipena)
https://fa

2. Find the number of titles that have changed in the United States Code since its last release point: url = 'http://uscode.house.gov/download/download.shtml'

In [23]:
# 1. url:
url = "http://uscode.house.gov/download/download.shtml"

In [24]:
# 3. download html with a get request
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [25]:
soup = BeautifulSoup(response.content, "html.parser")

In [26]:
print(soup.prettify())

<?xml version='1.0' encoding='UTF-8' ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=8" http-equiv="X-UA-Compatible"/>
  <meta content="no-cache" http-equiv="pragma"/>
  <!-- HTTP 1.0 -->
  <meta content="no-cache,must-revalidate" http-equiv="cache-control"/>
  <!-- HTTP 1.1 -->
  <meta content="0" http-equiv="expires"/>
  <link href="/javax.faces.resource/favicon.ico.xhtml?ln=images" rel="shortcut icon"/>
  <link href="/javax.faces.resource/cssLayout.css.xhtml?ln=css" rel="stylesheet" type="text/css"/>
  <script src="/javax.faces.resource/jsf.js.xhtml?ln=javax.faces" type="text/javascript">
  </script>
  <link href="/javax.faces.resource/static.css.xhtml?ln=css" rel="stylesheet" type="text/css"/>
 </head>
 <body style="display:none;">
  <script src="/javax.faces.resour

In [27]:
# Titles in <b> bold </b> have been changed since the last release point.
# Ihave to output all titles in bold

In [37]:
bold_titles = soup.select('b')

for title in bold_titles:
    print(title.text)

bold
p
c
c


3. List all language names and number of related articles in the order they appear in wikipedia.org: url = 'https://www.wikipedia.org/'

In [43]:
# 1. url:
url = "https://www.wikipedia.org/"

In [44]:
# 2. download html with a get request
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [45]:
soup = BeautifulSoup(response.content, "html.parser")

In [46]:
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Wikipedia
  </title>
  <meta content="Wikipedia is a free online encyclopedia, created and edited by volunteers around the world and hosted by the Wikimedia Foundation." name="description"/>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)no-js(\s|$)/, "$1js-enabled$2" );
  </script>
  <meta content="initial-scale=1,user-scalable=yes" name="viewport"/>
  <link href="/static/apple-touch/wikipedia.png" rel="apple-touch-icon"/>
  <link href="/static/favicon/wikipedia.ico" rel="shortcut icon"/>
  <link href="//creativecommons.org/licenses/by-sa/4.0/" rel="license"/>
  <style>
   .sprite{background-image:linear-gradient(transparent,transparent),url(portal/wikipedia.org/assets/img/sprite-8bb90067.svg);background-repeat:no-repeat;display:inline-block;vertical-align:middle}.svg-Commons-logo_sister{background-position:0 0;width:47px;height:47px}.svg

In [58]:
language_div = soup.select_one('div.styled-select.no-js div.hide-arrow select')
if language_div:
    language_options = language_div.find_all('option')
    language_names = [option.get_text(strip=True) for option in language_options]
    print(language_names)


['Afrikaans', 'Polski', 'العربية', 'Asturianu', 'Azərbaycanca', 'Български', 'বাংলা', 'Беларуская', 'Català', 'Čeština', 'Cymraeg', 'Dansk', 'Deutsch', 'Eesti', 'Ελληνικά', 'English', 'Español', 'Esperanto', 'Euskara', 'فارسی', 'Français', 'Galego', '한국어', 'हिन्दी', 'Hrvatski', 'Bahasa Indonesia', 'Italiano', 'עברית', 'ქართული', 'Latina', 'Latviešu', 'Lietuvių', 'Magyar', 'Македонски', 'مصرى', 'Bahasa Melayu', 'Bahaso Minangkabau', 'Nederlands', '日本語', 'Norsk (bokmål)', 'Norsk (nynorsk)', 'Oʻzbekcha / Ўзбекча', 'Português', 'Қазақша / Qazaqşa / قازاقشا', 'Română', 'Simple English', 'Sinugboanong Binisaya', 'Slovenčina', 'Slovenščina', 'Српски / Srpski', 'Srpskohrvatski / Српскохрватски', 'Suomi', 'Svenska', 'தமிழ்', 'Татарча / Tatarça', 'ภาษาไทย', 'Тоҷикӣ', 'تۆرکجه', 'Türkçe', 'Українська', 'اردو', 'Tiếng Việt', 'Winaray', '中文', 'Русский', 'Нохчийн', 'Հայերեն', 'မြန်မာဘာသာ']
