In [1]:
import requests 
from bs4 import BeautifulSoup 
import pandas as pd

__Send a get request__

In [3]:
result = requests.get('https://www.scrapethissite.com/pages/')

In [4]:
result

<Response [200]>

In [6]:
result.url

'https://www.scrapethissite.com/pages/'

__content__

In [8]:
src = result.content

In [14]:
src

b'<!doctype html>\n<html lang="en">\n  <head>\n    <meta charset="utf-8">\n    <title>Learn Web Scraping | Scrape This Site | A public sandbox for learning web scraping</title>\n    <link rel="icon" type="image/png" href="/static/images/scraper-icon.png" />\n\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <meta name="description" content="Here are some practice pages you can scrape.">\n\n    <link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" rel="stylesheet" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" crossorigin="anonymous">\n    <link href=\'https://fonts.googleapis.com/css?family=Lato:400,700\' rel=\'stylesheet\' type=\'text/css\'>\n    <link rel="stylesheet" type="text/css" href="/static/css/styles.css">\n\n    \n\n  </head>\n\n  <body>\n    <nav id="site-nav">\n            <div class="container">\n    

__parse__

In [11]:
soup = BeautifulSoup(src, 'lxml')

In [13]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Learn Web Scraping | Scrape This Site | A public sandbox for learning web scraping
  </title>
  <link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="Here are some practice pages you can scrape." name="description"/>
  <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
  <link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
 </head>
 <body>
  <nav id="site-nav">
   <div class="container">
    <div class="col-md-12">
     <ul class="nav nav-tabs">
  

In [15]:
# title
soup.title

<title>Learn Web Scraping | Scrape This Site | A public sandbox for learning web scraping</title>

In [16]:
soup.title.text

'Learn Web Scraping | Scrape This Site | A public sandbox for learning web scraping'

In [17]:
# first link tag
soup.a

<a class="nav-link hidden-sm hidden-xs" href="/">
<img id="nav-logo" src="/static/images/scraper-icon.png"/>
                                Scrape This Site
                            </a>

In [19]:
first_link = soup.find('a')

In [26]:
first_link.text.strip()

'Scrape This Site'

In [22]:
# attrs
first_link.attrs

{'href': '/', 'class': ['nav-link', 'hidden-sm', 'hidden-xs']}

In [27]:
# to extract image-src inside first_link
first_link.img

<img id="nav-logo" src="/static/images/scraper-icon.png"/>

In [29]:
first_link.find('img')['src']

'/static/images/scraper-icon.png'

__Find an elment using attrs__

In [32]:
soup.find(name='h3', attrs={'class': 'page-title'})

<h3 class="page-title">
<a href="/pages/simple/">Countries of the World: A Simple Example</a>
</h3>

In [33]:
soup.find(name='h3', attrs={'class': 'page-title'}).text

'\nCountries of the World: A Simple Example\n'

In [35]:
# find all h3 tags
headers = soup.find_all(name='h3', attrs={'class': 'page-title'})

In [36]:
for header in headers:
    print(header)

<h3 class="page-title">
<a href="/pages/simple/">Countries of the World: A Simple Example</a>
</h3>
<h3 class="page-title">
<a href="/pages/forms/">Hockey Teams: Forms, Searching and Pagination</a>
</h3>
<h3 class="page-title">
<a href="/pages/ajax-javascript/">Oscar Winning Films: AJAX and Javascript</a>
</h3>
<h3 class="page-title">
<a href="/pages/frames/">Turtles All the Way Down: Frames &amp; iFrames</a>
</h3>
<h3 class="page-title">
<a href="/pages/advanced/">Advanced Topics: Real World Challenges You'll Encounter</a>
</h3>


In [38]:
for header in headers:
    print(header.text.strip())

Countries of the World: A Simple Example
Hockey Teams: Forms, Searching and Pagination
Oscar Winning Films: AJAX and Javascript
Turtles All the Way Down: Frames & iFrames
Advanced Topics: Real World Challenges You'll Encounter


In [40]:
# links tags inside the headers
for header in headers:
    print(header.a)

<a href="/pages/simple/">Countries of the World: A Simple Example</a>
<a href="/pages/forms/">Hockey Teams: Forms, Searching and Pagination</a>
<a href="/pages/ajax-javascript/">Oscar Winning Films: AJAX and Javascript</a>
<a href="/pages/frames/">Turtles All the Way Down: Frames &amp; iFrames</a>
<a href="/pages/advanced/">Advanced Topics: Real World Challenges You'll Encounter</a>


In [43]:
# URL inside the headers
for header in headers:
    print(header.a['href'])

/pages/simple/
/pages/forms/
/pages/ajax-javascript/
/pages/frames/
/pages/advanced/


__parags__

In [45]:
parags = soup.find_all('p', {'class': 'lead session-desc'})

In [46]:
for p in parags:
    print(p)

<p class="lead session-desc">
                                A single page that lists information about all the countries in the world. Good for those just get started with web scraping.
                            </p>
<p class="lead session-desc">
                                Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.
                            </p>
<p class="lead session-desc">
                                Click through a bunch of great films. Learn how content is added to the page asynchronously with Javascript and how you can scrape it.
                            </p>
<p class="lead session-desc">
                                Some older sites might still use frames to break up thier pages. Modern ones might be using iFrames to expose data. Learn about turtles as you scrape content inside frames.
                            </p>
<p class="lead session-desc">
                      

In [49]:
# content (desc)

for p in parags:
    print(p.text.strip())
    print('*' * 20)

A single page that lists information about all the countries in the world. Good for those just get started with web scraping.
********************
Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components.
********************
Click through a bunch of great films. Learn how content is added to the page asynchronously with Javascript and how you can scrape it.
********************
Some older sites might still use frames to break up thier pages. Modern ones might be using iFrames to expose data. Learn about turtles as you scrape content inside frames.
********************
Scraping real websites, you're likely run into a number of common gotchas. Get practice with spoofing headers, handling logins & session cookies, finding CSRF tokens, and other common network errors.
********************


### Countries

In [51]:
result = requests.get('https://www.scrapethissite.com/pages/simple/')

In [52]:
result

<Response [200]>

In [55]:
# page source
src = result.content

In [57]:
# parse
soup = BeautifulSoup(src, 'lxml')

In [58]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping
  </title>
  <link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <meta content="A single page that lists information about all the countries in the world. Good for those just get started with web scraping." name="description"/>
  <link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
  <link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
  <meta content="noindex

In [59]:
# title
soup.title

<title>Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping</title>

In [60]:
soup.title.text

'Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping'

__country name__

In [61]:
soup.find('h3', {'class': 'country-name'})

<h3 class="country-name">
<i class="flag-icon flag-icon-ad"></i>
                            Andorra
                        </h3>

In [63]:
soup.find('h3', {'class': 'country-name'}).text.strip()

'Andorra'

In [65]:
# all names
names = soup.find_all('h3', {'class': 'country-name'})

In [68]:
for name in names:
    print(name.text.strip())

Andorra
United Arab Emirates
Afghanistan
Antigua and Barbuda
Anguilla
Albania
Armenia
Angola
Antarctica
Argentina
American Samoa
Austria
Australia
Aruba
Åland
Azerbaijan
Bosnia and Herzegovina
Barbados
Bangladesh
Belgium
Burkina Faso
Bulgaria
Bahrain
Burundi
Benin
Saint Barthélemy
Bermuda
Brunei
Bolivia
Bonaire
Brazil
Bahamas
Bhutan
Bouvet Island
Botswana
Belarus
Belize
Canada
Cocos [Keeling] Islands
Democratic Republic of the Congo
Central African Republic
Republic of the Congo
Switzerland
Ivory Coast
Cook Islands
Chile
Cameroon
China
Colombia
Costa Rica
Cuba
Cape Verde
Curacao
Christmas Island
Cyprus
Czech Republic
Germany
Djibouti
Denmark
Dominica
Dominican Republic
Algeria
Ecuador
Estonia
Egypt
Western Sahara
Eritrea
Spain
Ethiopia
Finland
Fiji
Falkland Islands
Micronesia
Faroe Islands
France
Gabon
United Kingdom
Grenada
Georgia
French Guiana
Guernsey
Ghana
Gibraltar
Greenland
Gambia
Guinea
Guadeloupe
Equatorial Guinea
Greece
South Georgia and the South Sandwich Islands
Guatemala
G

__dict__

In [69]:
countries_info = {'name': []}

In [70]:
countries_info

{'name': []}

In [73]:
for name in names:
    name = name.text.strip()  # clean --> text
    countries_info['name'].append(name)  # append to name-key

In [75]:
countries_info['name']

['Andorra',
 'United Arab Emirates',
 'Afghanistan',
 'Antigua and Barbuda',
 'Anguilla',
 'Albania',
 'Armenia',
 'Angola',
 'Antarctica',
 'Argentina',
 'American Samoa',
 'Austria',
 'Australia',
 'Aruba',
 'Åland',
 'Azerbaijan',
 'Bosnia and Herzegovina',
 'Barbados',
 'Bangladesh',
 'Belgium',
 'Burkina Faso',
 'Bulgaria',
 'Bahrain',
 'Burundi',
 'Benin',
 'Saint Barthélemy',
 'Bermuda',
 'Brunei',
 'Bolivia',
 'Bonaire',
 'Brazil',
 'Bahamas',
 'Bhutan',
 'Bouvet Island',
 'Botswana',
 'Belarus',
 'Belize',
 'Canada',
 'Cocos [Keeling] Islands',
 'Democratic Republic of the Congo',
 'Central African Republic',
 'Republic of the Congo',
 'Switzerland',
 'Ivory Coast',
 'Cook Islands',
 'Chile',
 'Cameroon',
 'China',
 'Colombia',
 'Costa Rica',
 'Cuba',
 'Cape Verde',
 'Curacao',
 'Christmas Island',
 'Cyprus',
 'Czech Republic',
 'Germany',
 'Djibouti',
 'Denmark',
 'Dominica',
 'Dominican Republic',
 'Algeria',
 'Ecuador',
 'Estonia',
 'Egypt',
 'Western Sahara',
 'Eritrea',
 

__info_keys__

In [76]:
soup.find('div', {'class': 'country-info'})

<div class="country-info">
<strong>Capital:</strong> <span class="country-capital">Andorra la Vella</span><br/>
<strong>Population:</strong> <span class="country-population">84000</span><br/>
<strong>Area (km<sup>2</sup>):</strong> <span class="country-area">468.0</span><br/>
</div>

In [77]:
first_country_info = soup.find('div', {'class': 'country-info'})

In [78]:
first_country_info.strong

<strong>Capital:</strong>

In [80]:
first_country_info.strong.text.replace(':', '')

'Capital'

In [82]:
info_keys = first_country_info.find_all('strong')

In [83]:
for key in info_keys:
    print(key)

<strong>Capital:</strong>
<strong>Population:</strong>
<strong>Area (km<sup>2</sup>):</strong>


In [89]:
for key in info_keys:
    print(key.text.replace(':', ''))

Capital
Population
Area (km2)


In [85]:
countries_info.keys()

dict_keys(['name'])

In [86]:
# add the other info_keys to the countries dict
for key in info_keys:
    key = key.text.replace(':', '')
    countries_info[key] = []  # append the info_keys to dict

In [90]:
countries_info.keys()

dict_keys(['name', 'Capital', 'Population', 'Area (km2)'])

__Capital__

In [92]:
soup.find('span', {'class': 'country-capital'}).text

'Andorra la Vella'

In [94]:
capitals = soup.find_all('span', {'class': 'country-capital'})

In [111]:
for capital in capitals:
    print(capital.text)

Andorra la Vella
Abu Dhabi
Kabul
St. John's
The Valley
Tirana
Yerevan
Luanda
None
Buenos Aires
Pago Pago
Vienna
Canberra
Oranjestad
Mariehamn
Baku
Sarajevo
Bridgetown
Dhaka
Brussels
Ouagadougou
Sofia
Manama
Bujumbura
Porto-Novo
Gustavia
Hamilton
Bandar Seri Begawan
Sucre
Kralendijk
Brasília
Nassau
Thimphu
None
Gaborone
Minsk
Belmopan
Ottawa
West Island
Kinshasa
Bangui
Brazzaville
Bern
Yamoussoukro
Avarua
Santiago
Yaoundé
Beijing
Bogotá
San José
Havana
Praia
Willemstad
Flying Fish Cove
Nicosia
Prague
Berlin
Djibouti
Copenhagen
Roseau
Santo Domingo
Algiers
Quito
Tallinn
Cairo
Laâyoune / El Aaiún
Asmara
Madrid
Addis Ababa
Helsinki
Suva
Stanley
Palikir
Tórshavn
Paris
Libreville
London
St. George's
Tbilisi
Cayenne
St Peter Port
Accra
Gibraltar
Nuuk
Bathurst
Conakry
Basse-Terre
Malabo
Athens
Grytviken
Guatemala City
Hagåtña
Bissau
Georgetown
Hong Kong
None
Tegucigalpa
Zagreb
Port-au-Prince
Budapest
Jakarta
Dublin
None
Douglas
New Delhi
None
Baghdad
Tehran
Reykjavik
Rome
Saint Helier
Kingston

In [98]:
for capital in capitals:
    capital = capital.text
    countries_info['Capital'].append(capital)

In [99]:
countries_info['Capital']

['Andorra la Vella',
 'Abu Dhabi',
 'Kabul',
 "St. John's",
 'The Valley',
 'Tirana',
 'Yerevan',
 'Luanda',
 'None',
 'Buenos Aires',
 'Pago Pago',
 'Vienna',
 'Canberra',
 'Oranjestad',
 'Mariehamn',
 'Baku',
 'Sarajevo',
 'Bridgetown',
 'Dhaka',
 'Brussels',
 'Ouagadougou',
 'Sofia',
 'Manama',
 'Bujumbura',
 'Porto-Novo',
 'Gustavia',
 'Hamilton',
 'Bandar Seri Begawan',
 'Sucre',
 'Kralendijk',
 'Brasília',
 'Nassau',
 'Thimphu',
 'None',
 'Gaborone',
 'Minsk',
 'Belmopan',
 'Ottawa',
 'West Island',
 'Kinshasa',
 'Bangui',
 'Brazzaville',
 'Bern',
 'Yamoussoukro',
 'Avarua',
 'Santiago',
 'Yaoundé',
 'Beijing',
 'Bogotá',
 'San José',
 'Havana',
 'Praia',
 'Willemstad',
 'Flying Fish Cove',
 'Nicosia',
 'Prague',
 'Berlin',
 'Djibouti',
 'Copenhagen',
 'Roseau',
 'Santo Domingo',
 'Algiers',
 'Quito',
 'Tallinn',
 'Cairo',
 'Laâyoune / El Aaiún',
 'Asmara',
 'Madrid',
 'Addis Ababa',
 'Helsinki',
 'Suva',
 'Stanley',
 'Palikir',
 'Tórshavn',
 'Paris',
 'Libreville',
 'London',
 "

__population__

In [100]:
soup.find('span', {'class': 'country-population'})

<span class="country-population">84000</span>

In [102]:
float(soup.find('span', {'class': 'country-population'}).text)

84000.0

In [103]:
# all populations
pops = soup.find_all('span', {'class': 'country-population'})

In [114]:
for pop in pops:
    print(float(pop.text))

84000.0
4975593.0
29121286.0
86754.0
13254.0
2986952.0
2968000.0
13068161.0
0.0
41343201.0
57881.0
8205000.0
21515754.0
71566.0
26711.0
8303512.0
4590000.0
285653.0
156118464.0
10403000.0
16241811.0
7148785.0
738004.0
9863117.0
9056010.0
8450.0
65365.0
395027.0
9947418.0
18012.0
201103330.0
301790.0
699847.0
0.0
2029307.0
9685000.0
314522.0
33679000.0
628.0
70916439.0
4844927.0
3039126.0
7581000.0
21058798.0
21388.0
16746491.0
19294149.0
1330044000.0
47790000.0
4516220.0
11423000.0
508659.0
141766.0
1500.0
1102677.0
10476000.0
81802257.0
740528.0
5484000.0
72813.0
9823821.0
34586184.0
14790608.0
1291170.0
80471869.0
273008.0
5792984.0
46505963.0
88013491.0
5244000.0
875983.0
2638.0
107708.0
48228.0
64768389.0
1545255.0
62348447.0
107818.0
4630000.0
195506.0
65228.0
24339838.0
27884.0
56375.0
1593256.0
10324025.0
443000.0
1014999.0
11000000.0
30.0
13550440.0
159358.0
1565126.0
748486.0
6898686.0
0.0
7989415.0
4491000.0
9648924.0
9982000.0
242968342.0
4622917.0
7353985.0
75049.0
11731080

In [107]:
countries_info.keys()

dict_keys(['name', 'Capital', 'Population', 'Area (km2)'])

In [108]:
for pop in pops:
    pop = float(pop.text)
    countries_info['Population'].append(pop)

In [109]:
countries_info['Population']

[84000.0,
 4975593.0,
 29121286.0,
 86754.0,
 13254.0,
 2986952.0,
 2968000.0,
 13068161.0,
 0.0,
 41343201.0,
 57881.0,
 8205000.0,
 21515754.0,
 71566.0,
 26711.0,
 8303512.0,
 4590000.0,
 285653.0,
 156118464.0,
 10403000.0,
 16241811.0,
 7148785.0,
 738004.0,
 9863117.0,
 9056010.0,
 8450.0,
 65365.0,
 395027.0,
 9947418.0,
 18012.0,
 201103330.0,
 301790.0,
 699847.0,
 0.0,
 2029307.0,
 9685000.0,
 314522.0,
 33679000.0,
 628.0,
 70916439.0,
 4844927.0,
 3039126.0,
 7581000.0,
 21058798.0,
 21388.0,
 16746491.0,
 19294149.0,
 1330044000.0,
 47790000.0,
 4516220.0,
 11423000.0,
 508659.0,
 141766.0,
 1500.0,
 1102677.0,
 10476000.0,
 81802257.0,
 740528.0,
 5484000.0,
 72813.0,
 9823821.0,
 34586184.0,
 14790608.0,
 1291170.0,
 80471869.0,
 273008.0,
 5792984.0,
 46505963.0,
 88013491.0,
 5244000.0,
 875983.0,
 2638.0,
 107708.0,
 48228.0,
 64768389.0,
 1545255.0,
 62348447.0,
 107818.0,
 4630000.0,
 195506.0,
 65228.0,
 24339838.0,
 27884.0,
 56375.0,
 1593256.0,
 10324025.0,
 443

__Area__

In [115]:
soup.find('span', {'class': 'country-area'})

<span class="country-area">468.0</span>

In [117]:
float(soup.find('span', {'class': 'country-area'}).text)

468.0

In [120]:
# all areas
areas = soup.find_all('span', {'class': 'country-area'})

In [121]:
for area in areas:
    print(area.text)

468.0
82880.0
647500.0
443.0
102.0
28748.0
29800.0
1246700.0
1.4E7
2766890.0
199.0
83858.0
7686850.0
193.0
1580.0
86600.0
51129.0
431.0
144000.0
30510.0
274200.0
110910.0
665.0
27830.0
112620.0
21.0
53.0
5770.0
1098580.0
328.0
8511965.0
13940.0
47000.0
49.0
600370.0
207600.0
22966.0
9984670.0
14.0
2345410.0
622984.0
342000.0
41290.0
322460.0
240.0
756950.0
475440.0
9596960.0
1138910.0
51100.0
110860.0
4033.0
444.0
135.0
9250.0
78866.0
357021.0
23000.0
43094.0
754.0
48730.0
2381740.0
283560.0
45226.0
1001450.0
266000.0
121320.0
504782.0
1127127.0
337030.0
18270.0
12173.0
702.0
1399.0
547030.0
267667.0
244820.0
344.0
69700.0
91000.0
78.0
239460.0
6.5
2166086.0
11300.0
245857.0
1780.0
28051.0
131940.0
3903.0
108890.0
549.0
36120.0
214970.0
1092.0
412.0
112090.0
56542.0
27750.0
93030.0
1919440.0
70280.0
20770.0
572.0
3287590.0
60.0
437072.0
1648000.0
103000.0
301230.0
116.0
10991.0
92300.0
377835.0
582650.0
198500.0
181040.0
811.0
2170.0
261.0
120540.0
98480.0
17820.0
262.0
2717300.0
23680

In [124]:
for area in areas:
    area = area.text
    countries_info['Area (km2)'].append(area)

In [122]:
countries_info.keys()

dict_keys(['name', 'Capital', 'Population', 'Area (km2)'])

In [125]:
countries_info['Area (km2)']

['468.0',
 '82880.0',
 '647500.0',
 '443.0',
 '102.0',
 '28748.0',
 '29800.0',
 '1246700.0',
 '1.4E7',
 '2766890.0',
 '199.0',
 '83858.0',
 '7686850.0',
 '193.0',
 '1580.0',
 '86600.0',
 '51129.0',
 '431.0',
 '144000.0',
 '30510.0',
 '274200.0',
 '110910.0',
 '665.0',
 '27830.0',
 '112620.0',
 '21.0',
 '53.0',
 '5770.0',
 '1098580.0',
 '328.0',
 '8511965.0',
 '13940.0',
 '47000.0',
 '49.0',
 '600370.0',
 '207600.0',
 '22966.0',
 '9984670.0',
 '14.0',
 '2345410.0',
 '622984.0',
 '342000.0',
 '41290.0',
 '322460.0',
 '240.0',
 '756950.0',
 '475440.0',
 '9596960.0',
 '1138910.0',
 '51100.0',
 '110860.0',
 '4033.0',
 '444.0',
 '135.0',
 '9250.0',
 '78866.0',
 '357021.0',
 '23000.0',
 '43094.0',
 '754.0',
 '48730.0',
 '2381740.0',
 '283560.0',
 '45226.0',
 '1001450.0',
 '266000.0',
 '121320.0',
 '504782.0',
 '1127127.0',
 '337030.0',
 '18270.0',
 '12173.0',
 '702.0',
 '1399.0',
 '547030.0',
 '267667.0',
 '244820.0',
 '344.0',
 '69700.0',
 '91000.0',
 '78.0',
 '239460.0',
 '6.5',
 '2166086.0

In [126]:
countries_info

{'name': ['Andorra',
  'United Arab Emirates',
  'Afghanistan',
  'Antigua and Barbuda',
  'Anguilla',
  'Albania',
  'Armenia',
  'Angola',
  'Antarctica',
  'Argentina',
  'American Samoa',
  'Austria',
  'Australia',
  'Aruba',
  'Åland',
  'Azerbaijan',
  'Bosnia and Herzegovina',
  'Barbados',
  'Bangladesh',
  'Belgium',
  'Burkina Faso',
  'Bulgaria',
  'Bahrain',
  'Burundi',
  'Benin',
  'Saint Barthélemy',
  'Bermuda',
  'Brunei',
  'Bolivia',
  'Bonaire',
  'Brazil',
  'Bahamas',
  'Bhutan',
  'Bouvet Island',
  'Botswana',
  'Belarus',
  'Belize',
  'Canada',
  'Cocos [Keeling] Islands',
  'Democratic Republic of the Congo',
  'Central African Republic',
  'Republic of the Congo',
  'Switzerland',
  'Ivory Coast',
  'Cook Islands',
  'Chile',
  'Cameroon',
  'China',
  'Colombia',
  'Costa Rica',
  'Cuba',
  'Cape Verde',
  'Curacao',
  'Christmas Island',
  'Cyprus',
  'Czech Republic',
  'Germany',
  'Djibouti',
  'Denmark',
  'Dominica',
  'Dominican Republic',
  'Algeri

__Make a df__ 

In [128]:
df = pd.DataFrame(countries_info)

In [129]:
df

Unnamed: 0,name,Capital,Population,Area (km2)
0,Andorra,Andorra la Vella,84000.0,468.0
1,United Arab Emirates,Abu Dhabi,4975593.0,82880.0
2,Afghanistan,Kabul,29121286.0,647500.0
3,Antigua and Barbuda,St. John's,86754.0,443.0
4,Anguilla,The Valley,13254.0,102.0
...,...,...,...,...
245,Yemen,Sanaa,23495361.0,527970.0
246,Mayotte,Mamoudzou,159042.0,374.0
247,South Africa,Pretoria,49000000.0,1219912.0
248,Zambia,Lusaka,13460305.0,752614.0


In [130]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        250 non-null    object 
 1   Capital     250 non-null    object 
 2   Population  250 non-null    float64
 3   Area (km2)  250 non-null    object 
dtypes: float64(1), object(3)
memory usage: 7.9+ KB


In [132]:
df.rename(columns={'name': 'Name'}, inplace=True)

In [133]:
df.head(3)

Unnamed: 0,Name,Capital,Population,Area (km2)
0,Andorra,Andorra la Vella,84000.0,468.0
1,United Arab Emirates,Abu Dhabi,4975593.0,82880.0
2,Afghanistan,Kabul,29121286.0,647500.0


In [134]:
import matplotlib.pyplot as plt

In [136]:
# plt.scatter(df['Area (km2)'], df.Population)

### Table

In [139]:
result = requests.get('https://www.scrapethissite.com/pages/forms/?page_num=1')

In [141]:
src = result.content

In [142]:
soup = BeautifulSoup(src, 'lxml')

In [143]:
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Hockey Teams: Forms, Searching and Pagination | Scrape This Site | A public sandbox for learning web scraping</title>
<link href="/static/images/scraper-icon.png" rel="icon" type="image/png"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<meta content="Browse through a database of NHL team stats since 1990. Practice building a scraper that handles common website interface components." name="description"/>
<link crossorigin="anonymous" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css" integrity="sha256-MfvZlkHCEqatNoGiOXveE8FIwMzZg4W85qfrfIFBfYc= sha512-dTfge/zgoMYpP7QbHy4gWMEGsbsdZeCXz7irItjcC3sPUFtf0kuFbDz/ixG7ArTxmDjLXDmezHubeNikyKGVyQ==" rel="stylesheet"/>
<link href="https://fonts.googleapis.com/css?family=Lato:400,700" rel="stylesheet" type="text/css"/>
<link href="/static/css/styles.css" rel="stylesheet" type="text/css"/>
<meta content="noindex" name="robot

__th__

In [145]:
ths = soup.find_all('th')

In [146]:
# table headers tags
for th in ths:
    print(th)

<th>
                            Team Name
                        </th>
<th>
                            Year
                        </th>
<th>
                            Wins
                        </th>
<th>
                            Losses
                        </th>
<th>
                            OT Losses
                        </th>
<th>
                            Win %
                        </th>
<th>
                            Goals For (GF)
                        </th>
<th>
                            Goals Against (GA)
                        </th>
<th>
                            + / -
                        </th>


In [148]:
# table headers text
for th in ths:
    print(th.text.strip())

Team Name
Year
Wins
Losses
OT Losses
Win %
Goals For (GF)
Goals Against (GA)
+ / -


In [149]:
table_dict = {}

In [150]:
for th in ths:
    header_key = th.text.strip()
    table_dict[header_key] = []

In [151]:
table_dict

{'Team Name': [],
 'Year': [],
 'Wins': [],
 'Losses': [],
 'OT Losses': [],
 'Win %': [],
 'Goals For (GF)': [],
 'Goals Against (GA)': [],
 '+ / -': []}

__table structure in HTML__
.tr 
     .th
.trs
    .tds

In [154]:
rows = soup.find_all('tr')[1:]

In [157]:
first_row = rows[0]

In [160]:
for cell in first_row.find_all('td'):
    print(cell)

<td class="name">
                            Boston Bruins
                        </td>
<td class="year">
                            1990
                        </td>
<td class="wins">
                            44
                        </td>
<td class="losses">
                            24
                        </td>
<td class="ot-losses">
</td>
<td class="pct text-success">
                            0.55
                        </td>
<td class="gf">
                            299
                        </td>
<td class="ga">
                            264
                        </td>
<td class="diff text-success">
                            35
                        </td>


In [162]:
for cell in first_row.find_all('td'):
    print(cell.text.strip())

Boston Bruins
1990
44
24

0.55
299
264
35


In [164]:
# all rows
for row in rows:
    cells = row.find_all('td')  # all cells in row
    for cell in cells:
        print(cell.text.strip())

Boston Bruins
1990
44
24

0.55
299
264
35
Buffalo Sabres
1990
31
30

0.388
292
278
14
Calgary Flames
1990
46
26

0.575
344
263
81
Chicago Blackhawks
1990
49
23

0.613
284
211
73
Detroit Red Wings
1990
34
38

0.425
273
298
-25
Edmonton Oilers
1990
37
37

0.463
272
272
0
Hartford Whalers
1990
31
38

0.388
238
276
-38
Los Angeles Kings
1990
46
24

0.575
340
254
86
Minnesota North Stars
1990
27
39

0.338
256
266
-10
Montreal Canadiens
1990
39
30

0.487
273
249
24
New Jersey Devils
1990
32
33

0.4
272
264
8
New York Islanders
1990
25
45

0.312
223
290
-67
New York Rangers
1990
36
31

0.45
297
265
32
Philadelphia Flyers
1990
33
37

0.412
252
267
-15
Pittsburgh Penguins
1990
41
33

0.512
342
305
37
Quebec Nordiques
1990
16
50

0.2
236
354
-118
St. Louis Blues
1990
47
22

0.588
310
250
60
Toronto Maple Leafs
1990
23
46

0.287
241
318
-77
Vancouver Canucks
1990
28
43

0.35
243
315
-72
Washington Capitals
1990
37
36

0.463
258
258
0
Winnipeg Jets
1990
26
43

0.325
260
288
-28
Boston Bruins
1991
