In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

# Web Scraper

In [3]:
from bs4 import BeautifulSoup
import requests

The only required input is the URL of the page to scrape.

In [4]:
url = 'https://www.britannica.com/topic/list-of-state-capitals-in-the-United-States-2119210'

# A timeout keeps the cell from hanging forever if the server never answers,
# and raise_for_status() stops here on a 4xx/5xx reply instead of silently
# parsing an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup pick
# whichever HTML parser happens to be installed, so results can vary by machine.
soup = BeautifulSoup(page.text, 'html.parser')

# Preview only the start of the document; printing the whole page floods the output.
print(str(soup)[:1000])

<!DOCTYPE html>
<html class="topic-desktop ui-unknown0 ui-unknown" lang="en">
<head prefix="og: https://ogp.me/ns# fb: https://ogp.me/ns/fb#">
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="https://cdn.britannica.com/mendel-resources/3-124" rel="dns-prefetch"/>
<link href="https://cdn.britannica.com/mendel-resources/3-124" rel="preconnect"/>
<link as="script" href="https://www.googletagservices.com/tag/js/gpt.js" rel="preload"/>
<link href="/favicon.png" rel="icon"/>
<meta content="This is a list of the cities that are state capitals in the United States, ordered alphabetically by state. The list also provides the most recent U.S. census population for each city as well as an estimated population. (This list does not include the capital of the United States, Washington, D.C.)" name="description"/>
<meta content="list of state capitals in the United States, e

Find all tables on the page. This particular page contains only one, so it's simple.

In [5]:
# List every <table> element found in the parsed page.
soup.find_all('table')

[<table> <thead> <tr> <th>state</th> <th>capital</th> <th>population of capital: census</th> <th>population of capital: estimated</th> </tr> </thead> <tbody> <tr> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Alabama-state">Alabama</a></td> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Montgomery-Alabama">Montgomery</a></td> <td>(2020) 200,603</td> <td>(2021 est.) 198,665</td> </tr> <tr> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Alaska">Alaska</a></td> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Juneau">Juneau</a></td> <td>(2020) 32,255</td> <td>(2021 est.) 31,973</td> </tr> <tr> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Arizona-state">Arizona</a></td> <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Phoenix

Let's first get just the column titles (the header cells) of the table.

*Note: `<th>` tag defines a header cell in an HTML table*

In [6]:
# Collect every <th> (header cell) element; with a single table on the page
# these are that table's column headers.
titles = soup.find_all('th')
titles

[<th>state</th>,
 <th>capital</th>,
 <th>population of capital: census</th>,
 <th>population of capital: estimated</th>]

Since we do not need the tags, let's clean up the data.

In [7]:
# Keep only the text inside each header cell, discarding the surrounding <th> tag.
titles_list = [header_cell.get_text() for header_cell in titles]
titles_list

['state',
 'capital',
 'population of capital: census',
 'population of capital: estimated']

If the output still contains newlines or other unwanted characters, you can clean it further using, for example, `.strip()`.

Next, create a dataframe

In [8]:
import pandas as pd

# Start with an empty frame whose columns are the scraped header texts;
# the data rows are filled in further down.
df = pd.DataFrame(columns = titles_list)
df

Unnamed: 0,state,capital,population of capital: census,population of capital: estimated


Let's scrape the remaining data and fill this table!

In [9]:
# Every <tr> is one table row; the count includes the header row as well as
# the data rows.
rows = soup.find_all('tr')
len(rows)

51

The data we are interested in are inside the **td** tags.

*Note: `<td>` tag defines a standard data cell in an HTML table.*

In [10]:
# For each <tr>, collect its <td> cells. Both versions below build the same
# list of per-row cell lists; the explicit loop is kept (commented out) only
# for comparison with the comprehension.

# -- Long version --
# row_data = []
# for row in rows:
#   row_data.append(row.find_all('td'))

# -- Short version --
# A row that has no <td> cells (such as the header row, which uses <th>)
# yields an empty list.
row_data = [row.find_all('td') for row in rows]
row_data

[[],
 [<td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Alabama-state">Alabama</a></td>,
  <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Montgomery-Alabama">Montgomery</a></td>,
  <td>(2020) 200,603</td>,
  <td>(2021 est.) 198,665</td>],
 [<td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Alaska">Alaska</a></td>,
  <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Juneau">Juneau</a></td>,
  <td>(2020) 32,255</td>,
  <td>(2021 est.) 31,973</td>],
 [<td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Arizona-state">Arizona</a></td>,
  <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Phoenix-Arizona">Phoenix</a></td>,
  <td>(2020) 1,608,139</td>,
  <td>(2021 est.) 1,624,569</td>],
 [<td><a class="md-crosslink" data-show-preview="true" hr

The first row collected is the header row: it contains `<th>` cells rather than `<td>` cells, hence the empty list.

In [11]:
# Entry for the first <tr>: it holds no <td> cells.
row_data[0]

[]

It turns out the first data row is actually at index 1:

In [12]:
# Entry for the second <tr>: the <td> cells of the first data row.
row_data[1]

[<td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Alabama-state">Alabama</a></td>,
 <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Montgomery-Alabama">Montgomery</a></td>,
 <td>(2020) 200,603</td>,
 <td>(2021 est.) 198,665</td>]

However, we only want the text portion.

In [13]:
# Print the text of the four cells of the first data row, one per line.
for column_index in range(4):
    print(row_data[1][column_index].text)

Alabama
Montgomery
(2020) 200,603
(2021 est.) 198,665


Therefore, more cleaning is necessary.

Add the remaining rows to the dataframe.

But does this code work?

In [14]:
# `row_data[1]` is a single row, i.e. a list of <td> cells, so one loop is enough.
# The previous nested loop walked the *children* of every cell; it printed the
# same text only because each cell here holds exactly one child node, and it
# would split a cell with mixed content across several lines.
for cell in row_data[1]:
    print(cell.text)

Alabama
Montgomery
(2020) 200,603
(2021 est.) 198,665


In [15]:
# All rows except the first entry (the header row, which has no <td> cells).
row_data[1:]

[[<td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Alabama-state">Alabama</a></td>,
  <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Montgomery-Alabama">Montgomery</a></td>,
  <td>(2020) 200,603</td>,
  <td>(2021 est.) 198,665</td>],
 [<td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Alaska">Alaska</a></td>,
  <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Juneau">Juneau</a></td>,
  <td>(2020) 32,255</td>,
  <td>(2021 est.) 31,973</td>],
 [<td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Arizona-state">Arizona</a></td>,
  <td><a class="md-crosslink" data-show-preview="true" href="https://www.britannica.com/place/Phoenix-Arizona">Phoenix</a></td>,
  <td>(2020) 1,608,139</td>,
  <td>(2021 est.) 1,624,569</td>],
 [<td><a class="md-crosslink" data-show-preview="true" href="h

In [28]:
# Build the whole table in one shot instead of appending with `df.loc[len(df)]`:
# re-running the old loop appended every row again (duplicates), and growing a
# DataFrame one row at a time gets slower as it grows.
# `row_data[0]` is the header row (no <td> cells), hence the [1:]; `if cells`
# additionally skips any other row that has no data cells.
records = [
    [cell.text for cell in cells]
    for cells in row_data[1:]
    if cells
]
df = pd.DataFrame(records, columns=titles_list)

In [30]:
# Do not drop label 0: the header row was already skipped via `row_data[1:]`,
# so on a fresh kernel row 0 is real data (Alabama) and dropping it loses a state
# (and a second run would raise KeyError because label 0 is gone).
# Renumbering from 1 gives the same 1-based display and is safe to re-run.
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)
df

Unnamed: 0,state,capital,population of capital: census,population of capital: estimated
1,Alabama,Montgomery,"(2020) 200,603","(2021 est.) 198,665"
2,Alaska,Juneau,"(2020) 32,255","(2021 est.) 31,973"
3,Arizona,Phoenix,"(2020) 1,608,139","(2021 est.) 1,624,569"
4,Arkansas,Little Rock,"(2020) 202,591","(2021 est.) 201,998"
5,California,Sacramento,"(2020) 524,943","(2021 est.) 525,041"
6,Colorado,Denver,"(2020) 715,522","(2021 est.) 711,463"
7,Connecticut,Hartford,"(2020) 121,054","(2021 est.) 120,576"
8,Delaware,Dover,"(2020) 39,403","(2021 est.) 38,992"
9,Florida,Tallahassee,"(2020) 196,068","(2021 est.) 197,102"
10,Georgia,Atlanta,"(2020) 498,715","(2021 est.) 496,461"


TODO:

- Scrape another table from Wikipedia
- Generate a new table/tables using dataframe
- Feel free to use other html tags
- Clean & preprocess

---

Other websites (for instance)
- https://www.timesjobs.com/
- https://www.tripadvisor.com/