<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [44]:
import csv
import requests
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException
from contextlib import closing

In [48]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

In [49]:
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)

In [50]:
def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [51]:
raw_html = simple_get('https://realpython.com/blog/')
len(raw_html)

418006

In [53]:
no_html = simple_get('https://realpython.com/blog/nope-not-gonna-find-it')
no_html is None

True

In [70]:
raw_html = simple_get('https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_by_immigrant_population')
html = BeautifulSoup(raw_html, 'html.parser')

In [71]:
table = html.find_all('table')[1]

In [72]:
table

<table class="wikitable sortable">
<tbody><tr>
<th rowspan="2">Country
</th>
<th colspan="2">Emigrants
</th>
<th rowspan="2">Notes
</th></tr>
<tr>
<th>Total
</th>
<th>Of nation<sup class="reference" id="cite_ref-6"><a href="#cite_note-6">[6]</a></sup>
</th></tr>
<tr>
<td><span class="flagicon"><img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/23px-Flag_of_Afghanistan.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/35px-Flag_of_Afghanistan.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/45px-Flag_of_Afghanistan.svg.png 2x" width="23"/> </span><a href="/wiki/Afghanistan" title="Afghanistan">Afghanistan</a></td>
<td align="right"><span data-sort-value="7006484311700000000♠">4,843,117</span></td>
<td>12.96%</td>
<td>
</td></tr>
<tr>
<td><span class="flagicon"><img alt="

In [76]:
rows = table.select('tbody > tr')


In [77]:
header = [th.text.rstrip() for th in rows[0].find_all('th')]
header

['Country', 'Emigrants', 'Notes']

In [79]:
 with open('output.csv', 'w') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        for row in rows[1:]:
            data = [th.text.rstrip() for th in row.find_all('td')]
            writer.writerow(data)

In [8]:
import csv
import requests
from bs4 import BeautifulSoup
import 


def scrape_data(url):

    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find_all('table')[1]

    rows = table.select('tbody > tr')

    header = [th.text.rstrip() for th in rows[0].find_all('th')]

    with open('output.csv', 'w') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        for row in rows[1:]:
            data = [th.text.rstrip() for th in row.find_all('td')]
            writer.writerow(data)


if __name__=="__main__":
    url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
    scrape_data(url)

In [37]:
url = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population#Sovereign_states_and_dependencies_by_population" 
response = requests.get(url, timeout=10)
response

<Response [200]>

In [38]:
soup = BeautifulSoup(response.content, 'html.parser')                    

In [39]:
table = soup.find_all('table')[1]

In [40]:
rows = table.select('tbody > tr')

In [41]:
header = [th.text.rstrip() for th in rows[0].find_all('th')]
header

['vteLists of countries by population statistics']

In [42]:
with open('output.csv', 'w') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        for row in rows[1:]:
            data = [th.text.rstrip() for th in row.find_all('td')]
            writer.writerow(data)

In [129]:
# Import packages
import requests
from bs4 import BeautifulSoup

# Specify url: url
url='http://pokemondb.net/pokedex/all_by_population'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Extracts the response as html: html_doc
html_doc = r.text

# Create a BeautifulSoup object from the HTML: soup
soup = BeautifulSoup(html_doc)

# Prettify the BeautifulSoup object: pretty_soup
pretty_soup = soup.prettify()

# Print the response
print(pretty_soup)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of countries and dependencies by population - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_countries_and_dependencies_by_population","wgTitle":"List of countries and dependencies by population","wgCurRevisionId":895410785,"wgRevisionId":895410785,"wgArticleId":69058,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Lists of countries by population"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitT



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [81]:
import requests
import lxml.html as lh
import pandas as pd

In [157]:
url='https://en.wikipedia.org/wiki/List_of_national_independence_days'
#Create a handle, page, to handle the contents of the website
page = requests.get(url)
#Store the contents of the website under doc
doc = lh.fromstring(page.content)
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [158]:
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]

[5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5]

In [159]:
tr_elements = doc.xpath('//tr')
#Create empty list
col=[]
i=0
#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print ('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Country
"
2:"Date of holiday
"
3:"Year celebrated
"
4:"Event celebrated
"
5:"Name of holiday
"


In [163]:
#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 6, the //tr data is not from our table 
    #if len(T)!=5:
        #break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
            col[i][1].append(data)
        #Increment i for the next column
        i+=1

In [164]:
[print(title,C) for (title,C) in col]

Country
 []
Date of holiday
 ['August\xa019\n', 'November\xa028\n', 'July\xa05\n', 'November\xa011\n', 'May\xa030\n', 'November\xa01\n', 'July\xa09\n', 'May\xa028\n', 'August\xa019\n', 'November\xa028\n', 'July\xa05\n', 'November\xa011\n', 'May\xa030\n', 'November\xa01\n', 'July\xa09\n', 'May\xa028\n', '1991\n', 'January 1\n', 'October\xa026\n', 'May\xa028\n', '1991\n', 'July\xa010\n', 'December\xa016\n', 'March\xa026\n', 'November\xa030\n', 'July\xa03\n', 'July\xa021\n', 'September\xa021\n', 'August\xa01\n', 'August\xa06\n', 'March\xa01\n', 'September\xa030\n', 'September\xa07\n', 'January\xa01\n', 'March\xa03\n', 'August\xa05\n', 'July\xa01\n', 'November\xa09\n', 'January\xa01\n', 'July\xa01\n', 'July\xa05\n', 'August\xa013\n', 'August\xa011\n', 'September\xa018\n', '1818', 'October 1\n', 'July\xa020 and August\xa07\n', 'July\xa06\n', 'June\xa030\n', 'August\xa015\n', 'September\xa015\n', 'August\xa07\n', 'October\xa08\n', 'January\xa01\n', 'October\xa01\n', 'October\xa028\n', '1993\

[None, None, None, None, None]

In [162]:
Dict={title:column for (title,column) in col}
Dict

{'Country\n': [],
 'Date of holiday\n': ['August\xa019\n',
  'November\xa028\n',
  'July\xa05\n',
  'November\xa011\n',
  'May\xa030\n',
  'November\xa01\n',
  'July\xa09\n',
  'May\xa028\n'],
 'Year celebrated\n': ['1919\n',
  '1912\n',
  '1962\n',
  '1975\n',
  '1967\n',
  '1981\n',
  '1816\n',
  '1918\n'],
 'Event celebrated\n': ['Independence from the United Kingdom in 1919.\n',
  'Declared by Ismail Qemal Vlora in 1912 and signaled the end of five centuries of Ottoman rule.\n',
  'Independence from France in 1962.\n',
  'Independence from Portugal in 1975.\n',
  'Independence from St. Christopher-Nevis-Anguilla in 1967.\n',
  'Independence from the United Kingdom in 1981.\n',
  'Independence declared from the Spanish Empire in 1816.\n',
  'Declaration of independence from Russian Empire in 1918.\n'],
 'Name of holiday\n': ['Afghan Independence Day\n',
  'Independence Day/Dita e Pavarësisë\n',
  'Independence Day (Algeria)\n',
  '\n',
  'Anguilla Day\n',
  'Independence Day [1]\n',

In [None]:
df=pd.DataFrame(Dict)

In [106]:
df.head()

Unnamed: 0,Rank,Country(or dependent territory),Population,Date,% of worldpopulation,Source


In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 6 columns):
Rank                               0 non-null float64
Country(or dependent territory)    0 non-null float64
Population                         0 non-null float64
Date                               0 non-null float64
% of worldpopulation               0 non-null float64
Source
                            0 non-null float64
dtypes: float64(6)
memory usage: 76.0 bytes


In [216]:
import urllib.request
import requests
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup


from requests import get
from requests.exceptions import RequestException
from contextlib import closing

In [219]:
#specify the url
# wiki = "https://en.wikipedia.org/wiki/List_of_state_and_union_territory_capitals_in_India"
wiki = "https://en.wikipedia.org/wiki/List_of_national_independence_days"

In [220]:
page = urllib.request.urlopen(wiki)

In [221]:
#Parse the html in the 'page' variable, and store it in Beautiful Soup format
soup = BeautifulSoup(page)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [222]:
# print(soup.prettify)

In [226]:
soup.title.string

'List of national independence days - Wikipedia'

In [224]:
all_links = soup.find_all("a")
for link in all_links:
    print(link.get("href"))

None
#mw-head
#p-search
/wiki/Independence_Day_(disambiguation)
/wiki/National_day
/wiki/File:National_Days_map.png
/wiki/File:National_Days_map.png
/wiki/National_Day
/wiki/Anniversary
/wiki/Nation
/wiki/Sovereign_state
/wiki/Military_occupation
/wiki/Singapore_in_Malaysia
/wiki/National_day
#List
#See_also
#References
#External_links
/w/index.php?title=List_of_national_independence_days&action=edit&section=1
/wiki/Afghanistan
/wiki/United_Kingdom
/wiki/Afghan_Independence_Day
/wiki/Albania
/wiki/Ismail_Qemali
/wiki/Ottoman_Empire
/wiki/Albanian_Independence_Day
/wiki/Algeria
/wiki/France
/wiki/Independence_Day_(Algeria)
/wiki/Angola
/wiki/Portugal
/wiki/Anguilla
/wiki/St._Christopher-Nevis-Anguilla
/wiki/Antigua_and_Barbuda
/wiki/United_Kingdom
#cite_note-1
/wiki/Argentina
/wiki/Spanish_Empire
/wiki/Argentine_Declaration_of_Independence
/wiki/Armenia
/wiki/Russian_Empire
/wiki/Soviet_Union
/wiki/Independence_Day_(Armenia)
/wiki/Australia
/wiki/United_Kingdom
/wiki/Australia_Day
/wiki

In [225]:
all_tables=soup.find_all('table')
all_tables

[<table class="wikitable sortable">
 <tbody><tr bgcolor="#cccccc">
 <th width="17%">Country
 </th>
 <th width="9%">Date of holiday
 </th>
 <th width="4%">Year celebrated
 </th>
 <th width="40%">Event celebrated
 </th>
 <th width="15%">Name of holiday
 </th></tr>
 <tr style="vertical-align: top;">
 <td><span class="flagicon"><img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/23px-Flag_of_Afghanistan.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/35px-Flag_of_Afghanistan.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/45px-Flag_of_Afghanistan.svg.png 2x" width="23"/> </span><a href="/wiki/Afghanistan" title="Afghanistan">Afghanistan</a>
 </td>
 <td><span data-sort-value="08-19 !">August 19</span>
 </td>
 <td>1919
 </td>
 <td>Independence from the <a href="/wiki/United

In [248]:
right_table=soup.find('table', {"class":'wikitable sortable'})
print(right_table)

<table class="wikitable sortable">
<tbody><tr bgcolor="#cccccc">
<th width="17%">Country
</th>
<th width="9%">Date of holiday
</th>
<th width="4%">Year celebrated
</th>
<th width="40%">Event celebrated
</th>
<th width="15%">Name of holiday
</th></tr>
<tr style="vertical-align: top;">
<td><span class="flagicon"><img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/23px-Flag_of_Afghanistan.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/35px-Flag_of_Afghanistan.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Flag_of_Afghanistan.svg/45px-Flag_of_Afghanistan.svg.png 2x" width="23"/> </span><a href="/wiki/Afghanistan" title="Afghanistan">Afghanistan</a>
</td>
<td><span data-sort-value="08-19 !">August 19</span>
</td>
<td>1919
</td>
<td>Independence from the <a href="/wiki/United_Kingdom" title="Uni

In [252]:
#Generate lists
A=[]
B=[]
C=[]
D=[]
E=[]
F=[]
G=[]
for row in right_table.findAll("tr"):
    cells = row.findAll('td')
    states=row.findAll('title') #To store second column data
     if len(cells)==5: #Only extract table body not heading
        A.append(cells[0].find(text=True))
        B.append(states[0].find(text=True))
        C.append(cells[1].find(text=True))
        D.append(cells[2].find(text=True))
        E.append(cells[3].find(text=True))
        F.append(cells[4].find(text=True))
        #G.append(cells[5].find(text=True))

IndentationError: unexpected indent (<ipython-input-252-8eb725123e68>, line 12)

In [251]:
#import pandas to convert list to data frame
import pandas as pd
df=pd.DataFrame(A,columns=['Number'])
df['Country']=B
df['Date of holiday']=C
df['Year celebrated']=D
df['Event celebrated']=E
df['Name of holiday']=F
#df['Former_Capital']=G
df

Unnamed: 0,Number,Country,Date of holiday,Year celebrated,Event celebrated,Name of holiday
