# Wikipedia Scraping API

## Import Libraries

In [13]:
import requests
import numpy as np
import pandas as pd
import json
import webbrowser
from bs4 import BeautifulSoup
from utils import *

## Web Page URL

In [14]:
url = "https://en.wikipedia.org/wiki/World_population"


## Request

In [15]:

page = requests.get(url)
req_response(page)

Request Response : 200 OK (Everything works as expected)


## Get Data

In [16]:
# parse document (create BeautifulSoup object)
data = BeautifulSoup(page.text, "html.parser")
print("html document parsed !!! ")

html document parsed !!! 


## Format Data

In [17]:
tables = data.find_all('table')
print(f"This page have : {len(tables)} Tables")

This page have : 26 Tables


In [18]:
for index,table in enumerate(tables):
    if ("10 most densely populated countries" in str(table)):
        table_index = index
print(f"{table_index} Tables with mention '10 most densely populated countries'")

5 Tables with mention '10 most densely populated countries'


In [19]:
print(tables[table_index].prettify())

<table class="wikitable sortable" style="text-align:right">
 <caption>
  10 most densely populated countries
  <small>
   (with population above 5 million)
  </small>
 </caption>
 <tbody>
  <tr>
   <th>
    Rank
   </th>
   <th>
    Country
   </th>
   <th>
    Population
   </th>
   <th>
    Area
    <br/>
    <small>
     (km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
   <th>
    Density
    <br/>
    <small>
     (pop/km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
  </tr>
  <tr>
   <td>
    1
   </td>
   <td align="left">
    <span class="flagicon">
     <img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singapore.svg/23px-Flag_of_Singapore.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singapore.svg/35px-Flag_of_Singapore.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singapo

In [20]:
population_data = pd.DataFrame(columns=["Rank", "Country", "Population", "Area", "Density"])

for row in tables[table_index].tbody.find_all("tr"):
    col = row.find_all("td")
    if (col != []):
        rank = col[0].text
        country = col[1].text
        population = col[2].text.strip()
        area = col[3].text.strip()
        density = col[4].text.strip()
        population_data = pd.concat([population_data, pd.DataFrame({"Rank":rank, "Country":country, "Population":population, "Area":area, "Density":density}, index=[0])])
        
population_data

Unnamed: 0,Rank,Country,Population,Area,Density
0,1,Singapore,5704000,710,8033
0,2,Bangladesh,172590000,143998,1199
0,3,\n Palestine\n\n,5266785,6020,847
0,4,Lebanon,6856000,10452,656
0,5,Taiwan,23604000,36193,652
0,6,South Korea,51781000,99538,520
0,7,Rwanda,12374000,26338,470
0,8,Haiti,11578000,27065,428
0,9,Netherlands,17710000,41526,427
0,10,Israel,9510000,22072,431


In [21]:
pd.read_html(str(tables[5]), flavor='bs4')

[   Rank      Country  Population  Area(km2)  Density(pop/km2)
 0     1    Singapore     5704000        710              8033
 1     2   Bangladesh   172590000     143998              1199
 2     3    Palestine     5266785       6020               847
 3     4      Lebanon     6856000      10452               656
 4     5       Taiwan    23604000      36193               652
 5     6  South Korea    51781000      99538               520
 6     7       Rwanda    12374000      26338               470
 7     8        Haiti    11578000      27065               428
 8     9  Netherlands    17710000      41526               427
 9    10       Israel     9510000      22072               431]

In [22]:
population_data_read_html = pd.read_html(str(tables[5]), flavor='bs4')[0]
population_data_read_html

Unnamed: 0,Rank,Country,Population,Area(km2),Density(pop/km2)
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,172590000,143998,1199
2,3,Palestine,5266785,6020,847
3,4,Lebanon,6856000,10452,656
4,5,Taiwan,23604000,36193,652
5,6,South Korea,51781000,99538,520
6,7,Rwanda,12374000,26338,470
7,8,Haiti,11578000,27065,428
8,9,Netherlands,17710000,41526,427
9,10,Israel,9510000,22072,431


In [23]:
dataframe_list = pd.read_html(url, flavor='bs4')
len(dataframe_list)

26

In [24]:
pd.read_html(url, match="10 most densely populated countries", flavor='bs4')[0]
# Maptitude and MapInfo
# https://courses.cognitiveclass.ai/courses/course-v1:BigDataUniversity+DS0101EN+2016/courseware/bd64ccdf56ad4ea1afe870e26d583038/155e435d066a4a22a2c7a29

Unnamed: 0,Rank,Country,Population,Area(km2),Density(pop/km2)
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,172590000,143998,1199
2,3,Palestine,5266785,6020,847
3,4,Lebanon,6856000,10452,656
4,5,Taiwan,23604000,36193,652
5,6,South Korea,51781000,99538,520
6,7,Rwanda,12374000,26338,470
7,8,Haiti,11578000,27065,428
8,9,Netherlands,17710000,41526,427
9,10,Israel,9510000,22072,431


In [25]:
df = pd.read_html(url, match="10 most densely populated countries", flavor='bs4')[0]
df.to_csv('df_scraped.csv')

___
My name is **Kiese Diangebeni Reagan** I'm **Data Science Analyst**, technology passionate person, Artificial Intelligence enthusiast and lifelong learner. 

   
|**Let get in touch :**| <a href="https://kiese.tech">www.kiese.tech</a>|<a href="https://github.com/Rekidiang2">GitHub</a>| <a href="https://www.linkedin.com/in/kiese-diangebeni-reagan-82992216a/">Linkedin</a>|<a href="https://twitter.com/ReaganKiese">Twitter</a>|<a href="http://www.facebook.com/reagan.kiese.37">FaceBook</a>|<a href="https://medium.com/@rkddatas">Medium</a>|
|--------------|-----------|-----------|-----------|-----------|-----------|-----------|

---