# Scrape deforestation data from W3 website

Import libraries

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

Web scrape using Requests & BeautifulSoup

In [2]:
url = 'http://wdi.worldbank.org/table/3.4'
html = requests.get(url).content
soup = BeautifulSoup(html, "html.parser")
soup


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>
	World Development Indicators | The World Bank
</title><meta content="IE=edge" http-equiv="x-ua-compatible"/>
<script async="async" src="//script.crazyegg.com/pages/scripts/0058/1350.js" type="text/javascript"></script>
<link href="/styles/css/bootstrap.css" rel="stylesheet"/>
<!-- Custom styles for this template -->
<link href="/styles/css/main.css" rel="stylesheet"/>
<link href="/styles/css/responsive.css" rel="stylesheet"/>
<link href="/styles/css/icon.css" rel="stylesheet"/>
<link href="/Styles/style.css" rel="stylesheet"/>
<link href="/Styles/style-Header-Footer-nonArabic.css" rel="stylesheet" type="text/css"/>
<script src="/scripts/jquery-1.7.min.js" type="text/javascript"></script>
<script src="/scripts/jquery-ui.min.js" type="text/javascript"></script>
<script src="/scripts/jquery.layout.min.js" typ

Find the table in the soup

In [3]:
table = soup.find_all('div',{'class':'scrollable'})[0]
table

<div class="scrollable"> <table border="0" cellpadding="0" cellspacing="0" class="indicators-table" id="scrollTable"> <tbody> <tr> <td class="country"><div class="spacer"><a class="metaLink" data-customlink="nl:body" data-text="Metadata:Afghanistan" href="javascript:void(0)" onclick="loadMetaData('AFG', 'C' ,'Country',  'Afghanistan')">Afghanistan</a></div></td> <td class=""><div class="spacer">14</div></td> <td class=""><div class="spacer">14</div></td> <td class=""><div class="spacer">11</div></td> <td class=""><div class="spacer">16</div></td> <td class=""><div class="spacer">4</div></td> <td class=""><div class="spacer">5</div></td> <td class=""><div class="spacer">0.1</div></td> <td><div class="spacer">..</div></td> </tr> <tr> <td class="country"><div class="spacer"><a class="metaLink" data-customlink="nl:body" data-text="Metadata:Albania" href="javascript:void(0)" onclick="loadMetaData('ALB', 'C' ,'Country',  'Albania')">Albania</a></div></td> <td class=""><div class="spacer">8</

Create a list from the table extracted

In [4]:
rows = table.find_all('tr')
rows = [row.text.strip().split("\n") for row in rows]
rows

[['Afghanistan 14 14 11 16 4 5 0.1 ..'],
 ['Albania 8 8 3 8 44 4 17.7 2.7'],
 ['Algeria 17 20 14 15 41 22 7.5 0.1'],
 ['American Samoa 0 0 1 8 12 1 15.8 8.7'],
 ['Andorra 0 0 2 3 0 0 26.7 ..'],
 ['Angola 610 577 18 32 53 34 7.0 0.0'],
 ['Antigua and Barbuda 0 0 2 3 31 4 18.6 0.2'],
 ['Argentina 348 268 38 52 42 70 8.8 3.8'],
 ['Armenia 3 3 9 14 3 74 23.1 ..'],
 ['Aruba 0 0 2 2 24 2 18.9 0.0'],
 ['Australia 1,285 1,251 63 52 125 108 19.3 40.6'],
 ['Austria 38 39 3 13 11 17 28.4 ..'],
 ['Azerbaijan 9 12 8 17 14 44 10.2 0.4'],
 ['Bahamas, The 5 5 5 10 43 7 36.6 7.9'],
 ['Bahrain 0 0 3 7 14 0 6.6 1.2'],
 ['Bangladesh 15 14 37 36 29 23 4.6 5.4'],
 ['Barbados 0 0 3 4 29 3 1.3 0.0'],
 ['Belarus 78 87 4 9 2 1 9.4 ..'],
 ['Belgium 7 7 2 8 13 0 23.3 36.7'],
 ['Belize 16 14 10 6 45 46 37.7 10.1'],
 ['Benin 58 43 13 12 40 20 29.6 0.0'],
 ['Bermuda 0 0 4 3 27 8 2.1 0.0'],
 ['Bhutan 25 28 25 21 3 43 48.0 ..'],
 ['Bolivia 628 545 21 55 8 106 30.9 ..'],
 ['Bosnia and Herzegovina 22 22 4 7 36 3 1.4 0.0

Create dataframe and apply regex to split numeric and non numeric values

In [5]:
df = pd.DataFrame(rows)

df["countries"]=df[0].str.extract('(\D+)')

df["numeric"]=df[0].str.extract('(\d.*\d)')

df

Unnamed: 0,0,countries,numeric
0,Afghanistan 14 14 11 16 4 5 0.1 ..,Afghanistan,14 14 11 16 4 5 0.1
1,Albania 8 8 3 8 44 4 17.7 2.7,Albania,8 8 3 8 44 4 17.7 2.7
2,Algeria 17 20 14 15 41 22 7.5 0.1,Algeria,17 20 14 15 41 22 7.5 0.1
3,American Samoa 0 0 1 8 12 1 15.8 8.7,American Samoa,0 0 1 8 12 1 15.8 8.7
4,Andorra 0 0 2 3 0 0 26.7 ..,Andorra,0 0 2 3 0 0 26.7
...,...,...,...
221,"Sub-Saharan Africa 6,516 6,115 967 993 2,064 4...",Sub-Saharan Africa,"6,516 6,115 967 993 2,064 4,862 17.7"
222,"Low income 3,110 3,168 582 590 978 2,373 13.9 ..",Low income,"3,110 3,168 582 590 978 2,373 13.9"
223,"Lower middle income 6,058 5,474 1,040 1,230 2,...",Lower middle income,"6,058 5,474 1,040 1,230 2,144 4,315 14.3 1.3"
224,"Upper middle income 21,533 20,989 1,221 1,730 ...",Upper middle income,"21,533 20,989 1,221 1,730 2,431 7,431 14.8 7.9"


Split the numeric column into more columns & concat the result of the split and previous dataframe into one dataframe

In [6]:
numeric_df=df['numeric'].str.split(expand=True)

complete_df=pd.concat([df,numeric_df],axis=1)

complete_df.head()

Unnamed: 0,0,countries,numeric,0.1,1,2,3,4,5,6,7
0,Afghanistan 14 14 11 16 4 5 0.1 ..,Afghanistan,14 14 11 16 4 5 0.1,14,14,11,16,4,5,0.1,
1,Albania 8 8 3 8 44 4 17.7 2.7,Albania,8 8 3 8 44 4 17.7 2.7,8,8,3,8,44,4,17.7,2.7
2,Algeria 17 20 14 15 41 22 7.5 0.1,Algeria,17 20 14 15 41 22 7.5 0.1,17,20,14,15,41,22,7.5,0.1
3,American Samoa 0 0 1 8 12 1 15.8 8.7,American Samoa,0 0 1 8 12 1 15.8 8.7,0,0,1,8,12,1,15.8,8.7
4,Andorra 0 0 2 3 0 0 26.7 ..,Andorra,0 0 2 3 0 0 26.7,0,0,2,3,0,0,26.7,


Create a copy of the dataframe

In [7]:
forest=complete_df.copy()
forest

Unnamed: 0,0,countries,numeric,0.1,1,2,3,4,5,6,7
0,Afghanistan 14 14 11 16 4 5 0.1 ..,Afghanistan,14 14 11 16 4 5 0.1,14,14,11,16,4,5,0.1,
1,Albania 8 8 3 8 44 4 17.7 2.7,Albania,8 8 3 8 44 4 17.7 2.7,8,8,3,8,44,4,17.7,2.7
2,Algeria 17 20 14 15 41 22 7.5 0.1,Algeria,17 20 14 15 41 22 7.5 0.1,17,20,14,15,41,22,7.5,0.1
3,American Samoa 0 0 1 8 12 1 15.8 8.7,American Samoa,0 0 1 8 12 1 15.8 8.7,0,0,1,8,12,1,15.8,8.7
4,Andorra 0 0 2 3 0 0 26.7 ..,Andorra,0 0 2 3 0 0 26.7,0,0,2,3,0,0,26.7,
...,...,...,...,...,...,...,...,...,...,...,...
221,"Sub-Saharan Africa 6,516 6,115 967 993 2,064 4...",Sub-Saharan Africa,"6,516 6,115 967 993 2,064 4,862 17.7",6516,6115,967,993,2064,4862,17.7,
222,"Low income 3,110 3,168 582 590 978 2,373 13.9 ..",Low income,"3,110 3,168 582 590 978 2,373 13.9",3110,3168,582,590,978,2373,13.9,
223,"Lower middle income 6,058 5,474 1,040 1,230 2,...",Lower middle income,"6,058 5,474 1,040 1,230 2,144 4,315 14.3 1.3",6058,5474,1040,1230,2144,4315,14.3,1.3
224,"Upper middle income 21,533 20,989 1,221 1,730 ...",Upper middle income,"21,533 20,989 1,221 1,730 2,431 7,431 14.8 7.9",21533,20989,1221,1730,2431,7431,14.8,7.9


Check data types, rows and columns

In [8]:
forest.dtypes

0            object
countries    object
numeric      object
0            object
1            object
2            object
3            object
4            object
5            object
6            object
7            object
dtype: object

In [9]:
forest.index

RangeIndex(start=0, stop=226, step=1)

In [10]:
forest.size

2486

In [11]:
forest.columns

Index([0, 'countries', 'numeric', 0, 1, 2, 3, 4, 5, 6, 7], dtype='object')

We want the information by country, check 'countries' column for values that contain 'World' as we will want to drop these

In [12]:
index_world = forest[(forest['countries'].str.contains('World'))].index.tolist()

index_world

[214]

Drop non-country information

In [13]:
forest=forest.iloc[:214]

forest.head()

Unnamed: 0,0,countries,numeric,0.1,1,2,3,4,5,6,7
0,Afghanistan 14 14 11 16 4 5 0.1 ..,Afghanistan,14 14 11 16 4 5 0.1,14,14,11,16,4,5,0.1,
1,Albania 8 8 3 8 44 4 17.7 2.7,Albania,8 8 3 8 44 4 17.7 2.7,8,8,3,8,44,4,17.7,2.7
2,Algeria 17 20 14 15 41 22 7.5 0.1,Algeria,17 20 14 15 41 22 7.5 0.1,17,20,14,15,41,22,7.5,0.1
3,American Samoa 0 0 1 8 12 1 15.8 8.7,American Samoa,0 0 1 8 12 1 15.8 8.7,0,0,1,8,12,1,15.8,8.7
4,Andorra 0 0 2 3 0 0 26.7 ..,Andorra,0 0 2 3 0 0 26.7,0,0,2,3,0,0,26.7,


Rename columns

In [14]:
forest.columns=[0,1,2,3,4,5,6,7,8,9,10]

Drop merged columns

In [15]:
forest=forest.drop(columns=[0,2])
forest

Unnamed: 0,1,3,4,5,6,7,8,9,10
0,Afghanistan,14,14,11,16,4,5,0.1,
1,Albania,8,8,3,8,44,4,17.7,2.7
2,Algeria,17,20,14,15,41,22,7.5,0.1
3,American Samoa,0,0,1,8,12,1,15.8,8.7
4,Andorra,0,0,2,3,0,0,26.7,
...,...,...,...,...,...,...,...,...,...
209,Virgin Islands (U.S.),0,0,1,3,29,17,13.8,0.9
210,West Bank and Gaza,0,0,4,15,2,6,8.4,
211,"Yemen, Rep.",5,5,11,16,40,163,0.8,0.5
212,Zambia,528,485,13,20,20,23,37.9,


Check for null values

In [16]:
none_columns=forest.isnull().sum()
none_columns[none_columns > 0]

3      1
4      1
5      2
6      2
7      3
8      4
9      9
10    50
dtype: int64

Replace nulls with 'NA'

In [17]:
forest=forest.fillna('NA')

Check for and drop duplicates

In [18]:
before=len(forest)

forest.drop_duplicates(inplace=True)

after = len(forest)
print('Number of duplicate records dropped: ', str(before - after))

Number of duplicate records dropped:  0


Renaming columns according to original data

In [19]:
forest=forest.rename(columns={1:"Country_name",3:"1990_sq_km_thousands_forest_area",4:"2016_sq_km_thousands_forest_area", 5:"2018_threatened_mammals", 6:"2018_threatened_birds", 7:"2018_threatened_fishes", 8: "2018_higher_plants", 9: "%total_protected_land_area_2017", 10: "%total_protected_territorial_waters"})

forest.head()

Unnamed: 0,Country_name,1990_sq_km_thousands_forest_area,2016_sq_km_thousands_forest_area,2018_threatened_mammals,2018_threatened_birds,2018_threatened_fishes,2018_higher_plants,%total_protected_land_area_2017,%total_protected_territorial_waters
0,Afghanistan,14,14,11,16,4,5,0.1,
1,Albania,8,8,3,8,44,4,17.7,2.7
2,Algeria,17,20,14,15,41,22,7.5,0.1
3,American Samoa,0,0,1,8,12,1,15.8,8.7
4,Andorra,0,0,2,3,0,0,26.7,


Change the alpha column to title style

In [20]:
forest['Country_name']=(forest['Country_name'].str.title())
forest['Country_name']=(forest['Country_name'].str.rstrip())

In [21]:
forest.head()

Unnamed: 0,Country_name,1990_sq_km_thousands_forest_area,2016_sq_km_thousands_forest_area,2018_threatened_mammals,2018_threatened_birds,2018_threatened_fishes,2018_higher_plants,%total_protected_land_area_2017,%total_protected_territorial_waters
0,Afghanistan,14,14,11,16,4,5,0.1,
1,Albania,8,8,3,8,44,4,17.7,2.7
2,Algeria,17,20,14,15,41,22,7.5,0.1
3,American Samoa,0,0,1,8,12,1,15.8,8.7
4,Andorra,0,0,2,3,0,0,26.7,


Export to CSV file


In [22]:
forest.to_csv('..\csv_files\forest.csv', index=False)