# Web scraping data from Animal Welfare Institute 


In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

### Scrape mammals table using Requests & BeautifulSoup

In [2]:
url = "https://awionline.org/content/list-endangered-species#mammals"

In [3]:
response = requests.get(url).content

In [4]:
response[:500]

b'<!DOCTYPE html>\n<html lang="en" dir="ltr" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">\n  <head>\n    <meta charset="utf-8" />\n<script>(function(i,s,o,g,r,a,m){i["GoogleAnal'

In [5]:
soup = BeautifulSoup(response)

In [6]:
table_mammals = soup.find_all('table', class_='esatable')[0]

In [7]:
table_mammals


<table cellpadding="0" cellspacing="0" class="esatable"><colgroup><col width="43%"/><col width="42%"/><col width="15%"/></colgroup><tbody><tr><th width="43%">Common Name</th>
<th width="42%">Scientific Name</th>
<th width="15%">Status</th>
</tr><tr><td width="43%">Addax</td>
<td width="42%"><em>Addax nasomaculatus</em></td>
<td width="15%">Endangered</td>
</tr><tr><td width="43%">Anoa, lowland</td>
<td width="42%"><em>Bubalus depressicornis</em></td>
<td width="15%">Endangered</td>
</tr><tr><td width="43%">Anoa, mountain</td>
<td width="42%"><em>Bubalus quarlesi</em></td>
<td width="15%">Endangered</td>
</tr><tr><td width="43%">Antelope, giant sable</td>
<td width="42%"><em>Hippotragus niger variani</em></td>
<td width="15%">Endangered</td>
</tr><tr><td width="43%">Antelope, Tibetan</td>
<td width="42%"><em>Panthalops hodgsonii</em></td>
<td width="15%">Endangered</td>
</tr><tr><td width="43%">Argali [All populations except Kyrgyzstan, Mongolia, and Tajikistan]</td>
<td width="42%"><em

In [8]:
rows = table_mammals.find_all('tr' )

In [9]:
rows = [row.text.strip().split('\n') for row in rows]

In [10]:
rows

[['Common Name', 'Scientific Name', 'Status'],
 ['Addax', 'Addax nasomaculatus', 'Endangered'],
 ['Anoa, lowland', 'Bubalus depressicornis', 'Endangered'],
 ['Anoa, mountain', 'Bubalus quarlesi', 'Endangered'],
 ['Antelope, giant sable', 'Hippotragus niger variani', 'Endangered'],
 ['Antelope, Tibetan', 'Panthalops hodgsonii', 'Endangered'],
 ['Argali [All populations except Kyrgyzstan, Mongolia, and Tajikistan]',
  'Ovis ammon',
  'Endangered'],
 ['Argali [Kyrgyzstan, Mongolia, and Tajikistan]', 'Ovis ammon', 'Threatened'],
 ['Armadillo, giant', 'Priodontes maximus', 'Endangered'],
 ['Armadillo, pink fairy', 'Chlamyphorus truncatus', 'Endangered'],
 ['Ass, African wild', 'Equus africanus', 'Endangered'],
 ['Ass, Asian wild', 'Equus hemionus', 'Endangered'],
 ['Avahi', 'Avahi laniger(=entire genus)', 'Endangered'],
 ['Aye-aye', 'Daubentonia madagascariensis', 'Endangered'],
 ['Babirusa', 'Babyrousa babyrussa', 'Endangered'],
 ['Baboon, gelada', 'Theropithecus gelada', 'Threatened'],
 [

In [11]:
column_names = rows[0]
data = rows[1:]

In [12]:
dataset_mammals = pd.DataFrame(data, columns = column_names) 

In [13]:
dataset_mammals

Unnamed: 0,Common Name,Scientific Name,Status
0,Addax,Addax nasomaculatus,Endangered
1,"Anoa, lowland",Bubalus depressicornis,Endangered
2,"Anoa, mountain",Bubalus quarlesi,Endangered
3,"Antelope, giant sable",Hippotragus niger variani,Endangered
4,"Antelope, Tibetan",Panthalops hodgsonii,Endangered
...,...,...,...
377,"Woodrat, riparian (San Joaquin Valley)",Neotoma fuscipes riparia,Endangered
378,"Yak, wild",Bos mutus (=grunniens m.),Endangered
379,"Zebra, Grevy's",Equus grevyi,Threatened
380,"Zebra, Hartmann's mountain",Equus zebra hartmannae,Threatened


In [14]:
colnames = [ name.replace(' ', '') for name in dataset_mammals.columns]

In [15]:
colnames

['CommonName', 'ScientificName', 'Status']

In [16]:
dataset_mammals = pd.DataFrame(data, columns = colnames )

In [17]:
dataset_mammals.head()

Unnamed: 0,CommonName,ScientificName,Status
0,Addax,Addax nasomaculatus,Endangered
1,"Anoa, lowland",Bubalus depressicornis,Endangered
2,"Anoa, mountain",Bubalus quarlesi,Endangered
3,"Antelope, giant sable",Hippotragus niger variani,Endangered
4,"Antelope, Tibetan",Panthalops hodgsonii,Endangered


In [18]:
dataset_mammals[['Family', 'Species']]= dataset_mammals.ScientificName.str.split(n=1, expand = True)

In [19]:
dataset_mammals.head(10)

Unnamed: 0,CommonName,ScientificName,Status,Family,Species
0,Addax,Addax nasomaculatus,Endangered,Addax,nasomaculatus
1,"Anoa, lowland",Bubalus depressicornis,Endangered,Bubalus,depressicornis
2,"Anoa, mountain",Bubalus quarlesi,Endangered,Bubalus,quarlesi
3,"Antelope, giant sable",Hippotragus niger variani,Endangered,Hippotragus,niger variani
4,"Antelope, Tibetan",Panthalops hodgsonii,Endangered,Panthalops,hodgsonii
5,"Argali [All populations except Kyrgyzstan, Mon...",Ovis ammon,Endangered,Ovis,ammon
6,"Argali [Kyrgyzstan, Mongolia, and Tajikistan]",Ovis ammon,Threatened,Ovis,ammon
7,"Armadillo, giant",Priodontes maximus,Endangered,Priodontes,maximus
8,"Armadillo, pink fairy",Chlamyphorus truncatus,Endangered,Chlamyphorus,truncatus
9,"Ass, African wild",Equus africanus,Endangered,Equus,africanus


In [20]:
dataset_mammals[['CommonName_1' , 'CommonName_2']]= (dataset_mammals.
                                                     CommonName.str
                                                     .split(',',n=1, expand = True))

In [21]:
dataset_mammals[['CommonName_new']] = dataset_mammals['CommonName_2'] +' '+ dataset_mammals['CommonName_1']

In [22]:
dataset_mammals['CommonName_new'] = np.where(dataset_mammals['CommonName_new'].isna(),
                                             dataset_mammals['CommonName'],
                                            dataset_mammals['CommonName_new'].str.title())

In [23]:
dataset_mammals = dataset_mammals.drop(['CommonName_1' , 'CommonName_2', 'CommonName'], axis = 1)

In [24]:
dataset_mammals.head(50)

Unnamed: 0,ScientificName,Status,Family,Species,CommonName_new
0,Addax nasomaculatus,Endangered,Addax,nasomaculatus,Addax
1,Bubalus depressicornis,Endangered,Bubalus,depressicornis,Lowland Anoa
2,Bubalus quarlesi,Endangered,Bubalus,quarlesi,Mountain Anoa
3,Hippotragus niger variani,Endangered,Hippotragus,niger variani,Giant Sable Antelope
4,Panthalops hodgsonii,Endangered,Panthalops,hodgsonii,Tibetan Antelope
5,Ovis ammon,Endangered,Ovis,ammon,"Mongolia, And Tajikistan] Argali [All Populat..."
6,Ovis ammon,Threatened,Ovis,ammon,"Mongolia, And Tajikistan] Argali [Kyrgyzstan"
7,Priodontes maximus,Endangered,Priodontes,maximus,Giant Armadillo
8,Chlamyphorus truncatus,Endangered,Chlamyphorus,truncatus,Pink Fairy Armadillo
9,Equus africanus,Endangered,Equus,africanus,African Wild Ass


Export dataframe to CSV

In [25]:
dataset_mammals = dataset_mammals.to_csv('..\csv_files\table_awionline_mammals.csv', index = False)

Table exported as csv without the index

### Finding table names

Tried some additional scraping which was not required for the project

In [26]:
url2 = 'https://awionline.org/content/list-endangered-species'

In [27]:
response2 = requests.get(url2).content

In [28]:
soup2 = BeautifulSoup(response2)

In [29]:
# finding the names of different tables on website 
# names is list of all button items in p-tag 

names = (soup2.body.find('main', role= 'main')
       .find('div', class_='node__content')
       .find_all('a', class_='esabutton'))
print(names) 

[<a class="esabutton" href="#mammals">Mammals</a>, <a class="esabutton" href="#birds">Birds</a>, <a class="esabutton" href="#reptiles">Reptiles</a>, <a class="esabutton" href="#amphibians">Amphibians</a>, <a class="esabutton" href="#fish">Fish</a>, <a class="esabutton" href="#other">Other</a>, <a class="esabutton" href="#clams">Clams</a>, <a class="esabutton" href="#snails">Snails</a>, <a class="esabutton" href="#insects">Insects</a>, <a class="esabutton" href="#arachnids">Arachnids</a>, <a class="esabutton" href="#crustaceans">Crustaceans</a>, <a class="esabutton" href="#corals">Corals</a>]


In [30]:
# taking a look at first item in list and extracting the name 
name = names[0]
name.text

'Mammals'

In [31]:
table_names = [name.text for name in names ]
table_names

['Mammals',
 'Birds',
 'Reptiles',
 'Amphibians',
 'Fish',
 'Other',
 'Clams',
 'Snails',
 'Insects',
 'Arachnids',
 'Crustaceans',
 'Corals']

In [32]:
tabellen = soup2.find_all('table', class_='esatable')
tabellen

[<table cellpadding="0" cellspacing="0" class="esatable"><colgroup><col width="43%"/><col width="42%"/><col width="15%"/></colgroup><tbody><tr><th width="43%">Common Name</th>
 <th width="42%">Scientific Name</th>
 <th width="15%">Status</th>
 </tr><tr><td width="43%">Addax</td>
 <td width="42%"><em>Addax nasomaculatus</em></td>
 <td width="15%">Endangered</td>
 </tr><tr><td width="43%">Anoa, lowland</td>
 <td width="42%"><em>Bubalus depressicornis</em></td>
 <td width="15%">Endangered</td>
 </tr><tr><td width="43%">Anoa, mountain</td>
 <td width="42%"><em>Bubalus quarlesi</em></td>
 <td width="15%">Endangered</td>
 </tr><tr><td width="43%">Antelope, giant sable</td>
 <td width="42%"><em>Hippotragus niger variani</em></td>
 <td width="15%">Endangered</td>
 </tr><tr><td width="43%">Antelope, Tibetan</td>
 <td width="42%"><em>Panthalops hodgsonii</em></td>
 <td width="15%">Endangered</td>
 </tr><tr><td width="43%">Argali [All populations except Kyrgyzstan, Mongolia, and Tajikistan]</td>


In [33]:
for table in tabellen: 
    rows2 = table.find_all('tr')

In [34]:
rows2 = [row.text.strip().split('\n') for row in rows2]

In [35]:
column_names2 = rows2[0]
data2 = rows2[1:]

In [36]:
dataset_all = pd.DataFrame(data2, columns = column_names2)