## To perform basic web-scraping tasks using the Beautiful Soup package in Python.

In [4]:
#Step1: Choose the URL for web scraping
#Step2: Identify the data to be scraped
#Step3: Inspect the website
#Step4: Select a web-scraping tool that can parse the HTML
#Step5: Download the files needed to run the libraries

In [3]:
from bs4 import BeautifulSoup as bs
import requests

In [6]:
#Step6: Send HTTP request to url
url = 'https://www.scrapethissite.com/pages/simple/'
# requests applies no timeout unless one is passed, so a server that never
# answers would leave this cell hanging; give up after 10 seconds instead.
response = requests.get(url, timeout=10)
# Bare last expression so the notebook displays the response object.
response

<Response [200]>

In [10]:
#Step7: create soup object and look at site title
# Name the parser explicitly. Without it BeautifulSoup picks whichever parser
# happens to be installed and emits a GuessedAtParserWarning, so the same
# notebook could parse the page differently on another machine.
soup = bs(response.text, "html.parser")

print(type(soup))

# Text of the page's <title> element.
soup.find('title').text

<class 'bs4.BeautifulSoup'>


'Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping'

In [12]:
#Step8: Extract data in tags in HTML

# Collect the <div class="col-md-4 country"> elements of the page.
# (The earlier unfiltered find_all('div') call was dropped: its result was
# never stored or displayed.)
country_all = soup.find_all('div', class_="col-md-4 country")
# Preview only the first two matches to keep the output short.
country_all[:2]

[<div class="col-md-4 country">
 <h3 class="country-name">
 <i class="flag-icon flag-icon-ad"></i>
                             Andorra
                         </h3>
 <div class="country-info">
 <strong>Capital:</strong> <span class="country-capital">Andorra la Vella</span><br/>
 <strong>Population:</strong> <span class="country-population">84000</span><br/>
 <strong>Area (km<sup>2</sup>):</strong> <span class="country-area">468.0</span><br/>
 </div>
 </div>,
 <div class="col-md-4 country">
 <h3 class="country-name">
 <i class="flag-icon flag-icon-ae"></i>
                             United Arab Emirates
                         </h3>
 <div class="country-info">
 <strong>Capital:</strong> <span class="country-capital">Abu Dhabi</span><br/>
 <strong>Population:</strong> <span class="country-population">4975593</span><br/>
 <strong>Area (km<sup>2</sup>):</strong> <span class="country-area">82880.0</span><br/>
 </div>
 </div>]

In [13]:
# Display every matched country <div>; this output is long.
country_all

[<div class="col-md-4 country">
 <h3 class="country-name">
 <i class="flag-icon flag-icon-ad"></i>
                             Andorra
                         </h3>
 <div class="country-info">
 <strong>Capital:</strong> <span class="country-capital">Andorra la Vella</span><br/>
 <strong>Population:</strong> <span class="country-population">84000</span><br/>
 <strong>Area (km<sup>2</sup>):</strong> <span class="country-area">468.0</span><br/>
 </div>
 </div>,
 <div class="col-md-4 country">
 <h3 class="country-name">
 <i class="flag-icon flag-icon-ae"></i>
                             United Arab Emirates
                         </h3>
 <div class="country-info">
 <strong>Capital:</strong> <span class="country-capital">Abu Dhabi</span><br/>
 <strong>Population:</strong> <span class="country-population">4975593</span><br/>
 <strong>Area (km<sup>2</sup>):</strong> <span class="country-area">82880.0</span><br/>
 </div>
 </div>,
 <div class="col-md-4 country">
 <h3 class="country-name">
 

In [38]:
#Step9: make function and extract all info
def get_details(country_one):
    """Extract name, capital, population and area from one country element.

    Parameters
    ----------
    country_one : object
        An element exposing ``find(name, class_=...)`` whose matches expose
        ``.text`` (for example a BeautifulSoup tag for one country block).

    Returns
    -------
    tuple
        ``(country_name, capital_name, population, area)``. Each field is the
        matched element's text with surrounding whitespace removed, or
        ``None`` when no matching element exists. Population and area are
        returned as text; no numeric conversion is done here.
    """
    def text_of(tag_name, css_class):
        node = country_one.find(tag_name, class_=css_class)
        # find() yields None when nothing matches; report the field as missing
        # instead of failing with an AttributeError on ``None.text``.
        if node is None:
            return None
        # Strip every field the same way (previously only the name was stripped).
        return node.text.strip()

    country_name = text_of('h3', "country-name")
    capital_name = text_of('span', "country-capital")
    population = text_of('span', "country-population")
    area = text_of('span', "country-area")
    return country_name, capital_name, population, area

In [39]:
#Step10: Writing the scraping code
def get_soup(url, timeout=10):
    """Fetch ``url`` and parse the response body into a soup object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so the call cannot wait forever on an
        unresponsive server. Defaults to 10 seconds.

    Returns
    -------
    BeautifulSoup or None
        The parsed document when the response status code is 200,
        otherwise ``None``.

    Notes
    -----
    Exceptions raised by ``requests.get`` (connection failure, timeout, ...)
    are not caught here and propagate to the caller.
    """
    resp = requests.get(url, timeout=timeout)
    if resp.status_code != 200:
        return None
    # Explicit parser: avoids the "guessed parser" warning and keeps the
    # result independent of which optional parsers are installed.
    return bs(resp.text, "html.parser")

def get_countries(url):
    """Scrape every country record from the page at ``url``.

    Parameters
    ----------
    url : str
        Address of the page holding the country blocks.

    Returns
    -------
    list
        One ``get_details`` result per ``<div class="col-md-4 country">``
        element, in page order. Empty when the page could not be fetched
        (``get_soup`` returned ``None``) or contains no such elements.
    """
    soup = get_soup(url)
    if soup is None:
        # get_soup signals a non-200 response with None; there is nothing to
        # parse, so return no records rather than failing on None.find_all.
        return []
    country_all = soup.find_all('div', class_="col-md-4 country")
    return [get_details(country) for country in country_all]

In [43]:
# Run the scraper on the countries page and keep the records in `output`.
url = 'https://www.scrapethissite.com/pages/simple/'
output = get_countries(url)
# Display the complete list of scraped records.
output

[('Andorra', 'Andorra la Vella', '84000', '468.0'),
 ('United Arab Emirates', 'Abu Dhabi', '4975593', '82880.0'),
 ('Afghanistan', 'Kabul', '29121286', '647500.0'),
 ('Antigua and Barbuda', "St. John's", '86754', '443.0'),
 ('Anguilla', 'The Valley', '13254', '102.0'),
 ('Albania', 'Tirana', '2986952', '28748.0'),
 ('Armenia', 'Yerevan', '2968000', '29800.0'),
 ('Angola', 'Luanda', '13068161', '1246700.0'),
 ('Antarctica', 'None', '0', '1.4E7'),
 ('Argentina', 'Buenos Aires', '41343201', '2766890.0'),
 ('American Samoa', 'Pago Pago', '57881', '199.0'),
 ('Austria', 'Vienna', '8205000', '83858.0'),
 ('Australia', 'Canberra', '21515754', '7686850.0'),
 ('Aruba', 'Oranjestad', '71566', '193.0'),
 ('Åland', 'Mariehamn', '26711', '1580.0'),
 ('Azerbaijan', 'Baku', '8303512', '86600.0'),
 ('Bosnia and Herzegovina', 'Sarajevo', '4590000', '51129.0'),
 ('Barbados', 'Bridgetown', '285653', '431.0'),
 ('Bangladesh', 'Dhaka', '156118464', '144000.0'),
 ('Belgium', 'Brussels', '10403000', '30510.0

In [46]:
# Preview only the first ten scraped records.
output[:10]


[('Andorra', 'Andorra la Vella', '84000', '468.0'),
 ('United Arab Emirates', 'Abu Dhabi', '4975593', '82880.0'),
 ('Afghanistan', 'Kabul', '29121286', '647500.0'),
 ('Antigua and Barbuda', "St. John's", '86754', '443.0'),
 ('Anguilla', 'The Valley', '13254', '102.0'),
 ('Albania', 'Tirana', '2986952', '28748.0'),
 ('Armenia', 'Yerevan', '2968000', '29800.0'),
 ('Angola', 'Luanda', '13068161', '1246700.0'),
 ('Antarctica', 'None', '0', '1.4E7'),
 ('Argentina', 'Buenos Aires', '41343201', '2766890.0')]

In [52]:
import pandas as pd

# Turn the scraped records into a labelled table, one row per record.
df = pd.DataFrame(
    data=output,
    columns=['Country', 'Capital', 'Population', 'Area'],
)
# Write the table to CSV without the row index (relative path, so the file
# is created in the current working directory).
df.to_csv(path_or_buf="CountryInfo.csv", index=False)