# Web Scraping Using BeautifulSoup

In the first part, we explore the basics of web scraping with BeautifulSoup.

In [177]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [178]:
#Request the page and print the status code to check whether the page was accessed
#code 200 means the page was fetched successfully
#timeout stops the cell from hanging forever if the server never responds
access = requests.get('https://www.whitehouse.gov/briefings-statements/', timeout=30)
print(access.status_code)

#Print the response headers returned by the website
print(access.headers)

200
{'Content-Type': 'text/html; charset=UTF-8', 'Accept-Ranges': 'bytes', 'Vary': 'Accept-Encoding', 'Content-Encoding': 'gzip', 'Date': 'Thu, 02 May 2019 17:38:09 GMT', 'Content-Length': '13464', 'Connection': 'keep-alive', 'Strict-Transport-Security': 'max-age=31536000 ; includeSubDomains ; preload', 'X-Frame-Options': 'SAMEORIGIN'}


In [183]:
#Raw bytes of the HTTP response body
source = access.content

#Parse those bytes into a BeautifulSoup tree using the lxml parser
soup = BeautifulSoup(markup=source, features='lxml')

#Collect every <a> (hyperlink) tag found on the page
links = soup.find_all(name='a')

Find all the h2 (heading 2) tags in the webpage. Access the a (hyperlink) tag within each heading, get its href attribute (the destination URL of the hyperlink), and append it to a list.

In [182]:
#Collect the destination URL of the hyperlink nested in each <h2> heading
urls = []
for h2_tag in soup.find_all('h2'):
    #href=True restricts the match to <a> tags that actually carry a link
    a_tag = h2_tag.find('a', href=True)
    if a_tag is None:
        #A heading without a hyperlink would otherwise raise AttributeError
        continue
    urls.append(a_tag['href'])

#Print the URLS
urls

['https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-strengthening-americas-cybersecurity-workforce-secure-nation-promote-prosperity/',
 'https://www.whitehouse.gov/briefings-statements/remarks-vice-president-pence-national-day-prayer-service/',
 'https://www.whitehouse.gov/briefings-statements/text-letter-president-selected-congressional-committee-leadership-regarding-defense-spending-nato-members/',
 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-approves-california-disaster-declaration-5/',
 'https://www.whitehouse.gov/briefings-statements/president-donald-j-trump-stands-democracy-venezuela/',
 'https://www.whitehouse.gov/briefings-statements/remarks-vice-president-pence-meeting-ice-baltimore-field-office-leadership/',
 'https://www.whitehouse.gov/briefings-statements/remarks-vice-president-pence-hispanic-american-police-command-officers-association-aguila-awards-luncheon-baltimore-md/',
 'https://www.whitehouse.gov/briefings-statemen

In [189]:
#Go to one of the links in the list (the third entry of urls)
#timeout stops the cell from hanging forever if the server never responds
access = requests.get(urls[2], timeout=30)

In [190]:
#Keep the raw page bytes so the next cell can parse them
source = access.content

#Show the status code and then the response headers, one per line
print(access.status_code, access.headers, sep='\n')

200
{'Content-Type': 'text/html; charset=UTF-8', 'Accept-Ranges': 'bytes', 'Vary': 'Accept-Encoding', 'Content-Encoding': 'gzip', 'Date': 'Thu, 02 May 2019 17:44:01 GMT', 'Content-Length': '13478', 'Connection': 'keep-alive', 'Strict-Transport-Security': 'max-age=31536000 ; includeSubDomains ; preload', 'X-Frame-Options': 'SAMEORIGIN'}


In [191]:
#Parse the downloaded page with Python's built-in HTML parser
soup = BeautifulSoup(markup=source, features='html.parser')

#Look up the <title> tag of the webpage and print it
print(soup.find('title'))

<title>Text of a Letter from the President to Selected Congressional Committee Leadership Regarding Defense Spending by NATO Members | The White House</title>


# Scraping Data From QZ

In [157]:
#Go to the site and create a soup object
#timeout stops the cell from hanging forever if the server never responds
site = requests.get('https://qz.com/africa/latest/', timeout=30)
soup = BeautifulSoup(site.content, 'lxml')



In [158]:
#Find all the <article> elements, where the links to articles are nested
weblinks = soup.find_all('article')

linkx = []
#From each article take the first <a> tag that has an href (the destination
#address of the hyperlink) and prefix it with the site address.
#NOTE: the hrefs are assumed to be site-relative paths such as '/africa/...'
for link in weblinks:
    url = link.find('a', href=True)
    if url is None:
        #Skip articles without a usable hyperlink instead of raising IndexError
        continue
    linkx.append('http://qz.com' + url.get('href'))

In [159]:
#See the links available
#(linkx is the list of article URLs; link is only the loop variable and
#would show a single <article> tag instead)
linkx

['http://qz.com/africa/1610723/south-africas-inequality-can-be-fixed-with-a-wealth-tax/',
 'http://qz.com/africa/1610649/caster-semenya-ruling-athlete-may-still-challenge-iaaf-regulations/',
 'http://qz.com/africa/1610360/caster-semenya-testosterone-too-high-for-female-athlete-iaaf/',
 'http://qz.com/africa/1610066/vietnams-rhino-horn-myth-drives-african-poaching/',
 'http://qz.com/africa/1609973/somalia-motorycle-hailing-app-go-launched-in-mogadishu/',
 'http://qz.com/africa/1609397/rainforest-in-ghana-dr-congo-brazil-colombia-rapidly-depleting/',
 'http://qz.com/africa/1609342/vivatech-des-investisseurs-comblent-le-fosse-entre-lafrique-francophone-et-anglophone/',
 'http://qz.com/africa/1609532/cancer-immunotherapies-for-africans-can-help-say-scienists/',
 'http://qz.com/africa/1608806/reunion-island-tops-airbnb-average-occupancy-in-africa/',
 'http://qz.com/africa/1608671/mtn-vodacom-overcharge-poor-south-africans-icasa/']

# Go through each article link and extract the text as well as other relevant info

In [168]:
#Create lists for storing the author's name, the title and the text of each article
author = []
title = []
thearticle = []

for link in linkx:
    #timeout stops the loop from hanging forever on an unresponsive page
    page = requests.get(link, timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')

    #Get the name of the author; fall back to 'Anon' when the author block is missing
    try:
        authname = soup.find(class_='d3284 africa').find('a').get_text()
    except AttributeError:
        #One of the find() calls returned None, i.e. no author element on this page
        authname = 'Anon'

    #Get the title of the article; use an empty title when the element is missing
    #so that every link still yields exactly one row
    title_ = soup.find(class_="_21349 africa none _4ca8e")
    thetitle_ = title_.get_text() if title_ is not None else ''

    #Get the main body of the article text and all of its p tags, i.e. paragraphs;
    #a page without the body element contributes an empty article
    article = soup.find(class_='_61c55')
    para = article.find_all('p') if article is not None else []

    #From each paragraph get the text only, dropping the last paragraph (advertisement)
    paragraphtext = [paragraph.get_text() for paragraph in para[:-1]]

    #Store the paragraphs, title and author of this article
    thearticle.append(paragraphtext)
    title.append(thetitle_)
    author.append(authname)

#Join all the paragraphs of each article into a single string, using a space
#as the separator so that the sentences of adjacent paragraphs do not run together
articles = [' '.join(article) for article in thearticle]

#Collect the columns and build the data frame with Pandas in a fixed column order
data = {'Title': title,
        'Author': author,
        'PageLink': linkx,
        'Article': articles}

cols = ['Title', 'Author', 'PageLink', 'Article']
news = pd.DataFrame(data=data, columns=cols)

news
    

Unnamed: 0,Title,Author,PageLink,Article
0,Economists think South Africa’s persistent ine...,"Ingrid Woolard, Stellenbosch University",http://qz.com/africa/1610723/south-africas-ine...,It’s well-established that South Africa has on...
1,Caster Semenya sees her latest defeat as a sta...,Lynsey Chutel,http://qz.com/africa/1610649/caster-semenya-ru...,"Caster Semenya rarely loses, yet her latest de..."
2,How much testosterone is really too much for a...,"Daniel Kelly, Sheffield Hallam University",http://qz.com/africa/1610360/caster-semenya-te...,"The South African athlete, Caster Semenya, has..."
3,What the Vietnamese really believe about rhino...,"Vu Hoai Nam Dang, University of Copenhagen",http://qz.com/africa/1610066/vietnams-rhino-ho...,Vietnam is one of the world’s largest consumer...
4,Mogadishu now has its first motorcycle hailing...,Abdi Latif Dahir,http://qz.com/africa/1609973/somalia-motorycle...,You can now hail a motorcycle service in the S...
5,Ghana is losing its rainforest faster than any...,Kwasi Gyamfi Asiedu,http://qz.com/africa/1609397/rainforest-in-gha...,Ghana’s rainforest is being lost at an alarmin...
6,Des investisseurs providentiels comblent le fo...,Yomi Kazeem,http://qz.com/africa/1609342/vivatech-des-inve...,Les start-ups opérant en Afrique ont fait l’ob...
7,These scientists are developing immunotherapie...,"Neelakshi Mungra, University of Cape Town",http://qz.com/africa/1609532/cancer-immunother...,The exorbitant costs of cancer drugs make it d...
8,This tropical getaway has Airbnb’s top occupie...,Abdi Latif Dahir,http://qz.com/africa/1608806/reunion-island-to...,When it comes to Airbnb occupancy rates in Afr...
9,Two of Africa’s biggest telcos have expanded a...,Lynsey Chutel,http://qz.com/africa/1608671/mtn-vodacom-overc...,Vodacom and MTN used South Africa as a springb...


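You can save the above DataFrame to a CSV file with `news.to_csv('articles.csv', index=False)` (see `pd.DataFrame.to_csv()`); passing `index=False` keeps the row index from being written as an extra unnamed column.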