# Some Web Scraping examples:

#### Check status of request.response

In [2]:
import requests
# Probe a single page and report how the server answered.
URL = 'https://www.learningcaregroup.com/about-us/weekly-activities/'
# requests applies no timeout by default, so an unresponsive server would
# block this cell forever; bound the wait explicitly.
response = requests.get(URL, timeout=10)
if response.status_code == 200:
    print('Success!')
elif response.status_code == 404:
    print('Not Found.')
else:
    # Any other status (403, 5xx, ...) used to fall through silently.
    print(f'Unexpected status code: {response.status_code}')

Success!


#### Write a Python program to test if a given page is found or not on the server.

In [17]:
import urllib.request
from urllib.error import HTTPError
from urllib.error import URLError
# Address whose availability is being tested.
url = "https://ple.com"
try:
    page = urllib.request.urlopen(url)
except HTTPError:
    # The server was reached but answered with an error status.
    print('Http error')
except URLError:
    # HTTPError is a subclass of URLError, so this broader handler has to
    # come second; it covers failures to reach the server at all.
    print('Server not found')
else:
    # Use the response as a context manager so the underlying connection is
    # closed as soon as the body has been read.
    with page:
        print(page.read())

Server not found


#### Write a Python program to download and display the content of robots.txt for en.wikipedia.org

In [27]:
import requests
from requests.exceptions import HTTPError


# Download robots.txt and print it, reporting request failures instead of
# letting them escape the cell.
try:
    # Bound the wait; requests has no default timeout.
    response = requests.get('https://en.wikipedia.org/robots.txt', timeout=10)
    # requests.get() does not raise on 4xx/5xx by itself; without this call
    # the HTTPError handler below could never run.
    response.raise_for_status()
except HTTPError as error:
    print(f'Http Error Occured: {error}')
except requests.exceptions.RequestException as error:
    # Connection problems, timeouts and other request-level failures.
    print(f'Request failed: {error}')
else:
    print(response.text)

﻿# robots.txt for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#

# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
# and ignoring 429 ratelimit responses, claims to respect robots:
# http://mj12bot.com/
User-agent: MJ12bot
Disallow: /

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Wikipedia work bots:
User-agent: IsraBot
Disallow:

User-agent: Orthogaffe
Disallow:

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: 

#### Write a Python program to get the number of datasets currently listed on data.gov

In [49]:
import requests
from requests.exceptions import HTTPError
from bs4 import BeautifulSoup

def is_good_response(res):
    """Tell whether ``res`` is a successful HTML response.

    Args:
        res: Response-like object exposing ``status_code`` and a ``headers``
            mapping (for example the object returned by ``requests.get``).

    Returns:
        bool: True only when the status code is 200 and the Content-Type
        header contains ``html`` (case-insensitive). A response without a
        Content-Type header is treated as not good.
    """
    # Look the header up with .get(): indexing raised KeyError when the header
    # was absent, before the original None check could ever run.
    content_type = (res.headers.get('Content-Type') or '').lower()
    return res.status_code == 200 and 'html' in content_type

def getResponse(url, timeout=10):
    """Fetch ``url`` and return its body when it is a good HTML response.

    Args:
        url: Address to request with ``requests.get``.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, which can hang the notebook.

    Returns:
        The raw response content when ``is_good_response`` accepts the
        response, otherwise ``None``. Failures are reported with ``print``
        and also yield ``None``; no exception is propagated to the caller.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # requests.get() never raises HTTPError on its own; without this call
        # the HTTPError handler below was dead code.
        response.raise_for_status()
        if is_good_response(response):
            return response.content
        return None
    except HTTPError as err:
        print(f'Http error occured: {err}')
        return None
    except Exception as otherErr:
        # Deliberate catch-all so one failing page does not abort the cell.
        # The stray opening quote from the original message is removed.
        print(f'Other error occured: {otherErr}')
        return None
   
    
    
# Scrape the dataset counter from the data.gov landing page.
page = getResponse('https://www.data.gov/')
if page is not None:
    html = BeautifulSoup(page,'html.parser')
    # The counter is read from the link inside the first <small> element.
    tag = html.find_all('small')
    count_link = tag[0].find('a') if tag else None
    if count_link is None:
        # The page layout can change; report it rather than failing with an
        # IndexError/AttributeError.
        print('Could not locate the dataset count on the page.')
    else:
        dataset = count_link.text
        print(f'Number of dataset currently listed on site: {dataset}')

Number of dataset currently listed on site: 211,252 datasets


#### Write a Python program to display the name of the most recently added dataset on data.gov

In [54]:
# The catalog listing is requested sorted by creation date, newest first, so
# the first dataset heading is the most recently added one.
page = getResponse('https://catalog.data.gov/dataset?q=&sort=metadata_created+desc&as_sfid=AAAAAAWj80lAu9dZwV_56y09MGL1YplGH1Csg643-j5-dgkkrvBJG9eR-Heg_-Arx5kXqqjT0TDeAs0E1aqQG8KLGbuQOP0L6IuNAU1H7P5a4tGs8paezq16zAWCBRVHT3nQv9I%3D&as_fid=e805aac034091bf58e039220f7db38ed02f71245&ext_location=&ext_bbox=&ext_prev_extent=-142.03125%2C8.754794702435618%2C-59.0625%2C61.77312286453146')
if page is not None:
    html = BeautifulSoup(page,'html.parser')
    tag = html.find_all('h3',class_='dataset-heading')
    if tag:
        # The heading text is wrapped in newlines, which previously leaked
        # into the printed line; strip them.
        recent_dataset = tag[0].text.strip()
        print(f'Recently added dataset on site: {recent_dataset}')
    else:
        # Avoid an IndexError when the listing markup changes or is empty.
        print('No dataset headings were found on the page.')

Recently added dataset on site: 
MagnusonMatthew_A-kd5p_dataset_20200320.docx





#### Write a Python program to get the number of people visiting a U.S. government website right now

In [82]:
import json
# Pull the live analytics feed and decode its JSON body.
pages = requests.get(
    'https://analytics.usa.gov/data/live/realtime.json'
)
jdata = pages.json()
# The visitor count is read from the first record under the "data" key.
first_record = jdata["data"][0]
active_visitors = first_record["active_visitors"]
print(f'The number of people visiting a U.S. government website right now:{active_visitors}')


The number of people visiting a U.S. government website right now:256966


#### Write a Python program to get the number of security alerts issued by US-CERT in the current year.

In [95]:
# Count the alert entries listed on the US-CERT alerts page.
page = getResponse('https://www.us-cert.gov/ncas/alerts/')
# getResponse returns None on any failure; handing None to BeautifulSoup
# raises, so guard the parse as the other cells do.
if page is not None:
    html = BeautifulSoup(page,'html.parser')
    # Every element carrying hreflang="en" is counted as one alert.
    tag = html.find_all(hreflang ="en")
    total_alert = len(tag)
    print(f'The number of security alerts issued by US-CERT in the current year: {total_alert}')




the number of security alerts issued by US-CERT in the current year: 30


#### Write a Python program to list all language names and number of related articles in the order they appear in wikipedia.org.

In [103]:
import urllib.request
from bs4 import BeautifulSoup

# Download the Wikipedia portal page and parse it.
pagehtml = urllib.request.urlopen('https://www.wikipedia.org/').read()
html = BeautifulSoup(pagehtml, 'html.parser')

# Each element with class "link-box" is printed as-is, in page order.
lang = html.find_all(class_='link-box')
for link_box in lang:
    print(link_box.get_text())



English
6 066 000+ articles


Español
1 594 000+ artículos


日本語
1 202 000+ 記事


Deutsch
2 426 000+ Artikel


Русский
1 618 000+ статей


Français
2 207 000+ articles


Italiano
1 602 000+ voci


中文
1 114 000+ 條目


Português
1 029 000+ artigos


Polski
1 407 000+ haseł



##### Write a Python program to extract and display all the header tags from en.wikipedia.org/wiki/Main_Page.

In [104]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Open the page inside a context manager so the HTTP response is closed once
# BeautifulSoup has consumed it, instead of staying open for the rest of the
# session.
with urlopen('https://en.wikipedia.org/wiki/Main_Page') as html:
    bs = BeautifulSoup(html, "html.parser")
# Collect every heading element, h1 through h6, in document order.
titles = bs.find_all(['h1', 'h2','h3','h4','h5','h6'])
print('List all the header tags :', *titles, sep='\n\n')

List all the header tags :

<h1 class="firstHeading" id="firstHeading" lang="en">Main Page</h1>

<h2 id="mp-tfa-h2" style="margin:0.5em; background:#cef2e0; font-family:inherit; font-size:120%; font-weight:bold; border:1px solid #a3bfb1; color:#000; padding:0.2em 0.4em;"><span id="From_today.27s_featured_article"></span><span class="mw-headline" id="From_today's_featured_article">From today's featured article</span></h2>

<h2 id="mp-dyk-h2" style="clear:both; margin:0.5em; background:#cef2e0; font-family:inherit; font-size:120%; font-weight:bold; border:1px solid #a3bfb1; color:#000; padding:0.2em 0.4em;"><span class="mw-headline" id="Did_you_know_...">Did you know ...</span></h2>

<h2 id="mp-itn-h2" style="margin:0.5em; background:#cedff2; font-family:inherit; font-size:120%; font-weight:bold; border:1px solid #a3b0bf; color:#000; padding:0.2em 0.4em;"><span class="mw-headline" id="In_the_news">In the news</span></h2>

<h2 id="mp-otd-h2" style="clear:both; margin:0.5em; background:#ce