# Web scraping: fetching data for COVID-19

In [2]:
import IPython

from IPython.display import HTML
from IPython.display import display

import requests

# Download the dashboard's HTML. The timeout keeps this cell from hanging
# indefinitely if the server stops responding (requests has no default timeout).
# NOTE(review): the HTML that comes back is only the dashboard's loading shell
# (title "ArcGIS Dashboards", an "initialLoadingContainer" div); the part of the
# URL after '#' is presumably handled by JavaScript in the browser — confirm.
page = requests.get(
    "https://www.arcgis.com/apps/opsdashboard/index.html#/bda7594740fd40299423467b48e9ecf6",
    timeout=30,
)
page

<Response [200]>

In [3]:
page.status_code

200

### The HTML is stored in the `content` attribute

In [4]:
page.content

b'<!DOCTYPE html>\n<html>\n\n<head>\n  <meta charset="utf-8">\n  <meta http-equiv="X-UA-Compatible" content="IE=edge">\n  <title>ArcGIS Dashboards</title>\n  <meta name="description" content>\n  <meta name="viewport" content="width=device-width, initial-scale=1">\n  <link rel="icon" href="assets/images/favicon.ico?" type="image/x-icon">\n  <link href="https://js.arcgis.com/3.32/dijit/themes/claro/claro.css" rel="stylesheet" type="text/css">\n  <link href="https://js.arcgis.com/3.32/esri/css/esri.css" rel="stylesheet" type="text/css">\n  <link rel="stylesheet" href="assets/vendor-ff6a5e0c0264e398e1ffaeb015926635.css">\n  <link rel="stylesheet" href="assets/app-light-7137f008b303d663c3645f07f162e89f.css">\n  <script src="assets/amd-config-7e9801fc9c916a27bb75c6f356e09e0d.js"></script>\n</head>\n\n<body class="claro">\n  <script src="https://js.arcgis.com/3.32/init.js" data-amd="true"></script>\n  <script src="assets/amd-loading-d8029d0343fa400ebae9865c42984750.js" data-amd-loading="true"

## Beautiful Soup
### Soupify (i.e., parse) the page object

In [5]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')

In [6]:
soup

<!DOCTYPE html>

<html>
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<title>ArcGIS Dashboards</title>
<meta content="" name="description"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="assets/images/favicon.ico?" rel="icon" type="image/x-icon"/>
<link href="https://js.arcgis.com/3.32/dijit/themes/claro/claro.css" rel="stylesheet" type="text/css"/>
<link href="https://js.arcgis.com/3.32/esri/css/esri.css" rel="stylesheet" type="text/css"/>
<link href="assets/vendor-ff6a5e0c0264e398e1ffaeb015926635.css" rel="stylesheet"/>
<link href="assets/app-light-7137f008b303d663c3645f07f162e89f.css" rel="stylesheet"/>
<script src="assets/amd-config-7e9801fc9c916a27bb75c6f356e09e0d.js"></script>
</head>
<body class="claro">
<script data-amd="true" src="https://js.arcgis.com/3.32/init.js"></script>
<script data-amd-loading="true" src="assets/amd-loading-d8029d0343fa400ebae9865c42984750.js"></script>
<div class="full-height f

In [7]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <title>
   ArcGIS Dashboards
  </title>
  <meta content="" name="description"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link href="assets/images/favicon.ico?" rel="icon" type="image/x-icon"/>
  <link href="https://js.arcgis.com/3.32/dijit/themes/claro/claro.css" rel="stylesheet" type="text/css"/>
  <link href="https://js.arcgis.com/3.32/esri/css/esri.css" rel="stylesheet" type="text/css"/>
  <link href="assets/vendor-ff6a5e0c0264e398e1ffaeb015926635.css" rel="stylesheet"/>
  <link href="assets/app-light-7137f008b303d663c3645f07f162e89f.css" rel="stylesheet"/>
  <script src="assets/amd-config-7e9801fc9c916a27bb75c6f356e09e0d.js">
  </script>
 </head>
 <body class="claro">
  <script data-amd="true" src="https://js.arcgis.com/3.32/init.js">
  </script>
  <script data-amd-loading="true" src="assets/amd-loading-d8029d0343fa400ebae9865c4298475

### Here are some simple ways to navigate that data structure:

In [8]:
soup.title

<title>ArcGIS Dashboards</title>

In [9]:
soup.title.name

'title'

In [10]:
soup.title.string

'ArcGIS Dashboards'

In [11]:
soup.title.parent.name

'head'

In [12]:
# No output below: this page has no <p> tag, so soup.p evaluates to None.
soup.p

---
### Examine all elements at the top level of the page using the children property of soup.
##### Note: the `children` property returns an iterator rather than a list, so we need to call the `list` function on it.

In [13]:
list(soup.children)

['html', '\n', <html>
 <head>
 <meta charset="utf-8"/>
 <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
 <title>ArcGIS Dashboards</title>
 <meta content="" name="description"/>
 <meta content="width=device-width, initial-scale=1" name="viewport"/>
 <link href="assets/images/favicon.ico?" rel="icon" type="image/x-icon"/>
 <link href="https://js.arcgis.com/3.32/dijit/themes/claro/claro.css" rel="stylesheet" type="text/css"/>
 <link href="https://js.arcgis.com/3.32/esri/css/esri.css" rel="stylesheet" type="text/css"/>
 <link href="assets/vendor-ff6a5e0c0264e398e1ffaeb015926635.css" rel="stylesheet"/>
 <link href="assets/app-light-7137f008b303d663c3645f07f162e89f.css" rel="stylesheet"/>
 <script src="assets/amd-config-7e9801fc9c916a27bb75c6f356e09e0d.js"></script>
 </head>
 <body class="claro">
 <script data-amd="true" src="https://js.arcgis.com/3.32/init.js"></script>
 <script data-amd-loading="true" src="assets/amd-loading-d8029d0343fa400ebae9865c42984750.js"></script>
 <div class

#### The first line of the output tells us that there are two top-level elements on the page (a doctype declaration and a tag)
* !DOCTYPE html 
* html 
* There is a newline (\n) in the list as well

---
### Examine the type of element in the top-level-tags list

In [14]:
# Class of every top-level node; the children iterator can be consumed directly.
[type(node) for node in soup.children]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

#### "bs4." denotes that each element in the list is a BeautifulSoup object
* Doctype - information about the type of document 
* NavigableString - represents text found in the html document
* Tag - the important one ;) often contains other nested tags. 
---
### Select the html tag and its children by taking the third item in the list
##### Note: indexing begins at 0, so the third element is at index 2

In [15]:
html = list(soup.children)[2]

### Find the children nested within the html tag

In [16]:
list(html.children)

['\n', <head>
 <meta charset="utf-8"/>
 <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
 <title>ArcGIS Dashboards</title>
 <meta content="" name="description"/>
 <meta content="width=device-width, initial-scale=1" name="viewport"/>
 <link href="assets/images/favicon.ico?" rel="icon" type="image/x-icon"/>
 <link href="https://js.arcgis.com/3.32/dijit/themes/claro/claro.css" rel="stylesheet" type="text/css"/>
 <link href="https://js.arcgis.com/3.32/esri/css/esri.css" rel="stylesheet" type="text/css"/>
 <link href="assets/vendor-ff6a5e0c0264e398e1ffaeb015926635.css" rel="stylesheet"/>
 <link href="assets/app-light-7137f008b303d663c3645f07f162e89f.css" rel="stylesheet"/>
 <script src="assets/amd-config-7e9801fc9c916a27bb75c6f356e09e0d.js"></script>
 </head>, '\n', <body class="claro">
 <script data-amd="true" src="https://js.arcgis.com/3.32/init.js"></script>
 <script data-amd-loading="true" src="assets/amd-loading-d8029d0343fa400ebae9865c42984750.js"></script>
 <div class="full-hei

In [17]:
# Class of each direct child of the html tag; the children iterator can be consumed directly.
[type(node) for node in html.children]

[bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString]

### There are two tags here
* head 
* body 
---
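### We can look for the p tag by examining the children of the body tag
##### Note: this dashboard page contains no p tags (`soup.p` earlier returned nothing), so only script and div tags appear below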

In [18]:
# html's children are ['\n', <head>, '\n', <body>, '\n'], so the <body> tag sits at index 3.
body = list(html.children)[3]

In [19]:
list(body.children)

['\n',
 <script data-amd="true" src="https://js.arcgis.com/3.32/init.js"></script>,
 '\n',
 <script data-amd-loading="true" src="assets/amd-loading-d8029d0343fa400ebae9865c42984750.js"></script>,
 '\n',
 <div class="full-height flex-vertical flex-justify-center flex-align-items-center" id="initialLoadingContainer">
 <div class="loader is-active">
 <div class="loader-bars"></div>
 </div>
 </div>,
 '\n']

In [20]:
# Class of each direct child of the body tag; the children iterator can be consumed directly.
[type(node) for node in body.children]

[bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString,
 bs4.element.Tag,
 bs4.element.NavigableString]

### Isolate the first tag inside body (on this page it is a script tag, not a p tag)

In [21]:
# NOTE: on this page, index 1 of body's children is the first <script> tag, not a <p>
# (the page has no <p> tags). The name `p` is kept because the next cell uses it.
p = list(body.children)[1]

### Once isolated, extract all of the text using the `get_text` method

In [22]:
p.get_text()

''

## Finding all instances of a tag

In [23]:
# Re-parse the response and collect every <p> tag in the document.
# The result is an empty list because this page contains no <p> tags.
soup = BeautifulSoup(page.content, 'html.parser')
soup.find_all('p')

[]

##### Note: find_all returns a list, so we have to loop through it or use list indexing to extract text. 
---
#### The find method will return a single BeautifulSoup object with the first instance of a tag (or `None` if there is no match)

In [24]:
# No output: find() returns None when nothing matches, and this page has no <p> tag.
soup.find('p')

## Searching for tags by class and id
##### Note: These properties are used to uniformly apply certain styles of formatting for related parts of a webpage

In [25]:
# Switch to a small static example page whose <p> tags carry class and id attributes.
# The timeout keeps the cell from hanging indefinitely if the server does not respond.
page = requests.get(
    "http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html",
    timeout=30,
)
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
 </body>
</html>


#### search for any p tag that has the class outer-text

In [26]:
soup.find_all('p', class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

#### Search for all elements with the id `first`

In [27]:
soup.find_all(id="first")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

In [28]:
# select() takes a CSS selector; "div p" matches every <p> nested inside a <div>.
soup.select("div p")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>]

In [30]:
# Re-fetch the dashboard page; the timeout avoids hanging forever on a stalled connection.
page = requests.get(
    "https://www.arcgis.com/apps/opsdashboard/index.html#/bda7594740fd40299423467b48e9ecf6",
    timeout=30,
)
# NOTE(review): the lines below are intentionally disabled. The id and class they
# look for ('seven-day-forecast', 'tombstone-container') do not appear in the HTML
# this notebook retrieved from the dashboard, so find() would presumably return
# None and the following call would fail — confirm before enabling.
#soup = BeautifulSoup(page.content, 'html.parser')  
#seven_day = soup.find(id='seven-day-forecast')
#forecast_items = seven_day.find_all(class_='tombstone-container')
#tonight = forecast_items[0]
#print(tonight.prettify())