<a href="https://colab.research.google.com/github/QuothTheRaven42/learning_files/blob/master/webscraping_with_dataquest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# More web scraping practice before the Thinkful bootcamp starts

In [1]:
# download a web page
import requests

# response codes starting with 2 indicate success;
# codes starting with 4 or 5 indicate an error
# timeout: requests waits indefinitely by default, so always bound the call
page = requests.get('http://dataquestio.github.io/web-scraping-pages/simple.html', timeout=10)
page.status_code

200

In [2]:
# raw response body; the b'...' prefix in the output shows it is bytes, not str
page.content

b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'

In [3]:
from bs4 import BeautifulSoup as bs

# parse the HTML that was already downloaded; the first argument is the
# markup itself (here the response bytes), not a URL to fetch
soup = bs(page.content, 'html.parser')

# prettify() re-indents the parsed HTML, one tag per line, for readability
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [4]:
# .children is a lazy iterator over the direct child nodes; wrap it in list() to see them
list(soup.children)

['html', '\n', <html>
 <head>
 <title>A simple example page</title>
 </head>
 <body>
 <p>Here is some simple content for this page.</p>
 </body>
 </html>]

In [5]:
# each top-level node is a bs4.element object (see the types in the output)
[type(node) for node in soup.children]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [6]:
# find_all returns a list-like collection of every match, so index it to get a single tag
soup.find_all('p')[0].get_text()

'Here is some simple content for this page.'

In [7]:
# find returns only the first occurrence, as a single tag rather than a list
soup.find('p').get_text()

'Here is some simple content for this page.'

# **Searching by class and id**

In [8]:
# fetch the practice page whose tags carry class and id attributes
# (timeout bounds the wait; requests has no default timeout)
page = requests.get('http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html', timeout=10)
soup = bs(page.content, 'html.parser')
print(soup)

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                First paragraph.
            </p>
<p class="inner-text">
                Second paragraph.
            </p>
</div>
<p class="outer-text first-item" id="second">
<b>
                First outer paragraph.
            </b>
</p>
<p class="outer-text">
<b>
                Second outer paragraph.
            </b>
</p>
</body>
</html>


In [9]:
# find every 'p' tag that has the 'outer-text' class
soup.find_all('p', 'outer-text')
# a tag matches if 'outer-text' is any one of its classes, so
# class="outer-text first-item" is included too

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [10]:
# the result is list-like, so it can be indexed directly
soup.find_all('p', 'outer-text')[1].get_text().strip()
# get_text() returns the tag's text; strip() removes the surrounding newlines and spaces

'Second outer paragraph.'

In [11]:
# with no tag name, pass the class through the class_ keyword
# (class is a reserved word in Python, hence the trailing underscore)
soup.find_all(class_='outer-text')

[<p class="outer-text first-item" id="second">
 <b>
                 First outer paragraph.
             </b>
 </p>, <p class="outer-text">
 <b>
                 Second outer paragraph.
             </b>
 </p>]

In [12]:
# search by id attribute; the result is still a list
soup.find_all(id='first')

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

# **Using CSS Selectors**

* *p a* — finds all a tags inside of a p tag.
* *body p a* — finds all a tags inside of a p tag inside of a body tag.
* *html body* — finds all body tags inside of an html tag.
* *p.outer-text* — finds all p tags with a class of outer-text.
* *p#first* — finds all p tags with an id of first.
* *body p.outer-text* — finds any p tags with a class of outer-text inside of a body tag.



In [13]:
# select() searches the page with a CSS selector
soup.select('div p') # every p tag nested inside a div
# the matches come back as a list

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>, <p class="inner-text">
                 Second paragraph.
             </p>]

In [14]:
# text of the first match, with surrounding whitespace stripped
soup.select('div p')[0].get_text().strip()

'First paragraph.'

# **Downloading Weather Data**

In [15]:
# download the forecast.weather.gov page for the chosen lat/lon
# (timeout bounds the wait; requests has no default timeout)
page = requests.get('https://forecast.weather.gov/MapClick.php?lat=32.1378&lon=-94.8023#.Xw-zip5KjBU', timeout=10)
page.status_code

200

In [16]:
# parse the forecast page; this rebinds soup, replacing the practice page
soup = bs(page.content, 'html.parser')

In [17]:
# unused scratch: list(soup.children); tombstones.find_all('p', 'period-name')

In [18]:
# period label; index 1 is the 'Tonight' entry in the output printed below
# NOTE(review): index 0 is skipped here — presumably a non-forecast entry; confirm against the page
tombstones1 = soup.select('div p.period-name')[1].get_text()

In [19]:
# short description, taken at index 1 like the period-name lookup
tombstones2 = soup.select('div p.short-desc')[1].get_text()

In [20]:
# first low temperature on the page; note the index is 0 here, not 1
tombstones3 = soup.select('div p.temp.temp-low')[0].get_text()

In [21]:
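# abandoned experiment, kept for reference (tombstones is not defined anywhere in this notebook as shown)
# for each in range(1,8):
#     if 'Tonight' in tombstones[each].get_text() or 'Today' in tombstones[each].get_text():
#         print(tombstones[each].get_text().strip())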

In [22]:
# show the period, its description and its low temperature, one per line
print('\n'.join((tombstones1, tombstones2, tombstones3)))

Tonight
Partly Cloudy
Low: 77 °F


In [23]:
# first img tag anywhere on the page
soup.find('img')

<img alt="National Oceanic and Atmospheric Administration" src="/css/images/header_noaa.png"/>

In [24]:
# narrow the search to the seven-day forecast panel
week = soup.find(id='seven-day-forecast')
# each forecast period sits in its own 'tombstone-container' div;
# index 1 is 'Tonight' in the output below
today = week.find_all(class_='tombstone-container')[1]
print(today.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Tonight
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Tonight: Partly cloudy, with a low around 77. South wind 5 to 10 mph. " class="forecast-icon" src="newimages/medium/nsct.png" title="Tonight: Partly cloudy, with a low around 77. South wind 5 to 10 mph. "/>
 </p>
 <p class="short-desc">
  Partly Cloudy
 </p>
 <p class="temp temp-low">
  Low: 77 °F
 </p>
</div>


In [25]:
# period label of the selected tombstone
today.find(class_='period-name').get_text()

'Tonight'

In [26]:
# temperature line; the tag's classes are 'temp temp-low', so class_='temp' matches it
today.find(class_='temp').get_text()

'Low: 77 °F'

In [27]:
# short forecast description
today.find(class_='short-desc').get_text()

'Partly Cloudy'

In [28]:
# tag attributes are read with dict-style indexing; the img title holds the full forecast sentence
today.find('img')['title']

'Tonight: Partly cloudy, with a low around 77. South wind 5 to 10 mph. '

In [29]:
# dump the whole forecast panel to inspect its structure
print(week.prettify())

<div class="panel panel-default" id="seven-day-forecast">
 <div class="panel-heading">
  <b>
   Extended Forecast for
  </b>
  <h2 class="panel-title">
   Henderson TX
  </h2>
 </div>
 <div class="panel-body" id="seven-day-forecast-body">
  <div id="seven-day-forecast-container">
   <div class="current-hazard" id="headline-container" style="margin-left: 124px">
    <div id="headline-separator" style="top: 34px; height: 131px">
    </div>
    <div id="headline-info" onclick="$('#headline-detail').toggle(); $('#headline-detail-now').hide()" style="margin-top: 5px">
     <div id="headline-detail">
      <div>
       Heat Advisory until July 16, 07:00pm
      </div>
     </div>
     <span class="fa fa-info-circle">
     </span>
     Click here for hazard details and duration
    </div>
    <div class="headline-bar headline-advisory " style="top: 40px; left: 19px; height: 125px; width: 239px">
     <div class="headline-title">
      Heat Advisory
     </div>
    </div>
   </div>
   <ul clas

In [30]:
# collect the period labels; the first entry is dropped
# NOTE(review): labels such as 'ThursdayNight' have no space — presumably
# get_text() joins the text on either side of a <br/> with no separator; confirm
periods = week.find_all(class_='period-name')
days = [label.get_text() for label in periods[1:]]
print(days)

['Tonight', 'Thursday', 'ThursdayNight', 'Friday', 'FridayNight', 'Saturday', 'SaturdayNight', 'Sunday']


In [31]:
# temperature text for every matching tag (no entry is skipped here)
temps = week.find_all(class_='temp')
temp = [tag.get_text() for tag in temps]
print(temp)

['Low: 77 °F', 'High: 95 °F', 'Low: 75 °F', 'High: 95 °F', 'Low: 75 °F', 'High: 95 °F', 'Low: 75 °F', 'High: 94 °F']


In [32]:
# short descriptions, with the first entry dropped
descs = week.find_all(class_='short-desc')
desc = [tag.get_text() for tag in descs[1:]]
print(desc)

['Partly Cloudy', 'Hot', 'Mostly Clear', 'Sunny thenSlight ChanceT-storms', 'Mostly Clear', 'Hot', 'Mostly Clear', 'Sunny']


In [33]:
# full forecast sentence for each period, taken from each image's title attribute;
# the first image is skipped
# (the list is named icons rather than all, which would shadow the built-in all())
icons = week.find_all('img')
full = [icon['title'] for icon in icons[1:]]

In [34]:
import pandas as pd

# one column per scraped list; all four lists must have the same length
dictionary = dict(day=days, temp=temp, weather=desc, full=full)

df = pd.DataFrame(dictionary)
# set_index returns a new frame for display only; df itself keeps its default index
df.set_index('day')

Unnamed: 0_level_0,temp,weather,full
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tonight,Low: 77 °F,Partly Cloudy,"Tonight: Partly cloudy, with a low around 77. ..."
Thursday,High: 95 °F,Hot,"Thursday: Mostly sunny and hot, with a high ne..."
ThursdayNight,Low: 75 °F,Mostly Clear,"Thursday Night: Mostly clear, with a low aroun..."
Friday,High: 95 °F,Sunny thenSlight ChanceT-storms,Friday: A 20 percent chance of showers and thu...
FridayNight,Low: 75 °F,Mostly Clear,"Friday Night: Mostly clear, with a low around ..."
Saturday,High: 95 °F,Hot,"Saturday: Sunny and hot, with a high near 95."
SaturdayNight,Low: 75 °F,Mostly Clear,"Saturday Night: Mostly clear, with a low aroun..."
Sunday,High: 94 °F,Sunny,"Sunday: Sunny, with a high near 94."


In [35]:
# pull the digits out of strings like 'Low: 77 °F' into an integer column
# raw string: '\d' in a normal string literal is an invalid escape sequence
temp_nums = df["temp"].str.extract(r"(?P<temp_num>\d+)", expand=False)
df["temp_num"] = temp_nums.astype('int')
df

Unnamed: 0,day,temp,weather,full,temp_num
0,Tonight,Low: 77 °F,Partly Cloudy,"Tonight: Partly cloudy, with a low around 77. ...",77
1,Thursday,High: 95 °F,Hot,"Thursday: Mostly sunny and hot, with a high ne...",95
2,ThursdayNight,Low: 75 °F,Mostly Clear,"Thursday Night: Mostly clear, with a low aroun...",75
3,Friday,High: 95 °F,Sunny thenSlight ChanceT-storms,Friday: A 20 percent chance of showers and thu...,95
4,FridayNight,Low: 75 °F,Mostly Clear,"Friday Night: Mostly clear, with a low around ...",75
5,Saturday,High: 95 °F,Hot,"Saturday: Sunny and hot, with a high near 95.",95
6,SaturdayNight,Low: 75 °F,Mostly Clear,"Saturday Night: Mostly clear, with a low aroun...",75
7,Sunday,High: 94 °F,Sunny,"Sunday: Sunny, with a high near 94.",94


In [36]:
# average daily high
# select rows by their 'High' label instead of by position ([1::2]), so the
# result stays correct when the forecast starts with a daytime period
df.loc[df['temp'].str.contains('High'), 'temp_num'].mean()

94.75

In [37]:
# keep only the overnight periods, i.e. rows whose temp text is a 'Low'
is_night = df['temp'].str.contains('Low')
df.loc[is_night]

Unnamed: 0,day,temp,weather,full,temp_num
0,Tonight,Low: 77 °F,Partly Cloudy,"Tonight: Partly cloudy, with a low around 77. ...",77
2,ThursdayNight,Low: 75 °F,Mostly Clear,"Thursday Night: Mostly clear, with a low aroun...",75
4,FridayNight,Low: 75 °F,Mostly Clear,"Friday Night: Mostly clear, with a low around ...",75
6,SaturdayNight,Low: 75 °F,Mostly Clear,"Saturday Night: Mostly clear, with a low aroun...",75
