In [1]:
# The data Robin wants to collect from this particular website is the most recent news article along with its summary. 
# Remember, the code for this will eventually be used in an application that will scrape live data with 
# the click of a button—this site is dynamic and the articles will change frequently, 
# which is why Robin is removing the manual task of retrieving each new article.

In [17]:
# Import urljoin (standard library), Pandas, BeautifulSoup and Splinter
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# Set up Splinter: prepare the automated browser, using Chrome.
# ChromeDriverManager().install() supplies the ChromeDriver location that is
# handed to Splinter as its `executable_path` keyword argument.
executable_path = dict(executable_path=ChromeDriverManager().install())

# headless=False is passed so the browser is not run in headless mode.
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - 

[WDM] - Current google-chrome version is 95.0.4638
[WDM] - Get LATEST driver version for 95.0.4638
[WDM] - Driver [C:\Users\lekan\.wdm\drivers\chromedriver\win32\95.0.4638.17\chromedriver.exe] found in cache


In [4]:
# Visit the Mars NASA news site.
# Store the address in `url`, then have Splinter navigate the automated browser to it.
url = 'https://redplanetscience.com'
browser.visit(url)
# Optional delay for loading the page.
# Dynamic pages can take a little while to load (especially image-heavy ones), so
# allow up to one second for a `div.list_text` element to show up on the page.
# The check's result is the cell's last expression, so it is displayed as the output.
browser.is_element_present_by_css('div.list_text', wait_time=1)

True

In [5]:
# Parse the page with BeautifulSoup.
# `browser.html` is the HTML of the page the automated browser has loaded;
# BeautifulSoup parses that text and stores it as an object that can be searched.
html = browser.html
news_soup = soup(markup=html, features='html.parser')

# Keep the first element matching the CSS selector `div.list_text`; the title and
# summary lookups in the following cells search inside this element.
slide_elem = news_soup.select_one('div.list_text')

In [6]:
# Look up the element that holds the article title. Nothing is assigned here; the
# title and summary text are saved to variables in the cells that follow.
slide_elem.find('div', class_='content_title')

# In this line of code, .find is chained onto the previously assigned variable, slide_elem.
# Doing so says, "This variable holds a ton of information,
# so look inside of that information to find this specific data."
# The data we're looking for is the content title, which is specified by saying,
# "The specific data is in a <div /> with a class of 'content_title'."

# The output should be the HTML containing the content title and anything else nested inside of that <div />.

<div class="content_title">NASA Invites Students to Name Mars 2020 Rover</div>

In [7]:
# Scrape First Article Title (Most Recent Article)

# The title is inside the HTML returned by the <div /> lookup, but only the text
# is needed; the surrounding markup is not. The lookup is therefore done in two steps.

# Step 1: inside the parent element, find the <div /> with class 'content_title'.
title_elem = slide_elem.find('div', class_='content_title')

# Step 2: .get_text() returns only the text of the element, without any of the
# HTML tags or elements. That text is saved as `news_title`.
news_title = title_elem.get_text()
news_title

'NASA Invites Students to Name Mars 2020 Rover'

In [8]:
# Scrape First Article Summary

# Inside the parent element, find the <div /> with class 'article_teaser_body',
# then keep only its text (no HTML tags) as `news_p`.
teaser_elem = slide_elem.find('div', class_='article_teaser_body')
news_p = teaser_elem.get_text()
news_p

"Through Nov. 1, K-12 students in the U.S. are encouraged to enter an essay contest to name NASA's next Mars rover."

### Featured Images

In [9]:
# Robin's next step is to write scraping code that gathers the featured image 
# from the Jet Propulsion Laboratory's Space Images webpage

# The first image that pops up on the webpage is the featured image. Robin wants the full-size version of this image, 
# so we know we'll want Splinter to click the "Full Image" button. From there, the page directs us to a slideshow. 
# It's a little closer to getting the full-size feature image, but we aren't quite there yet.

# This is a lot of clicking to get to the image we want. Let's start getting our code ready to automate all of the clicks.

In [11]:
# Visit the space images site.
# `url` is reassigned to the new address before the automated browser navigates to it.
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [12]:
# Next, click the "Full Image" button, which directs the browser to an image slideshow.

# Collect the <button> elements Splinter finds on the current page.
buttons = browser.find_by_tag('button')

# Index 1 picks the second button found.
# NOTE(review): this depends on the order of buttons on the page; confirm it still
# points at "Full Image" if the site's layout changes.
full_image_elem = buttons[1]
full_image_elem.click()

In [13]:
# After the click, the automated browser holds new content, so its HTML has to be
# read and parsed again before the full-size image URL can be scraped.

# Parse the resulting html with soup
html = browser.html
img_soup = soup(markup=html, features='html.parser')

In [14]:
# Find the relative image url

# Rather than grabbing the URL directly, BeautifulSoup is pointed at where the
# image will be: the <img /> with class 'fancybox-image'. The link is then read
# from that element's `src` attribute.
featured_img = img_soup.find('img', class_='fancybox-image')
img_url_rel = featured_img.get('src')
img_url_rel

'image/featured/mars2.jpg'

In [15]:
# The URL above is incomplete: it is relative to the site, so it must be resolved
# against the base URL to create an absolute URL.
# urljoin is used instead of plain string formatting so the result stays a valid
# URL even if the page serves a root-relative ('/image/...') or already-absolute
# `src`; for the relative value shown above the result is the same as before.
img_url = urljoin('https://spaceimages-mars.com/', img_url_rel)
img_url

'https://spaceimages-mars.com/image/featured/mars2.jpg'

In [16]:
# The next bit of information Robin wants to have included in her app is a collection of Mars facts. 
# With news articles and high-quality images, a collection of facts is a solid addition to her web app.
# She's collecting this data from https://galaxyfacts-mars.com/
# Robin already has a great photo and an article, so all she wants from this page is the table.
# Tables in HTML are basically made up of many smaller containers. The main container is the <table /> tag. 
# Inside the table is <tbody />, which is the body of the table—the headers, columns, and rows.
# <tr /> is the tag for each table row. Within that tag, the table data is stored in <td /> tags. 
# This is where the columns are established.

In [18]:
# Instead of scraping each row, or the data in each <td />, scrape the entire table
# with Pandas' .read_html() function, which returns the tables it finds as a list.
tables = pd.read_html('https://galaxyfacts-mars.com')

# Work with the first table found, name its three columns, and use the
# 'description' column as the index.
df = tables[0]
df.columns = ['description', 'Mars', 'Earth']
df = df.set_index('description')
df

Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [19]:
# Our data is live—if the table is updated, then we want that change to appear in Robin's app also.
# Pandas can convert the DataFrame back into HTML-ready code with the .to_html() function.
# The result is only displayed here as the cell's output; it is not saved to a variable.

df.to_html()


'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

In [21]:
# Now that everything on Robin's list has been gathered, end the automated browsing session.
# This is an important line to add to the web app as well. Without it, the automated browser
# won't know to shut down—it will continue to listen for instructions and use the computer's
# resources (which may put a strain on memory or a laptop's battery if left on).

browser.quit()

In [None]:
# We can't fully automate the scraping from within the Jupyter Notebook. To do that,
# the notebook will need to be converted into a .py file.
# The code won't transition over perfectly—we'll need to clean it up a bit—but that is an easier task
# than copying each cell and pasting it over in the correct order.