In [22]:
# Import Splinter and BeautifulSoup
import shutil

import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser

In [23]:
# Locate chromedriver and start a Chrome session through Splinter.
# The original hard-coded location is tried first, so setups that already
# worked behave exactly as before. If nothing executable is found there, fall
# back to whichever `chromedriver` is on PATH. If neither exists, the default
# path is passed through unchanged so the driver layer reports the missing
# executable itself.
DEFAULT_CHROMEDRIVER = '/usr/local/bin/chromedriver'
chromedriver_path = (
    shutil.which(DEFAULT_CHROMEDRIVER)
    or shutil.which('chromedriver')
    or DEFAULT_CHROMEDRIVER
)
executable_path = {'executable_path': chromedriver_path}
browser = Browser('chrome', **executable_path)

In [24]:
# Open the NASA Mars news page in the automated browser.
url = "https://mars.nasa.gov/news/"
browser.visit(url)

# Dynamic pages can take a moment to load, so allow up to one second for an
# <li class="slide"> nested in a <ul class="item_list"> to show up before the
# page is parsed. The call evaluates to True when such an element is present.
browser.is_element_present_by_css('ul.item_list li.slide', wait_time=1)

True

In [25]:
# Take the page source from the browser and parse it with BeautifulSoup, then
# keep the first <li class="slide"> found under <ul class="item_list"> as the
# parent element for the searches that follow.
html = browser.html
news_soup = soup(html, features='html.parser')
slide_elem = news_soup.select_one("ul.item_list li.slide")

In [26]:
# Notice that slide_elem was assigned by searching for the <ul /> tag and, among its
# descendants (the tags nested inside the <ul /> element), the <li /> tags.
# slide_elem is our parent element: it holds all of the other elements we need, and
# we'll reference it when we want to filter search results even further. In a CSS
# selector the . selects by class, such as item_list, so 'ul.item_list li.slide'
# pinpoints an <li /> tag with the class slide that sits inside a <ul /> tag with
# the class item_list. CSS selectors are evaluated from right to left: the
# rightmost part (li.slide) names the element that is returned, and the parts to
# its left only describe where that element must be nested. Because of this,
# select_one returns the first matching <li /> element with a class of slide,
# together with all of the elements nested within it.

In [27]:
# Look inside the parent element for the <div> that holds the headline.
# Displaying the tag first lets us inspect its markup before extracting text.
slide_elem.find('div', class_="content_title")

<div class="content_title"><a href="/news/8882/nasas-perseverance-drives-on-mars-terrain-for-first-time/" target="_self">NASA's Perseverance Drives on Mars' Terrain for First Time</a></div>

In [28]:
# Extract the headline text from the content_title <div> (the text of the <a>
# tag nested inside it) and keep it as `news_title`.
news_title = slide_elem.find('div', class_="content_title").get_text()
news_title

"NASA's Perseverance Drives on Mars' Terrain for First Time"

In [29]:
# Extract the teaser paragraph text from the article_teaser_body <div> of the
# same parent element and keep it as `news_p`.
news_p = slide_elem.find("div", class_='article_teaser_body').get_text()
news_p

'The first trek of the agency’s largest, most advanced rover yet on the Red Planet marks a major milestone before science operations get under way.'

In [30]:
# 10.3.4 Scrape Mars data: featured image
 ### Featured Images

 ### Featured Images
    #  Change the format of this cell from "Code" to "Markdown" with the dropdown at the top of the Jupyter notebook

In [31]:
# Point the browser at the JPL Space Images page.
url = "https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html"
browser.visit(url)

In [32]:
# find_by_tag returns every <button> on the page; the [1] chained onto the
# call picks the second one, which is the full image button we want to click.
full_image_elem = browser.find_by_tag("button")[1]
full_image_elem.click()

In [33]:
# Grab the page source as it stands after the click and parse it with
# BeautifulSoup.
html = browser.html
img_soup = soup(html, features='html.parser')


In [34]:
# Locate the <img> tag carrying the fancybox-image class and read its `src`
# attribute, which holds the image link as a relative path.
img_url_rel = img_soup.find("img", class_="fancybox-image").get("src")
img_url_rel

'image/featured/mars3.jpg'

In [35]:
# Prefix the relative path with the site's base URL to get an absolute URL.
img_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{}'.format(img_url_rel)
img_url

'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/image/featured/mars3.jpg'

In [36]:
# pd.read_html() returns a list with one DataFrame per <table> it finds on the
# page; [0] keeps only the first table.
df = pd.read_html('http://space-facts.com/mars/')[0]
# Give the two columns readable names.
df.columns = ['description', 'value']
# Use the description column as the row index. set_index() returns a new
# DataFrame, so `df` is rebound to the result instead of being modified in place.
df = df.set_index('description')
df

Unnamed: 0_level_0,value
description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [37]:
# Convert the DataFrame back into an HTML <table> string with .to_html(), so
# the facts table can be dropped into a web page.
df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>value</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\

In [38]:
# End the automated browser session started at the top of the notebook.
# This line is important to carry over into the web app as well.
browser.quit()