In [28]:
# importing dependencies

from splinter import Browser

from bs4 import BeautifulSoup as soupy

import pandas as pd

In [2]:
# Start a Chrome session through splinter, pointing it at the chromedriver executable.
browser = Browser('chrome', executable_path='chromedriver')


In [3]:
# Visit the NASA Mars news site (its terms of service are more lenient about web scraping).
nasa_url = 'https://mars.nasa.gov/news/'
browser.visit(nasa_url)

# Optional delay while the page loads.
# The selector "ul.item_list li.slide" combines tags (ul, li) with classes
# (item_list, slide): it matches <li class="slide"> inside <ul class="item_list">.
# wait_time=1 gives heavy dynamic pages a 1-second allowance.
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

True

In [4]:
# Hand the browser's current HTML to BeautifulSoup for parsing.
parse_html = browser.html
news_soup = soupy(parse_html, 'html.parser')

# Parent element: holds the other elements that are filtered further in later cells.
# CSS selectors are read right to left, so <li class='slide'> is matched first.
slide_elem = news_soup.select_one('ul.item_list li.slide')

In [12]:
# .find yields only the first element with this class;
# .find_all would yield every element with this class instead.
slide_elem.find('div', class_='content_title')

<div class="content_title"><a href="/news/8724/nasa-ula-launch-mars-2020-perseverance-rover-mission-to-red-planet/" target="_self">NASA, ULA Launch Mars 2020 Perseverance Rover Mission to Red Planet</a></div>

In [11]:
# Latest news title: locate the title <div>, then keep only its text.
title_div = slide_elem.find('div', class_='content_title')
news_title = title_div.get_text()

news_title

'NASA, ULA Launch Mars 2020 Perseverance Rover Mission to Red Planet'

In [13]:
# Latest news summary: locate the teaser <div>, then keep only its text.
teaser_div = slide_elem.find('div', class_='article_teaser_body')
news_teaser_sum = teaser_div.get_text()

news_teaser_sum


"The agency's Mars 2020 mission is on its way. It will land at Jezero Crater in about seven months, on Feb. 18, 2021. "

### Featured Images

In [19]:
# Goal: automate splinter to reach the full-size featured image.

# Visit the JPL space-images page (Mars category) to retrieve the featured image.

url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

In [20]:
# Find the element with id 'full_image' and click it to get to the featured image.
full_image_elem = browser.find_by_id('full_image')

full_image_elem.click()


In [24]:
# The 'more info' link is checked for, found, stored as a reference, and then clicked.

browser.is_element_present_by_text('more info', wait_time=1)
# Searches for an element with the given text and returns a bool
# (the result is not used here; the call only serves as a wait/check step).

more_info_elem = browser.links.find_by_partial_text('more info')
# Uses the partial text 'more info' to find the associated link.

more_info_elem.click()

In [23]:
# Parse the HTML of the current page with BeautifulSoup so the
# full-size image URL can be scraped from it.
parse_html = browser.html
full_img_soup = soupy(parse_html, 'html.parser')


In [27]:
# Look the image up by its position in the page (figure.lede > a > img) rather than
# by a fixed URL, so this picks up whichever image is currently featured.
featured_img_tag = full_img_soup.select_one('figure.lede a img')
latest_image_full = featured_img_tag.get("src")

# The src value is appended to the JPL site root to form the full URL.
latest_imgurl = f"https://www.jpl.nasa.gov{latest_image_full}"

# The resulting link changes whenever the featured image changes.
latest_imgurl

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA17832_hires.jpg'

In [35]:
# pd.read_html pulls every table on the given page into a list of DataFrames;
# index [0] narrows that down to the first table (the Mars facts table).
facts_tables = pd.read_html('https://space-facts.com/mars/')
mars_df = facts_tables[0]

# Name the two columns, then use 'description' as the index.
mars_df.columns = ['description', 'value']
mars_df = mars_df.set_index('description')

mars_df


Unnamed: 0_level_0,value
description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [36]:
# In case the DataFrame needs to be converted back to an HTML table string.
mars_df.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>value</th>\n    </tr>\n    <tr>\n      <th>description</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Equatorial Diameter:</th>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>Polar Diameter:</th>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>Orbit Distance:</th>\n      <td>227,943,824 km (1.38 AU)</td>\n    </tr>\n    <tr>\n      <th>Orbit Period:</th>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>Surface Temperature:</th>\n      <td>-87 to -5 °C</td>\n    </tr>\n    <tr>\n      <th>First Record:</th>\n      <td>2nd millennium BC</td>\n    </tr>\n    <tr>\n      <th>Recorded By:</th>\n      <td>Egyptian astronomers</td>\n    </tr>\

In [37]:
# End the automated browser session opened at the top of the notebook.
browser.quit()