In [1]:
# Dependencies - - - - - - - - - - - - - Randy Dettmer 2020/04/20
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time
import pandas as pd

In [2]:
# Set browser to Chrome
executable_path = {"executable_path": "chromedriver.exe"}
browser = Browser("chrome", **executable_path, headless=False)

# NASA Mars News

In [3]:
# Vist NASA mars web site
url = "https://mars.nasa.gov/news/"
browser.visit(url)

In [4]:
# I had issues with responsiveness of this site so I added a delay for all of the html to load
time.sleep(2)

In [5]:
# Scrape page into soup - I used lxml because it seemed I received more consistant responses over html.parser
html = browser.html
soup = bs(html, "lxml")

In [6]:
# Print soup to identify tags needed - turned off for deployment
#print(soup.prettify())
#tags needed <div class='content_title'> and <div class='article_teaser-body'>

In [7]:
# Get latest news title - get actual title - - 
# Index [0] returns "Mars Now" which is the overall title for the page not the news title
news_title = soup.find_all('div', class_='content_title')[1].text

In [8]:
# Print news title
print(f'News Title: {news_title}')

News Title: NASA's New Mars Rover Will Use X-Rays to Hunt Fossils


In [9]:
# Get paragraph text
news_p = soup.find('div', class_='article_teaser_body').get_text()

In [10]:
# Print news paragraph
print(f'News Paragraph: {news_p}')

News Paragraph: PIXL, an instrument on the end of the Perseverance rover's arm, will search for chemical fingerprints left by ancient microbes.


# JPL Mars Space Images - Featured Image

In [11]:
# Visit JPL site for current Mars space image
image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(image_url)
html = browser.html
soup = bs(html, "lxml")

In [12]:
# Print soup to identify tag needed - turned off for deployment
#print(soup.prettify())

In [13]:
# Click on button to get full size image
full_image_button = browser.find_by_id("full_image")
full_image_button.click()

In [14]:
# Find full image button anc click it
browser.is_element_not_present_by_text("more info", wait_time=1)
more_info_button = browser.find_link_by_partial_text("more info")
more_info_button.click()

In [15]:
# Scrape the first Mars image
image = soup.find("img", class_="thumb")["src"]
featured_image_url = "https://www.jpl.nasa.gov" + image

In [16]:
# Show image url
print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA24096-640x350.jpg


# Mars Weather

In [17]:
# Visit the Mars Weather Twitter Account - tested and failed - issues with reactive Java Script
#weather_url = "https://twitter.com/marswxreport?lang=en"
#browser.visit(weather_url)
#html = browser.html
#soup = bs(html, "html.parser")

In [18]:
# Visit the Mars Weather from NASA - alternative site used to collect the weather information for Mars
weather_url = "https://mars.nasa.gov/insight/weather/"
browser.visit(weather_url)
html = browser.html
soup = bs(html, "lxml")

In [19]:
# Mars Weather
#print soup to idenitfy tags needed - turned off for deployment
#print(soup.prettify())

In [20]:
#soup.tbody.find_all("span") - shows everything for temperature and wind speed but no wind direction or pressure

In [21]:
results = soup.find_all("span")

In [22]:
# Maximum Temperature - ???
# Index through the temperature information
soup.tbody.find_all("span", class_='fahrenheit')[0].text

'5.1° F'

In [23]:
# Average Temperature
soup.tbody.find_all("span", class_='fahrenheit')[1].text

'-69.3° F'

In [24]:
# Maximum Temperature
soup.tbody.find_all("span", class_='fahrenheit')[2].text

'-141° F'

In [25]:
# Maximum Wind Speed
# Index through the wind speed information
soup.tbody.find_all("span", class_='mph')[0].text

'41.4'

In [26]:
# Average Wind Speed
soup.tbody.find_all("span", class_='mph')[1].text

'15.7'

In [27]:
# Minimum Wind Speed
soup.tbody.find_all("span", class_='mph')[2].text

'1.8'

In [28]:
# Wind Direction - change tag and class
soup.tbody.find("td", class_='windspeed point').text

'WNW'

In [29]:
# Pressure Maximum - change tag and class
# Index [0] returns "Max" which is the overall title for the data not the data itself
soup.tbody.find_all("td", class_='pressure max')[1].text

'785'

In [30]:
# Pressure Average
soup.tbody.find_all("td", class_='pressure avg')[1].text

'764.7'

In [31]:
# Pressure Minimum
soup.tbody.find_all("td", class_='pressure min')[1].text

'738.2'

# Mars Facts

In [32]:
# Visit the Mars facts webpage
facts_url = "https://space-facts.com/mars/"
browser.visit(facts_url)

In [33]:
# Create pandas df with Mars facts - - 
# Index [2] returns the cleanest data
df1 = pd.read_html("https://space-facts.com/mars/")[2]
df1.columns=["Fact", "Value"]
df1=df1.set_index("Fact")
df1

Unnamed: 0_level_0,Value
Fact,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


# Mars Hemispheres

In [34]:
# Visit the Mars hemispheres scrape images from the astrogeology.usgs site
hemispheres_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(hemispheres_url)
html = browser.html
soup = bs(html, "lxml")

In [35]:
# Mars hemispheres
#print soup to idenitfy tags needed - turend off for deployment
#print(soup.prettify())

In [36]:
# Create a dictionary to store data using the keys img_url and title
hemisphere_image_urls = []

In [37]:
# Results are returned as an iterable list
results = soup.find_all("div",class_='item')

In [38]:
# Loop through returned results
for result in results:
    hemisphere = {}
        
    # identifiy title
    title = result.find('h3').text
        
    # identify image link
    key = result.find("a")["href"]
        
    # join link together
    link = "https://astrogeology.usgs.gov/" + key
        
    # read link
    browser.visit(link)
    html = browser.html
    soup = bs(html, "lxml")
        
    # collect information for dictionary and append
    downloads = soup.find("div", class_="downloads")
    image_url = downloads.find("a")["href"]
    hemisphere['title']= title
    hemisphere['image_url']= image_url
    hemisphere_image_urls.append(hemisphere)

In [39]:
# Print a list of the dictionary with the titles and img_urls
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'image_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'image_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'image_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'image_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]