In [1]:
# Dependencies
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser
# initialize splinter browser to use right away
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)
import pandas as pd

# NASA Mars News

In [2]:
# URL of page to be scraped
news_url = 'https://mars.nasa.gov/news/'
#url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

# Retrieve page with the requests module
# this didn't yield all the articles
# response = requests.get(url)
# Create BeautifulSoup object; parse with 'html.parser'
# soup = bs(response.text, 'html.parser')

# doing the following did yield all the articles
browser.visit(news_url)
html = browser.html
# Create BeautifulSoup object; parse with 'html.parser'
newsSoup = bs(html, 'html.parser')

In [3]:
# Examine the results, then determine element that contains sought info
#print(webpageSoup.prettify())

In [4]:
# results are returned as an iterable list
# title_results = bs.find_all('div', class_="content_title") #does not return a list of strings, but rather a list of objects that we can call further methods on
# title_results

title_results = bs.find_all(newsSoup, 'h3', class_=None)
title_results

[<h3>Five Things to Know About InSight's Mars Landing</h3>,
 <h3>NASA Launches a New Podcast to Mars</h3>,
 <h3>Update on Opportunity Rover Recovery Efforts</h3>,
 <h3>Third ASPIRE Test Confirms Mars 2020 Parachute a Go</h3>,
 <h3>NASA to Host Briefing on November Mars InSight Landing</h3>,
 <h3>NASA's InSight Will Study Mars While Standing Still</h3>,
 <h3>NASA's First Image of Mars from a CubeSat</h3>,
 <h3>The 'Claw Game' on Mars: NASA InSight Plays to Win</h3>,
 <h3>Scientists to Debate Landing Site for Next Mars Rover</h3>,
 <h3>Mars Virtual Reality Software Wins NASA Award</h3>,
 <h3>Painting Cars for Mars</h3>,
 <h3>Curiosity Rover to Temporarily Switch 'Brains'</h3>,
 <h3>Opportunity Emerges in a Dusty Picture</h3>,
 <h3>NASA Seeking Partner in Contest to Name Next Mars Rover</h3>,
 <h3>NASA's MAVEN Selfie Marks Four Years in Orbit at Mars</h3>,
 <h3>MarCO Makes Space for Small Explorers</h3>,
 <h3>Curiosity Surveys a Mystery Under Dusty Skies</h3>,
 <h3>NASA's InSight Has a Th

In [5]:
first_article_title = title_results[0].text.strip()
first_article_title

"Five Things to Know About InSight's Mars Landing"

In [6]:
# these didn't yield the proper results
# results are returned as an iterable list 
# paragraph_results = bs.findall('div', class="rollover_description_inner") #does not return a list of strings, but rather a list of objects that we can call further methods on 

# this worked better
paragraph_results = bs.find_all(newsSoup, 'div', class_='article_teaser_body')
paragraph_results

[<div class="article_teaser_body">NASA engineers will be holding their breath when their spacecraft heads into Mars' atmosphere on Nov. 26.</div>,
 <div class="article_teaser_body">NASA's new eight-episode series 'On a Mission' follows the InSight spacecraft on its journey to Mars and details the extraordinary challenges of landing on the Red Planet.</div>,
 <div class="article_teaser_body">After a review of the progress of the listening campaign, NASA will continue its current strategy for attempting to make contact with the Opportunity rover for the foreseeable future.</div>,
 <div class="article_teaser_body">The supersonic parachute that will handle the heaviest payload yet to the Red Planet – Mars 2020 rover – passes its final sounding rocket test with flying colors.</div>,
 <div class="article_teaser_body">A briefing on NASA's upcoming InSight Mars landing will air on Wed. Oct. 31 at 1:30 p.m. EDT (10:30 a.m. PDT) on NASA TV, the agency's website and NASA InSight Facebook Page.</d

In [7]:
first_article_description = paragraph_results[0].text.strip()
first_article_description

"NASA engineers will be holding their breath when their spacecraft heads into Mars' atmosphere on Nov. 26."

# JPL Mars Space Images - Featured Image

In [8]:
JPL_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(JPL_url)
html = browser.html
# Create BeautifulSoup object; parse with 'html.parser'
JPLSoup = bs(html, 'html.parser')
#print (webpageSoup)

In [9]:
main_image = JPLSoup.find('div', class_='carousel_items')
main_image

# get image title and its description
featured_image_title = bs.find(JPLSoup, 'h1', class_='media_feature_title').get_text()
featured_image_description = bs.find(JPLSoup, 'a', class_='button fancybox').get('data-description')
print (featured_image_title)
print (featured_image_description)

# make the url for the featured image
featured_image_url = bs.find(JPLSoup, 'a', class_='button fancybox').get('data-fancybox-href')
featured_image_filename = featured_image_url.split('/')[4].split('_')[0]
featured_image_url = f'https://www.jpl.nasa.gov/spaceimages/images/largesize/{featured_image_filename}_hires.jpg'
#print(featured_url)
#print (featured_filename)
print (featured_image_url)


				  Neptune - True Color of Clouds				
This image of the blue-hued Neptune was taken by NASA's Voyager 2; small trails of similar clouds trending east to west and large scale structure east of the Great Dark Spot all suggest that waves are present in the atmosphere and play a large role.
https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA00063_hires.jpg


# Mars Facts

In [10]:
twitter_url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(twitter_url)
html = browser.html
# Create BeautifulSoup object; parse with 'html.parser'
twitterSoup = bs(html, 'html.parser')
#print (webpageSoup)

In [11]:
tweet = 'TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'

# extract text from most recent Martian weather tweet
recent_weather_tweet = bs.find_all(twitterSoup, 'p', class_= tweet)[0].get_text()
#print (recent_weather_tweet) 
# ^^^^^^^ this is all we need, everything below is to make data more organized

# split the string to break it into parts that can be added to dataframe
splitting_recent_weather = recent_weather_tweet.split(',')
#for i in splitting_recent_weather:
 #   splitting_recent_weather = i.split(' ')
#print (splitting_recent_weather)

#use list comprehension rather than for loop to create a list with the fields we need
splitting_recent_weather = [data.split(' ') for data in splitting_recent_weather][:]

# create dictionary to turn into a dataframe to make output neat and organized
weather_dictionary = {'Mars date': f'{recent_weather_tweet.split("(")[0]}',
                'Earth date': f'{recent_weather_tweet.split("(")[1].split(")")[0]}',
                'Current weather': f'{splitting_recent_weather[0][1]}',
                'High temperature': f'{splitting_recent_weather[1][2]}',
                'Low Temperature': f'{splitting_recent_weather[2][2]}',
                'Pressure': f'{splitting_recent_weather[3][3]} {splitting_recent_weather[3][4]}',
                'Daylight': f'{splitting_recent_weather[4][2]}'}
# got a ValueError: If using all scalar values, you must pass an index when doing:
# weather_df = pd.DataFrame.from_dict(weather_dict)
# using orient='index' solved the ValueError
weather_df = pd.DataFrame.from_dict(weather_dictionary, orient='index')
weather_df = weather_df.rename(columns={0:'Current Weather on Mars'})
weather_df
#print (recent_weather_tweet)
#print (splitting_recent_weather)
#print (weather_df)

Unnamed: 0,Current Weather on Mars
Mars date,Sol 2213
Earth date,2018-10-27
Current weather,2213
High temperature,-12C/10F
Low Temperature,-70C/-93F
Pressure,8.74 hPa
Daylight,06:11-18:29


# Mars Facts

In [12]:
mars_facts_url = 'https://space-facts.com/mars/'
browser.visit(mars_facts_url)
html = browser.html
# Create BeautifulSoup object; parse with 'html.parser'
marsFactsSoup = bs(html, 'html.parser')
#print (webpageSoup)

In [13]:
# create a dictionary to hold the facts
facts_dictionary = {}

# get all the rows from facts and parse them into the dictionary
facts = bs.find(marsFactsSoup, 'table', class_='tablepress tablepress-id-mars').find_all('tr')
for fact in facts:
    facts_dictionary[bs.find(fact, 'strong').get_text()] = (bs.find(fact, class_='column-2').get_text())

# convert to Dataframe and to HTML table
facts_df = pd.DataFrame.from_dict(facts_dictionary, orient='index')
facts_df.rename(columns={0: 'Mars Fun Facts'}, inplace=True)
facts_html = pd.DataFrame.to_html(facts_df)

#print (facts_html)
facts_df

Unnamed: 0,Mars Fun Facts
Equatorial Diameter:,"6,792 km\n"
Polar Diameter:,"6,752 km\n"
Mass:,6.42 x 10^23 kg (10.7% Earth)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.52 AU)"
Orbit Period:,687 days (1.9 years)\n
Surface Temperature:,-153 to 20 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


# Mars Hemispheres from Astrogeology site

In [14]:
hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(hemispheres_url)
html = browser.html
# Create BeautifulSoup object; parse with 'html.parser'
hemispheresSoup = bs(html, 'html.parser')
#print (webpageSoup)



# first find all the links to the photo pages
list_of_page_links = []
page_links = bs.find_all(hemispheresSoup, 'a', class_='itemLink product-item')
# used list comprehension again as opposed to a for loop for conciseness and readability
[list_of_page_links.append(page.get('href')) for page in page_links]
list_of_page_links = list(set(list_of_page_links))

images_list = []

# iterate through the list of page links and save the URL for full size images
for link in list_of_page_links:
    url = f'https://astrogeology.usgs.gov{link}' #using string literal allows us to us the same template for all items in the list
    browser.visit(url)
    html = browser.html
    # Create BeautifulSoup object; parse with 'html.parser'
    webpage = bs(html, 'html.parser')
    
    # get the title of the image
    title = bs.find(webpage, 'h2', class_='title').get_text()
    #print (title)
    
    # get the link for the full size image
    downloads_section = bs.find(webpage, 'div', class_='downloads')
    image_link = bs.find(downloads_section, 'a').get('href')
    #print (image_link)
    
    # add the image title and url for full size image to dictionary; not sure why, but 'title' and 'image_url' print in reverse order
    images_list.append({'title': title, 'image_url': image_link})
    #print (images_list)

#print ("**" * 10)

images_list

# attempted to put list items in a dataframe, but link didn't work for some reason out of the dataframe, not sure why :(
# images_df = pd.DataFrame(images_list, columns=['title', 'image_url'])
# images_df

[{'image_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
  'title': 'Schiaparelli Hemisphere Enhanced'},
 {'image_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
  'title': 'Syrtis Major Hemisphere Enhanced'},
 {'image_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
  'title': 'Cerberus Hemisphere Enhanced'},
 {'image_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg',
  'title': 'Valles Marineris Hemisphere Enhanced'}]