In [1]:
# %pip install splinter   # one-time setup: uncomment on first run (%pip installs into the active kernel's environment)

In [1]:
import pandas as pd
import json
from splinter import Browser
from bs4 import BeautifulSoup as bs

In [2]:
# Launch a visible (non-headless) Chrome window driven through the local chromedriver binary.
# NOTE(review): the relative 'chromedriver.exe' path assumes the driver sits in the
# working directory and uses the Windows file name - confirm for other setups.
executable_path = dict(executable_path='chromedriver.exe')
browser = Browser('chrome', headless=False, **executable_path)

### NASA Mars News

In [3]:
# URL of NASA Mars site, News page to be scraped
mars_url = 'https://mars.nasa.gov/news/'
browser.visit(mars_url)

# The news cell below reads the first <li class="slide"> item, so poll for it
# (up to 10 seconds) before snapshotting the HTML, in case the list is filled in
# after the initial page load. The boolean result is intentionally not gated on:
# a missing item is reported where the data is actually extracted.
browser.is_element_present_by_css('li.slide', wait_time=10)

# Snapshot the rendered page and parse it with the stdlib 'html.parser' backend
mars_html = browser.html
soup = bs(mars_html, 'html.parser')

In [4]:
# Collect the latest News Title and Paragraph Text.
# Both values are looked up inside the same first <li class="slide"> item so the
# teaser always belongs to the headline it is printed with.
latest_slide = soup.find('li', class_='slide')
if latest_slide is None:
    raise ValueError("No <li class='slide'> news item found - the page may not have finished loading")

title_tag = latest_slide.find('div', class_='content_title')
teaser_tag = latest_slide.find('div', class_='article_teaser_body')
if teaser_tag is None:
    # Fall back to the page-wide lookup if the teaser is not nested in the slide
    teaser_tag = soup.find('div', class_='article_teaser_body')
if title_tag is None or teaser_tag is None:
    raise ValueError("Latest news item is missing its 'content_title' or 'article_teaser_body' element")

# Assign the text to variables to be referenced later.
news_title = title_tag.text
news_p = teaser_tag.text

print(f'Title: {news_title}')
print(f'Paragraph: {news_p}')

Title: A Martian Roundtrip: NASA's Perseverance Rover Sample Tubes
Paragraph: Marvels of engineering, the rover's sample tubes must be tough enough to safely bring Red Planet samples on the long journey back to Earth in immaculate condition. 


### JPL Mars Space Images - Featured Image

In [5]:
# URL of NASA Mars site, JPL Featured Space Images page to be scraped
base_url = 'https://www.jpl.nasa.gov'
mar_space_images_url = base_url + '/spaceimages/?search=&category=Mars'
browser.visit(mar_space_images_url)

# Find and Click "FULL IMAGE" button
full_image_data = browser.find_by_id('full_image')
full_image_data.click()

# Wait (up to 2 seconds) for the 'more info' link to show up, then click it.
# The result of the wait is not gated on; the lookup below proceeds either way.
browser.is_element_present_by_text('more info', wait_time=2)
more_info_data = browser.links.find_by_partial_text('more info')
more_info_data.click()

# Wait (up to 5 seconds) for the detail page's <figure class="lede"> before
# snapshotting the HTML and parsing it with BeautifulSoup
browser.is_element_present_by_css('figure.lede', wait_time=5)
html = browser.html
soup = bs(html, 'html.parser')

# Find featured_image_url large size.
# full_image_url is always defined afterwards: it stays None when the expected
# figure.lede > a[href] structure is missing, so later cells can test for that
# explicitly instead of hitting a NameError.
full_image_url = None
lede_figure = soup.find('figure', class_='lede')
lede_link = lede_figure.a if lede_figure is not None else None

if lede_link is not None and lede_link.get('href'):
    featured_image_url = lede_link['href']
    full_image_url = base_url + featured_image_url
    print(f"Large picture url is: {full_image_url}")
else:
    print("Featured image link (figure.lede > a[href]) was not found; full_image_url is None")

Large picture url is: https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA17009_hires.jpg


### Mars Facts

#### To scrape the table containing facts about the planet

In [7]:
# Parse every HTML table on the Space Facts Mars page; the first table is the one used
url = 'http://space-facts.com/mars/'
tables = pd.read_html(url)

# Work on a copy so the parsed table in `tables` is left untouched
facts_df = tables[0].copy()
facts_df.columns = ["Fact", "Fact Value"]

# Strip the trailing colon from the labels; regex=False makes the literal match
# explicit instead of relying on the pandas-version-dependent default
facts_df["Fact"] = facts_df["Fact"].str.replace(":", "", regex=False)
facts_df

Unnamed: 0,Fact,Fact Value
0,Equatorial Diameter,"6,792 km"
1,Polar Diameter,"6,752 km"
2,Mass,6.39 × 10^23 kg (0.11 Earths)
3,Moons,2 (Phobos & Deimos)
4,Orbit Distance,"227,943,824 km (1.38 AU)"
5,Orbit Period,687 days (1.9 years)
6,Surface Temperature,-87 to -5 °C
7,First Record,2nd millennium BC
8,Recorded By,Egyptian astronomers


In [8]:
# Render the facts table as an HTML <table> string: column headers kept, row index dropped
facts_html = facts_df.to_html(header=True, index=False)
print(facts_html)

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>Fact</th>
      <th>Fact Value</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Equatorial Diameter</td>
      <td>6,792 km</td>
    </tr>
    <tr>
      <td>Polar Diameter</td>
      <td>6,752 km</td>
    </tr>
    <tr>
      <td>Mass</td>
      <td>6.39 × 10^23 kg (0.11 Earths)</td>
    </tr>
    <tr>
      <td>Moons</td>
      <td>2 (Phobos &amp; Deimos)</td>
    </tr>
    <tr>
      <td>Orbit Distance</td>
      <td>227,943,824 km (1.38 AU)</td>
    </tr>
    <tr>
      <td>Orbit Period</td>
      <td>687 days (1.9 years)</td>
    </tr>
    <tr>
      <td>Surface Temperature</td>
      <td>-87 to -5 °C</td>
    </tr>
    <tr>
      <td>First Record</td>
      <td>2nd millennium BC</td>
    </tr>
    <tr>
      <td>Recorded By</td>
      <td>Egyptian astronomers</td>
    </tr>
  </tbody>
</table>


### Mars Hemispheres

#### To obtain high-resolution images for each of Mars's hemispheres

In [9]:
# Visit the USGS Astrogeology search results for the enhanced Mars hemisphere images
base_url = 'https://astrogeology.usgs.gov'
url = base_url + '/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

# Parse the rendered page with the explicit 'html.parser' backend, matching the
# other cells. The bare 'html' feature lets BeautifulSoup choose whichever HTML
# parser happens to be installed, so results could differ between environments.
html = browser.html
soup = bs(html, 'html.parser')

In [10]:
# Collect (title, detail-page URL) pairs from the results page before the browser
# navigates away from it
image_urls = [(a.text, a['href']) for a in browser.find_by_css('div[class="description"] a')]

hemisphere_image_urls = []

# Dedicated loop names keep the search `url` defined in the previous cell intact
for hemisphere_title, hemisphere_page_url in image_urls:
    # Open the hemisphere's detail page and read the src of its wide image
    browser.visit(hemisphere_page_url)
    img_url = browser.find_by_css('img[class="wide-image"]')['src']

    # One dictionary per hemisphere: its title and its image URL
    hemisphere_image_urls.append({'title': hemisphere_title, 'img_url': img_url})

# Display titles and image links
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov/cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]

In [11]:
# Quit the browser session created at the top of the notebook so that a
# re-run starts from a fresh browser instead of reusing this one
browser.quit()