In [2]:
# Dependencies
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser
import pandas as pd

# Step 1 - Scraping
### NASA Mars News
##### Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text. Assign the text to variables that you can reference later.

In [3]:
# Browser setup - keep window open
executable_path = {"executable_path": "chromedriver.exe"}
browser = Browser("chrome", **executable_path, headless=False)

In [4]:
# Define and retrieve the page
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
html = browser.html
soup = bs(html, "html.parser")

In [5]:
#Scrape to get title and text
news_title = soup.find("li", class_="slide").find("div", class_="content_title").text
print(news_title)
news_p = soup.find("li", class_="slide").find("div", class_="article_teaser_body").text
print(news_p)

NASA Readies Perseverance Mars Rover's Earthly Twin 
Did you know NASA's next Mars rover has a nearly identical sibling on Earth for testing? Even better, it's about to roll for the first time through a replica Martian landscape.


### JPL Mars Space Images - Featured Image
##### Use plinter to navigate the JPL Featured Space Image site and find the image url for the current Featured Mars Image. Assign the complete url string for the full size image to a variable.

In [46]:
# Define and retrieve the page
# Rerun Browser setup cell (2) again before this one to initialize chromedriver
image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(image_url)

In [47]:
# Navigate site and save full size url string of image to variable

# Click "Full Image" button on page
browser.click_link_by_partial_text("FULL IMAGE")
# Click "More Info" button to get to full size image
browser.click_link_by_partial_text("more info")

html = browser.html
soup = bs(html, "html.parser")

# Scrape for full image 
base_url = "https://www.jpl.nasa.gov"
img_url = soup.find('figure', class_='lede').find('a').find('img')['src']

# Add image url to base url
featured_image_url = base_url + img_url
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA18358_hires.jpg'

### Mars Facts
##### Visit the Mars Facts webpage and use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc. Convert this data to a HTML table string.

In [64]:
# Have pandas read any tables on mars facts page
facts_url = 'https://space-facts.com/mars/'

fact_table = pd.read_html(facts_url)
fact_table

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers,
   Mars - Earth Comparison             Mars            Earth
 0               Diameter:         6,779 km        12,742 km
 1                   Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 2                  Moons:                2                1
 3      Distance from Sun:   227,943,824 km   149,598,262 km
 4         Length of Year:   687 Earth days      365.24 days
 5            Temperature:     -87 to -5 °C      -88 to 58°C,
           

In [65]:
# Filter to table I want to work with
fact_df = fact_table[0]

# Rename columns
fact_df.columns = ["Description", "Mars"]

# Remove Index/set new
facts = fact_df.set_index("Description")
facts

Unnamed: 0_level_0,Mars
Description,Unnamed: 1_level_1
Equatorial Diameter:,"6,792 km"
Polar Diameter:,"6,752 km"
Mass:,6.39 × 10^23 kg (0.11 Earths)
Moons:,2 (Phobos & Deimos)
Orbit Distance:,"227,943,824 km (1.38 AU)"
Orbit Period:,687 days (1.9 years)
Surface Temperature:,-87 to -5 °C
First Record:,2nd millennium BC
Recorded By:,Egyptian astronomers


In [75]:
# Convert to html string & clean
html_facts = facts.to_html()
html_facts = html_facts.replace('\n', '')
html_facts

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Mars</th>    </tr>    <tr>      <th>Description</th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surface Temperature:</th>      <td>-87 to -5 °C</td>    </tr>    <tr>      <th>First Record:</th>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>Recorded By:</th>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

### Mars Hemispheres
##### Visit the USGS Astrogeology site to obtain high resolution images for each of Mar's hemispheres. Click each of the links to the hemispheres in order to find the image url to the full resolution image. Save both the image url string for the full resolution hemisphere image, and the Hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys img_url and title. Append the dictionary with the image url string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

In [83]:
# Define and retrieve the page
# Rerun Browser setup cell (2) again before this one to initialize chromedriver
hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

browser.visit(hemi_url)
html = browser.html
soup = bs(html, "html.parser")

In [90]:
# Blank list to contain the dictionaries
hemisphere_image_urls = []

# Base image url
baseimg_url="https://astrogeology.usgs.gov/"

# Soup object
hemis = soup.find_all('div', class_='item')

# Loop to get each title & url
for hemi in hemis:
    title = hemi.find('h3').text
    
    browser.click_link_by_partial_text("Hemisphere Enhanced")
    img_html = browser.html
    img_soup = bs(img_html, "html.parser")
    imgs_url = img_soup.find("img", class_="wide-image")["src"]
    
    image_url = baseimg_url+imgs_url
    hemisphere_image_urls.append({"title": title, "img_url": image_url})

# Display final dict
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov//cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov//cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov//cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astrogeology.usgs.gov//cache/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]

# Step 2 - MongoDB and Flask Application
##### Use MongoDB with Flask templating to create a new HTML page that displays all of the information that was scraped from the URLs above. Convert your Jupyter notebook into a Python script called scrape_mars.py with a function called scrape that will execute all of your scraping code from above and return one Python dictionary containing all of the scraped data. - see separate file