# Part 1 - Scraping

In [1]:
#Dependencies
from splinter import Browser
from splinter.exceptions import ElementDoesNotExist
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import time
import requests
import pprint 
from IPython.display import Markdown, display
import pymongo
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

### Mac Users

In [2]:
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

In [3]:
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 101.0.4951
Get LATEST chromedriver version for 101.0.4951 google-chrome
Driver [/Users/Sally/.wdm/drivers/chromedriver/mac64/101.0.4951.41/chromedriver] found in cache


### NASA Mars News
Scrape the NASA Mars News Site and collect the latest News Title and Paragraph Text. Assign the text to variables that you can reference later.

In [4]:
def mars_news(): 
    # connect to NASA Mars news Site
    url = 'https://redplanetscience.com/'

    # Retrieve page with the requests module
    browser.visit(url)

    response = requests.get(url)

    html=browser.html
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(html, 'html.parser')

    # Examine the results, determine elements that contains sought info.
    #print(soup.prettify())

    article = soup.find("div", class_ = "list_text")
    news_title = article.find("div", class_="content_title").text
    news_p = article.find("div", class_="article_teaser_body").text

#     print(f'------------------------------------------------')
#     print(f'TITLE: {news_title}')
#     print(f'------------------------------------------------')
#     print(f'PARAGRAPH: {news_p}')
     
    return news_title, news_p

In [5]:
mars_news()

('Two of a Space Kind: Apollo 12 and Mars 2020',
 'Apollo 12 and the upcoming Mars 2020 mission may be separated by half a century, but they share several goals unique in the annals of space exploration.')

### JPL Mars Space Images - Featured Image

In [6]:
def featured_image():
    # Visit the url for JPL Featured Space Image
    # Set URL
    url = 'https://spaceimages-mars.com/'
    browser.visit(url)

    html=browser.html
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(html, 'html.parser')

    # Use splinter to navigate the site and find the image url for the current Featured Mars Image 
    browser.find_by_css("a.showimg").first.click()
    time.sleep(2)

    #parse html page with BeautifulSoup
    html=browser.html
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(html, 'html.parser')

    # Need more info to find image url
    image_url = browser.find_by_css("img.fancybox-image")["src"]
    return image_url
    

In [7]:
featured_image()

'https://spaceimages-mars.com/image/featured/mars3.jpg'

### Mars Facts

In [8]:
def mars_facts():
    # Visit the Mars Facts webpage
    # Set URL
    url = 'https://galaxyfacts-mars.com/'
    browser.visit(url)

    # Use Pandas to scrape the table containing facts about the planet including Diameter, Mass, etc.
    mars_facts_df = pd.read_html("https://space-facts.com/mars/")[0]
    #print(mars_facts_df)

    # Clean up DataFrame, set index
    mars_facts_df.columns=["Planet Profile", "Value"]
    mars_facts_df.set_index("Planet Profile", inplace=True)
    mars_facts_html_table = mars_facts_df.to_html()
    mars_facts_html_table = mars_facts_html_table.replace('\n','')
    return mars_facts_html_table

In [9]:
mars_facts()

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Value</th>    </tr>    <tr>      <th>Planet Profile</th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surface Temperature:</th>      <td>-87 to -5 °C</td>    </tr>    <tr>      <th>First Record:</th>      <td>2nd millennium BC</td>    </tr>    <tr>      <th>Recorded By:</th>      <td>Egyptian astronomers</td>    </tr>  </tbody></table>'

### Mars Hemispheres

In [10]:
def hemisphere_image_urls():
    # Visit the Astrogeology site
    # Set URL
    url = 'https://marshemispheres.com/'
    browser.visit(url)

    # Parse Results HTML with BeautifulSoup
    html = browser.html
    mars_weather_soup = BeautifulSoup(html, "html.parser")


    # Save both the image url string for the full resolution hemisphere image, 
    # and the Hemisphere title containing the hemisphere name
    soup1 = BeautifulSoup(html, "html.parser")
    items = soup1.find_all("div", class_="item")

    hemisphere_img_urls = []

    for item in items:

        title = item.find("h3").text
        link = item.find("a", class_="itemLink")["href"]
        hemispherelink = url + link
        browser.visit(hemispherelink)
        hemispherehtml = browser.html

        soup2 = BeautifulSoup(hemispherehtml, "lxml")
        image = soup2.find("img", class_="wide-image")["src"]
        imageurl = url + image
        hemisphere = {}

        hemisphere_img_urls.append({"title":title,"img_url":imageurl})

        browser.back()


    # Use a Python dictionary to store the data using the keys `img_url` and `title`
    return hemisphere_img_urls

In [11]:
hemisphere_image_urls()

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/f5e372a36edfa389625da6d0cc25d905_cerberus_enhanced.tif_full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/3778f7b43bbbc89d6e3cfabb3613ba93_schiaparelli_enhanced.tif_full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/555e6403a6ddd7ba16ddb0e471cadcf7_syrtis_major_enhanced.tif_full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/b3c7c6c9138f57b4756be9b9c43e3a48_valles_marineris_enhanced.tif_full.jpg'}]

#### Scrape code from above and return one Python dictionary containing all scraped data

In [12]:
# Scrape All
def scrape_all():
    # Initiate headless driver for deployment
#     executable_path = {"executable_path": "chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    news_title, news_p = mars_news()
#     image_url = featured_image()
#     mars_facts_df = mars_facts()
#     hemisphere_img_urls = hemisphere_img_urls()

     # Run all scraping functions and store results in a dictionary
    mars_data = {
        "news_title": news_title,
         "news_p": news_p,
        "featured_image": featured_image(),
        "mars_facts": mars_facts(),
        "hemispheres": hemisphere_image_urls()}
    
#     browser.quit()
    
    return mars_data


In [13]:
mars_data = scrape_all() 

In [14]:
mars_data

{'news_title': "Deadline Closing for Names to Fly on NASA's Next Mars Rover",
 'news_p': 'You have until Sept. 30 to send your names to Mars aboard the Mars 2020 rover. ',
 'featured_image': 'https://spaceimages-mars.com/image/featured/mars1.jpg',
 'mars_facts': '<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Value</th>    </tr>    <tr>      <th>Planet Profile</th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surface Temperature:</th>      <td>-87 to -5 °

In [15]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [16]:
# Define database and collection
db = client.mars
collection = db.mars

In [17]:
# Dictionary to be inserted as a MongoDB document
collection.insert_one(mars_data)

<pymongo.results.InsertOneResult at 0x7f87f28ff880>

In [18]:
# Display items in MongoDB collection
listings = db.mars.find()

for listing in listings:
    print(listing)

{'_id': ObjectId('6288feb95f406582c26ce489'), 'news_title': "Deadline Closing for Names to Fly on NASA's Next Mars Rover", 'news_p': 'You have until Sept. 30 to send your names to Mars aboard the Mars 2020 rover. ', 'featured_image': 'https://spaceimages-mars.com/image/featured/mars1.jpg', 'mars_facts': '<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Value</th>    </tr>    <tr>      <th>Planet Profile</th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Equatorial Diameter:</th>      <td>6,792 km</td>    </tr>    <tr>      <th>Polar Diameter:</th>      <td>6,752 km</td>    </tr>    <tr>      <th>Mass:</th>      <td>6.39 × 10^23 kg (0.11 Earths)</td>    </tr>    <tr>      <th>Moons:</th>      <td>2 (Phobos &amp; Deimos)</td>    </tr>    <tr>      <th>Orbit Distance:</th>      <td>227,943,824 km (1.38 AU)</td>    </tr>    <tr>      <th>Orbit Period:</th>      <td>687 days (1.9 years)</td>    </tr>    <tr>      <th>Surf