## Web Scraping - Mission to Mars
#### Dependencies and Setup

In [None]:
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time


def init_browser():
    # @NOTE: Replace the path with your actual path to the chromedriver
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    return Browser("chrome", **executable_path, headless=False)

## NASA Mars Latest News 
#### Browser and Site Connection

In [185]:
browser = init_browser()

# Visit mars.nasa.com
url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
browser.visit(url)
time.sleep(1)

# Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

#### Understanding and Inspecting the Data

In [186]:
def news_list(class_name):
    soup_list = soup.find_all(class_=class_name)
    news_list = []
    [news_list.append(news.text.replace("\n", "")) for news in soup_list]
    return news_list

In [187]:
page_title = soup.title.text
page_title

'News  – NASA’s Mars Exploration Program '

In [188]:
# soup.prettify()

In [189]:
for link in soup.find_all("a")[:4]:
    print("Inner Text: {}".format(link.text))
    print("Title: {}".format(link.get("title")))
    print("href: {}".format(link.get("href")))

Inner Text: NASA
Title: visit nasa.gov
href: http://www.nasa.gov
Inner Text: NASA Science
Title: Explore NASA Science
href: https://science.nasa.gov/
Inner Text: Mars Exploration Program
Title: Mars
href: /mars-exploration/#
Inner Text: Skip Navigation
Title: None
href: #page


#### Collecting the Data

In [190]:
# Collect the latest News Titles
date_news = news_list("list_date")
len(date_news)

40

In [191]:
# Collect the latest News Titles
title_news = news_list("content_title")
title_news.pop(0)
title_news[:10]

["NASA Engineers Checking InSight's Weather Sensors",
 "Follow NASA's Perseverance Rover in Real Time on Its Way to Mars",
 'NASA Establishes Board to Initially Review Mars Sample Return Plans',
 "NASA's Ingenuity Mars Helicopter Recharges Its Batteries in Flight",
 "Celebrate Mars Reconnaissance Orbiter's Views From Above",
 "NASA's MAVEN Observes Martian Night Sky Pulsing in Ultraviolet Light",
 "8 Martian Postcards to Celebrate Curiosity's Landing Anniversary",
 'NASA, ULA Launch Mars 2020 Perseverance Rover Mission to Red Planet',
 "NASA's Perseverance Rover Will Carry First Spacesuit Materials to Mars",
 "A New Video Captures the Science of NASA's Perseverance Mars Rover"]

In [192]:
# Collect the latest News Titles
par_news = news_list("article_teaser_body")
len(par_news)

40

In [197]:
url_image_news = soup.find_all(class_="list_image")
url_image_news[:10]

[<div class="list_image"><img alt="InSight Collecting Mars Weather Data" src="/system/news_items/list_view_images/8744_PIA22957-226.jpg"/></div>,
 <div class="list_image"><img alt="Illustration of Mars 2020 spacecraft" src="/system/news_items/list_view_images/8742_Mars2020-Earth-226.jpg"/></div>,
 <div class="list_image"><img alt="Mars 2020 With Sample Tubes (Artist's Concept)" src="/system/news_items/list_view_images/8737_24760_PIA23492-320x240.jpg"/></div>,
 <div class="list_image"><img alt="Mars 2020 Perseverance rover and the Ingenuity Mars Helicopter" src="/system/news_items/list_view_images/8736_PIA24043-Rover-and-Helicopter-320x240.jpg"/></div>,
 <div class="list_image"><img alt="Side-by-side movies shows how the 2018 global dust storm enveloped Mars" src="/system/news_items/list_view_images/8735_collage-320.jpg"/></div>,
 <div class="list_image"><img alt="ultraviolet “nightglow” in the Martian atmosphere" src="/system/news_items/list_view_images/8731_MAVEN_nightglow_pulsating_s

In [196]:
# Collect the Featured Images URL
# url_image_news = soup.find_all(class_="list_image")
url_image_news = soup.find_all("img")
images_names = []
notlist = ['More', 'expand arrow', '', 'twitter', 'facebook', 'instagram', 'rss']
for image in url_image_news:
    row = {}
    name = image["alt"]
    if name not in notlist:
        row['Image_name'] = name
        row['Image_url'] = url + image["src"]
        images_names.append(row)
images_names[:10]

[{'Image_name': 'Mars 2020 Perseverance Rover',
  'Image_url': 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest/system/missions/list_view_images/23_PIA23764-RoverNamePlateonMars-320x240.jpg'},
 {'Image_name': 'Curiosity Rover',
  'Image_url': 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest/system/missions/list_view_images/2_PIA14175-thmfeat.jpg'},
 {'Image_name': 'InSight Lander',
  'Image_url': 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest/system/missions/list_view_images/21_PIA22743-320x240.jpg'},
 {'Image_name': 'MAVEN',
  'Image_url': 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest/system/mis

In [101]:
news = []
for i in range(len(date_news)):
    row = {}
    row['Date'] = date_news[i]
    row["News_Title"] = title_news[i]
    row["News_Paragraph"] = par_news[i]
    row["Image_URL"] = url_image_news[i]
    news.append(row)
news[:5]

[{'Date': 'August 24, 2020',
  'News_Title': "NASA Engineers Checking InSight's Weather Sensors",
  'News_Paragraph': 'An electronics issue is suspected to be preventing the sensors from sharing their data about Mars weather with the spacecraft.',
  'Image_URL': ''},
 {'Date': 'August 21, 2020',
  'News_Title': "Follow NASA's Perseverance Rover in Real Time on Its Way to Mars",
  'News_Paragraph': "A crisply rendered web application can show you where the agency's Mars 2020 mission is right now as it makes its way to the Red Planet for a Feb. 18, 2021, landing.",
  'Image_URL': ''},
 {'Date': 'August 14, 2020',
  'News_Title': 'NASA Establishes Board to Initially Review Mars Sample Return Plans',
  'News_Paragraph': 'The board will assist with analysis of current plans and goals for one of the most difficult missions humanity has ever undertaken.',
  'Image_URL': ''},
 {'Date': 'August 13, 2020',
  'News_Title': "NASA's Ingenuity Mars Helicopter Recharges Its Batteries in Flight",
  'N

In [116]:
 # Close the browser after scraping
browser.quit()

## JPL Mars Space Images - Featured Image
#### Browser and Site Connection

In [132]:
browser = init_browser()

# Visit mars.nasa.com
url_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url_image)
time.sleep(1)

# Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

#### Collecting the Data

In [133]:
# Collect the Featured Images URL
url_image_news = soup.find_all('img')
images_list = []
notlist = ['more arrow', '']
for image in url_image_news:
        row = {}
        name = image["alt"]
        if name not in notlist:
            row['Image_name'] = name
            row['Image_url'] = url_image + image["src"]
            images_list.append(row)
images_list

[{'Image_name': 'Wind and Sand - False Color',
  'Image_url': 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/spaceimages/images/wallpaper/PIA24077-640x350.jpg'},
 {'Image_name': 'Claritas Fossae - False Color',
  'Image_url': 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/spaceimages/images/wallpaper/PIA24076-640x350.jpg'},
 {'Image_name': 'Virrat Crater - False Color',
  'Image_url': 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/spaceimages/images/wallpaper/PIA24075-640x350.jpg'},
 {'Image_name': 'Claritas Fossae - False Color',
  'Image_url': 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/spaceimages/images/wallpaper/PIA24074-640x350.jpg'},
 {'Image_name': 'Angustus Labyrinthus - False Color',
  'Image_url': 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/spaceimages/images/wallpaper/PIA24073-640x350.jpg'},
 {'Image_name': 'Terra Cimmeria Crater - False Color',
  'Image_url': 'https://www.jpl.nasa.gov/spaceimages/?

In [198]:
 # Close the browser after scraping
browser.quit()

## Mars Facts
#### Browser and Site Connection

In [199]:
browser = init_browser()

# Visit mars.nasa.com
url_image = "https://space-facts.com/mars/"
browser.visit(url_image)
time.sleep(1)

# Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

#### Collecting the Data

In [205]:
facts = news_list("column-1")
facts[:10]

['Equatorial Diameter:',
 'Polar Diameter:',
 'Mass:',
 'Moons:',
 'Orbit Distance:',
 'Orbit Period:',
 'Surface Temperature: ',
 'First Record:',
 'Recorded By:',
 'Mars - Earth Comparison']

In [206]:
facts_data = news_list("column-2")
facts_data[:10]

['6,792 km',
 '6,752 km',
 '6.39 × 10^23 kg (0.11 Earths)',
 '2 (Phobos & Deimos)',
 '227,943,824 km (1.38 AU)',
 '687 days (1.9 years)',
 '-87 to -5 °C',
 '2nd millennium BC',
 'Egyptian astronomers',
 'Mars']

In [208]:
facts_list = []
for i in range(len(facts_data)):
        row = {}
        row['Description'] = facts[i]
        row['Mars_Fact'] = facts_data[i]
        facts_list.append(row)
facts_list[:5]

[{'Description': 'Equatorial Diameter:', 'Mars_Fact': '6,792 km'},
 {'Description': 'Polar Diameter:', 'Mars_Fact': '6,752 km'},
 {'Description': 'Mass:', 'Mars_Fact': '6.39 × 10^23 kg (0.11 Earths)'},
 {'Description': 'Moons:', 'Mars_Fact': '2 (Phobos & Deimos)'},
 {'Description': 'Orbit Distance:', 'Mars_Fact': '227,943,824 km (1.38 AU)'}]

In [265]:
# Close the browser after scraping
browser.quit()

In [None]:
## Mars Hemispheres
#### Browser and Site Connection

In [210]:
browser = init_browser()

# Visit mars.nasa.com
url_astro = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
browser.visit(url_astro)
time.sleep(1)

# Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

In [259]:
soup.find_all(class_="description")[:5]

[<div class="description"><a class="itemLink product-item" href="/search/map/Mars/Viking/cerberus_enhanced"><h3>Cerberus Hemisphere Enhanced</h3></a><span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br/><p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p></div>,
 <div class="description"><a class="itemLink product-item" href="/search/map/Mars/Viking/schiaparelli_enhanced"><h3>Schiaparelli Hemisphere Enhanced</h3></a><span class="subtitle" style="float:left">image/tiff 35 MB</span><span class="pubDate" style="float:right"></span><br/><p>Mosaic of the Schiaparelli hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. The images were acquired in 1980 during early northern…</p></div>,
 <div class="description"><a 

In [264]:
soup.findAll('h3')[0].next

'Cerberus Hemisphere Enhanced'

In [263]:
hem_names = []
[hem_names.append(name.next) for name in soup.findAll('h3')]

[None, None, None, None]