## Web Scraping - Mission to Mars
#### Dependencies and Setup

In [417]:
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time


def init_browser():
    # @NOTE: Replace the path with your actual path to the chromedriver
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    return Browser("chrome", **executable_path, headless=False)

def soup_url (url):
    browser = init_browser()
    # Visit url
    browser.visit(url)
    time.sleep(1)

    # Scrape page into Soup
    html = browser.html
    soup_url = bs(html, "html.parser")
    
     # Close the browser after scraping
    browser.quit()
    
    return soup_url

def soup_class (soup, class_name):
    
    # Soup data list
    soup_class = soup.find(class_= class_name)
    
    return soup_class 

def soup_class_list (soup, class_name):
   
    # Soup data list
    soup_class_list = soup.find_all(class_= class_name)
    
    return soup_class_list

def text_list (soup_class_list):
    text_list = []
    [text_list.append(record.text.replace("\n", "")) for record in soup_class_list]
    return text_list

## NASA Mars Latest News 
#### Browser and Site Connection

In [418]:
# Visit mars.nasa.gov
mars_url = "https://mars.nasa.gov"
url_news = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
soup = soup_url(url_news)
# soup.prettify()

#### Understanding and Inspecting the Data

In [419]:
page_title = soup.title.text
page_title

'News  – NASA’s Mars Exploration Program '

In [420]:
for link in soup.find_all("a")[:4]:
    print("Inner Text: {}".format(link.text))
    print("Title: {}".format(link.get("title")))
    print("href: {}".format(link.get("href")))

Inner Text: NASA
Title: visit nasa.gov
href: http://www.nasa.gov
Inner Text: NASA Science
Title: Explore NASA Science
href: https://science.nasa.gov/
Inner Text: Mars Exploration Program
Title: Mars
href: /mars-exploration/#
Inner Text: Skip Navigation
Title: None
href: #page


#### Collecting the Data

In [421]:
# Collect the latest News Dates
date_news = text_list(soup_class_list(soup, "list_date"))
date_news[:5]

['August 24, 2020',
 'August 21, 2020',
 'August 14, 2020',
 'August 13, 2020',
 'August 12, 2020']

In [422]:
# Collect the latest News Titles
title_news = text_list(soup_class_list(soup, "content_title"))
title_news.pop(0)
title_news[:5]

["NASA Engineers Checking InSight's Weather Sensors",
 "Follow NASA's Perseverance Rover in Real Time on Its Way to Mars",
 'NASA Establishes Board to Initially Review Mars Sample Return Plans',
 "NASA's Ingenuity Mars Helicopter Recharges Its Batteries in Flight",
 "Celebrate Mars Reconnaissance Orbiter's Views From Above"]

In [423]:
# Collect the latest News Paragraphs
par_news = text_list(soup_class_list(soup, "article_teaser_body"))
par_news[:5]

['An electronics issue is suspected to be preventing the sensors from sharing their data about Mars weather with the spacecraft.',
 "A crisply rendered web application can show you where the agency's Mars 2020 mission is right now as it makes its way to the Red Planet for a Feb. 18, 2021, landing.",
 'The board will assist with analysis of current plans and goals for one of the most difficult missions humanity has ever undertaken.',
 'Headed to the Red Planet with the Perseverance rover, the pioneering helicopter is powered up for the first time in interplanetary space as part of a systems check.',
 'Marking its 15th anniversary since launch, one of the oldest spacecraft at the Red Planet has provided glimpses of dust devils, avalanches, and more.']

In [424]:
image_news = soup_class_list(soup, "list_image")
image_name_list = []
image_url_list = []
for image in image_news:
    image_name_list.append(image("img")[0].get('alt'))
    image_url_list.append(mars_url + image("img")[0].get('src'))

In [425]:
len(image_name_list)

40

In [426]:
news = []
for i in range(len(date_news)):
    row = {}
    row['Date'] = date_news[i]
    row["News_Title"] = title_news[i]
    row["News_Paragraph"] = par_news[i]
    row["Image_Name"] = image_name_list[i]
    row["Image_URL"] = image_url_list[i]
    news.append(row)
news[:5]

[{'Date': 'August 24, 2020',
  'News_Title': "NASA Engineers Checking InSight's Weather Sensors",
  'News_Paragraph': 'An electronics issue is suspected to be preventing the sensors from sharing their data about Mars weather with the spacecraft.',
  'Image_Name': 'InSight Collecting Mars Weather Data',
  'Image_URL': 'https://mars.nasa.gov/system/news_items/list_view_images/8744_PIA22957-226.jpg'},
 {'Date': 'August 21, 2020',
  'News_Title': "Follow NASA's Perseverance Rover in Real Time on Its Way to Mars",
  'News_Paragraph': "A crisply rendered web application can show you where the agency's Mars 2020 mission is right now as it makes its way to the Red Planet for a Feb. 18, 2021, landing.",
  'Image_Name': 'Illustration of Mars 2020 spacecraft',
  'Image_URL': 'https://mars.nasa.gov/system/news_items/list_view_images/8742_Mars2020-Earth-226.jpg'},
 {'Date': 'August 14, 2020',
  'News_Title': 'NASA Establishes Board to Initially Review Mars Sample Return Plans',
  'News_Paragraph': 

## JPL Mars Space Images - Featured Image
#### Browser and Site Connection

In [465]:
# Visit jpl.nasa.gov
url_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
url = "https://www.jpl.nasa.gov"
soup = soup_url(url_image)

In [505]:
image = soup.find(class_="carousel_item")
print(image["style"].split("'")[1])

/spaceimages/images/wallpaper/PIA14872-1920x1200.jpg


In [481]:
browser = init_browser()
    # Visit url
browser.visit(url_image)
time.sleep(1)

    # Scrape page into Soup
html = browser.html
soup = bs(html, "html.parser")

#### Collecting the Data

In [133]:
# Collect the Featured Images URL
url_image_news = soup.find_all('img')
images_list = []
notlist = ['more arrow', '']
for image in url_image_news:
        row = {}
        name = image["alt"]
        if name not in notlist:
            row['Image_name'] = name
            row['Image_url'] = url_image + image["src"]
            images_list.append(row)
images_list

[{'Image_name': 'Wind and Sand - False Color',
  'Image_url': 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/spaceimages/images/wallpaper/PIA24077-640x350.jpg'},
 {'Image_name': 'Claritas Fossae - False Color',
  'Image_url': 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/spaceimages/images/wallpaper/PIA24076-640x350.jpg'},
 {'Image_name': 'Virrat Crater - False Color',
  'Image_url': 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/spaceimages/images/wallpaper/PIA24075-640x350.jpg'},
 {'Image_name': 'Claritas Fossae - False Color',
  'Image_url': 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/spaceimages/images/wallpaper/PIA24074-640x350.jpg'},
 {'Image_name': 'Angustus Labyrinthus - False Color',
  'Image_url': 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/spaceimages/images/wallpaper/PIA24073-640x350.jpg'},
 {'Image_name': 'Terra Cimmeria Crater - False Color',
  'Image_url': 'https://www.jpl.nasa.gov/spaceimages/?

## Mars Facts
#### Browser and Site Connection

In [427]:
# Visit space-facts.com/mars
url_facts = "https://space-facts.com/mars/"
soup = soup_url(url_facts)

#### Collecting the Data

In [428]:
# Extracting facts description
facts = text_list(soup_class_list(soup, "column-1"))
facts[:10]

['Equatorial Diameter:',
 'Polar Diameter:',
 'Mass:',
 'Moons:',
 'Orbit Distance:',
 'Orbit Period:',
 'Surface Temperature: ',
 'First Record:',
 'Recorded By:',
 'Mars - Earth Comparison']

In [429]:
# Extracting facts data
facts_data = text_list(soup_class_list(soup, "column-2"))
facts_data[:10]

['6,792 km',
 '6,752 km',
 '6.39 × 10^23 kg (0.11 Earths)',
 '2 (Phobos & Deimos)',
 '227,943,824 km (1.38 AU)',
 '687 days (1.9 years)',
 '-87 to -5 °C',
 '2nd millennium BC',
 'Egyptian astronomers',
 'Mars']

In [430]:
facts_list = []
for i in range(len(facts_data)):
        row = {}
        row['Description'] = facts[i]
        row['Mars_Fact'] = facts_data[i]
        facts_list.append(row)
facts_list[:5]

[{'Description': 'Equatorial Diameter:', 'Mars_Fact': '6,792 km'},
 {'Description': 'Polar Diameter:', 'Mars_Fact': '6,752 km'},
 {'Description': 'Mass:', 'Mars_Fact': '6.39 × 10^23 kg (0.11 Earths)'},
 {'Description': 'Moons:', 'Mars_Fact': '2 (Phobos & Deimos)'},
 {'Description': 'Orbit Distance:', 'Mars_Fact': '227,943,824 km (1.38 AU)'}]

## Mars Hemispheres
#### Browser and Site Connection

In [431]:
# Visit mars.nasa.com
url_astro = "https://astrogeology.usgs.gov"
url_astro_search = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
soup = soup_url(url_astro_search)

In [461]:
# Extract Reference per each product item
items_astro = soup_class_list(soup, 'itemLink product-item')
items_ref_list = []
for item in items:
    item_url = url_astro + item.get('href')
    if item_url not in items_ref_list:
        items_ref_list.append(item_url)
items_ref_list

['https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced',
 'https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced']

In [463]:
items_title_list = [soup.findAll('h3')[i].next for i in range(len(soup.findAll('h3')))]
items_title_list

['Cerberus Hemisphere Enhanced',
 'Schiaparelli Hemisphere Enhanced',
 'Syrtis Major Hemisphere Enhanced',
 'Valles Marineris Hemisphere Enhanced']

In [464]:
# Create list of data dictionaries for each product-item
hemisphere_list = []
for i in range(len(items_title_list)):
    row ={}
    soup = soup_url(items_ref_list[i])
    downloads = soup.find(class_='downloads')
    item_url = downloads.findChildren("a")[0].get("href")
    row['title'] = items_title_list[i]
    row['img_url'] = item_url
    hemisphere_list.append(row)
hemisphere_list

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]