# Web Scraping Homework - Mission to Mars
## Terrence Cummings

In [132]:
import pandas as pd
from bs4 import BeautifulSoup
import json
import requests
import pymongo
from splinter import Browser
from selenium import webdriver
import time
import sys
import os

# Step 1 - Scraping

The outputs of 5 scraping exercises are:

1. Dataframe containing the date, title, and summary of the latest news articles about mars from the NASA Mars News site.

2. A URL for the full-sized featured image at the NASA Jet Propulsion Laboratory website.

3. Text of the most recent weather posting from the Mars Weather Twitter featured

4. An HTML table of key facts about Mars

5. A list of dictionaries with the name and a link to a full-size image of each of Mars' four hemispheres.



## NASA Mars New

Retrieve the following data from a search of the latest news articles at the NASA Mars News website URL:

1. Publication Date

2. Title

3. Summary paragraph


In [133]:
#Because the search results at the URL are from Javascript use Selenium to scrape the data

#URL for NASA Mars News website. This show 40 articles from a search of the criteria "Latest" and "All Categories".
#Results of the search are generated by Javascript so not viewable in the webpage HTML
url_mars_news = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

#Initialize lists to store Selenium objects
dates = []
titles = []
summarys = []

#Use Selenium to get the needed fields from the JS results
#XPath for tags were found by right-clicking on the tag in the Chrome Inspector tool the Copy XPath
driver = webdriver.Chrome()
driver.get(url_mars_news)

#Add a delay to give the scraper time to acquire the data
time.sleep(10)
dates = driver.find_elements_by_xpath('//*[@id="page"]/div[3]/div/article/div/section/div/ul/li[*]/div/div/div[1]')
titles = driver.find_elements_by_xpath('//*[@id="page"]/div[3]/div/article/div/section/div/ul/li[*]/div/div/div[2]/a')
summarys = driver.find_elements_by_xpath('//*[@id="page"]/div[3]/div/article/div/section/div/ul/li[*]/div/div/div[3]')

# create empty array to store text data extracted from Selenium objects
date_lst = []
title_lst = []
summary_lst = []

# loop over results and extract text from Selenium objects, add to each list
for date in dates:
    article_date = date.text
    date_lst.append(article_date)
for title in titles:
    article_title = title.text
    title_lst.append(article_title)
for summary in summarys:
    article_summary = summary.text
    summary_lst.append(article_summary)

#Make dataframe of NASA Mars Latest News Articles
nasa_mars_articles_df = pd.DataFrame(list(zip(date_lst, title_lst, summary_lst)), columns =['Date', 'Title', 'Summary'])

driver.quit()

In [134]:
#Confirm results of the scraping
nasa_mars_articles_df

Unnamed: 0,Date,Title,Summary
0,"May 21, 2020",Air Deliveries Bring NASA's Perseverance Mars ...,A NASA Wallops Flight Facility cargo plane tra...
1,"May 19, 2020","NASA Wins 4 Webbys, 4 People's Voice Awards","Winners include the JPL-managed ""Send Your Nam..."
2,"May 18, 2020",NASA's Perseverance Rover Goes Through Trials ...,The agency's new Mars rover is put through a s...
3,"May 7, 2020",NASA's Perseverance Rover Mission Getting in S...,Stacking spacecraft components on top of each ...
4,"May 6, 2020",NASA Perseverance Mars Rover Scientists Train ...,Team members searched for signs of ancient mic...
5,"May 1, 2020",NASA's Perseverance Rover Will Look at Mars Th...,A pair of zoomable cameras will help scientist...
6,"April 30, 2020",Meet the People Behind NASA's Perseverance Rover,These are the scientists and engineers who bui...
7,"April 29, 2020","Q&A with the Student Who Named Ingenuity, NASA...","As a longtime fan of space exploration, Vaneez..."
8,"April 29, 2020",Alabama High School Student Names NASA's Mars ...,Vaneeza Rupani's essay was chosen as the name ...
9,"April 21, 2020",How NASA's Perseverance Mars Team Adjusted to ...,"Like much of the rest of the world, the Mars r..."


## JPL Mars Space Images - Featured Images
Get the URL for the featured image at the website

In [135]:
#Setup Splinter Browsder and target URL
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'

#Go to URL and navigate to page with full size image.
browser.visit(url_jpl)
browser.click_link_by_partial_text('FULL IMAGE')
browser.click_link_by_partial_text('more info')

#Grab the HTM from the webpage with the full size image which contains the link to that image
html = browser.html
browser.quit()


In [136]:
#Use BeautifulSoup to parse the HTML
soup = BeautifulSoup(html, 'html.parser')

#Find the image tag for the main image
main_img = soup.find('img', class_='main_image')

#Extract the source link for the image
main_img_url = main_img['src']

#Build the full URL to the full size featured image
main_img_url_full = 'https://www.jpl.nasa.gov'+main_img_url

#Check result
main_img_url_full

'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA23170_hires.jpg'

## Mars Weather
Get the latest Mars Weather tweet from Twitter

In [137]:
#Use Selenium because Twitter tweets are populated by JS
url_mars_tweet = 'https://twitter.com/marswxreport?lang=en'
driver = webdriver.Chrome()
driver.get(url_mars_tweet)
time.sleep(1)

#Get the first Mars weather tweet using Xpath in Selenium
mars_weather_tweet_obj = driver.find_elements_by_xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div[1]/div/div/div/div/div[2]/section/div/div/div/div[1]/div/div/div/div/article/div/div[2]/div[2]/div[2]/div[1]/div/span')

#Extract the text of the tweet and replace line breaks
mars_weather_tweet = mars_weather_tweet_obj[0].text.replace('\n',', ')

#Close browser
driver.quit()

#Check result
mars_weather_tweet

'InSight sol 529 (2020-05-22) low -93.2ºC (-135.8ºF) high 0.6ºC (33.1ºF), winds from the WNW at 4.6 m/s (10.3 mph) gusting to 15.5 m/s (34.6 mph), pressure at 7.10 hPa'

## Mars Facts
Get table of Mars facts using Pandas

In [138]:
#Send Pandas to read tables from URL
mars_facts_url = 'https://space-facts.com/mars/'
mars_facts = pd.read_html(mars_facts_url)

#Grab the first table of facts, add column headings
mars_facts_df = mars_facts[0]
mars_facts_df.columns = ['Parameter', 'Fact']

#Write as HTML table
mars_facts_df.to_html('mars_facts_table.html', index=False)

## Mars Hemispheres

Create a list of dictionaries containing the URL's and Titles for images of Mars' hemispheres

In [139]:
#Setup Splinter Browsder and target URL
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)
mars_hemis_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'

#Go to URL that summarizes the Mars hemispheres.
browser.visit(mars_hemis_url)

#Grab the HTML
html2 = browser.html
browser.quit()

In [140]:
#Use BeautifulSoup to parse the HTML
soup2 = BeautifulSoup(html2, 'html.parser')

#Find the URL tag for each hemisphere's separate page
hemi_links = soup2.find_all('a', class_='itemLink')

#Build a list of the full URL for each hemisphere's separate page so we can go there to find the link to download the full size image.
full_urls = []
for link in hemi_links:
    full_url = 'https://astrogeology.usgs.gov/'+link['href']
    full_urls.append(full_url)

#Remove duplicates from the URL list
full_urls = list(dict.fromkeys(full_urls))

#Check that we have a working URL for each hempisphere page
full_urls

['https://astrogeology.usgs.gov//search/map/Mars/Viking/cerberus_enhanced',
 'https://astrogeology.usgs.gov//search/map/Mars/Viking/schiaparelli_enhanced',
 'https://astrogeology.usgs.gov//search/map/Mars/Viking/syrtis_major_enhanced',
 'https://astrogeology.usgs.gov//search/map/Mars/Viking/valles_marineris_enhanced']

In [142]:
#Setup Splinter browser
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

#Initialize the list of dictionaries that will hold each hemisphere's title and link to full size image download
mars_hems_dict_lst = []

#For each hemispher URL
for i in full_urls:
    #Go to the individual webpage of that hemisphere
    browser.visit(i)
    #Grab the HTML
    html3 = browser.html
    #Use BeautifulSoup to parse the HTML
    soup3 = BeautifulSoup(html3, 'html.parser')
    #Find the link for the Original tif photo download (not the sample JPG)
    image_link = soup3.find('a', string='Original')
    image_link = image_link['href']
    #Find the title or name of the hemisphere
    image_title = soup3.find('h2', class_='title')
    #Remove unneeded wording at the end of the title
    image_title = image_title.text.replace(' Enhanced', '')
    #Create a dictionary of the title and link for that hemisphere
    temp_dict = {'title': image_title, 'img_url': image_link}
    #Add the dictionary to the list
    mars_hems_dict_lst.append(temp_dict)

browser.quit()
#Check the final result
mars_hems_dict_lst

[{'title': 'Cerberus Hemisphere',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif'},
 {'title': 'Syrtis Major Hemisphere',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif'},
 {'title': 'Valles Marineris Hemisphere',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif'}]