# Web Scraping Homework - Mission to Mars
## Terrence Cummings

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import json
import requests
import pymongo
from splinter import Browser
from selenium import webdriver
import time
import sys
import os

# Step 1 - Scraping
## NASA Mars News

Retrieve the following data from a search of the latest news articles at the NASA Mars News website URL:

1. Publication Date

2. Title

3. Summary paragraph



In [None]:
#The search results at this URL are generated by Javascript, so they are not
#viewable in the raw webpage HTML; Selenium drives a real browser to scrape them.

#URL for NASA Mars News website. This shows 40 articles from a search of the
#criteria "Latest" and "All Categories".
url_mars_news = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'

#XPaths for the date, title and summary tags of every article in the results.
#They were found by right-clicking on the tag in the Chrome Inspector tool, then Copy XPath.
xpath_dates = '//*[@id="page"]/div[3]/div/article/div/section/div/ul/li[*]/div/div/div[1]'
xpath_titles = '//*[@id="page"]/div[3]/div/article/div/section/div/ul/li[*]/div/div/div[2]/a'
xpath_summarys = '//*[@id="page"]/div[3]/div/article/div/section/div/ul/li[*]/div/div/div[3]'

driver = webdriver.Chrome()
try:
    driver.get(url_mars_news)

    #Add a delay to give the page's Javascript time to populate the results
    time.sleep(10)

    #find_elements('xpath', ...) is used instead of find_elements_by_xpath because
    #the latter was removed in Selenium 4.3; this form works on Selenium 3 and 4.
    dates = driver.find_elements('xpath', xpath_dates)
    titles = driver.find_elements('xpath', xpath_titles)
    summarys = driver.find_elements('xpath', xpath_summarys)

    #Extract the text while the browser is still open; the Selenium objects
    #cannot be read after driver.quit()
    date_lst = [date.text for date in dates]
    title_lst = [title.text for title in titles]
    summary_lst = [summary.text for summary in summarys]
finally:
    #Always close the browser, even if the scrape fails part-way through
    driver.quit()

#zip() silently truncates to the shortest list, which would hide a misaligned scrape
if not (len(date_lst) == len(title_lst) == len(summary_lst)):
    raise ValueError(
        'Scraped column lengths differ: %d dates, %d titles, %d summaries'
        % (len(date_lst), len(title_lst), len(summary_lst))
    )

#Make dataframe of NASA Mars Latest News Articles
nasa_mars_articles_df = pd.DataFrame(list(zip(date_lst, title_lst, summary_lst)), columns =['Date', 'Title', 'Summary'])

In [None]:
#Confirm results of the scraping: the bare expression on the last line of the
#cell makes the notebook display the NASA Mars news DataFrame
nasa_mars_articles_df

## JPL Mars Space Images - Featured Images
Get the URL for the featured image at the website

In [None]:
#Set up the Splinter browser and target URL
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser = Browser('chrome', **executable_path, headless=False)

try:
    #Go to URL and navigate to the page with the full size image
    browser.visit(url_jpl)
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')

    #Grab the HTML from the webpage with the full size image, which contains the link to that image
    html = browser.html
finally:
    #Always close the browser, even if navigation fails, so no Chrome window is left open
    browser.quit()


In [None]:
#Use BeautifulSoup to parse the HTML
soup = BeautifulSoup(html, 'html.parser')

#Find the image tag for the main image
main_img = soup.find('img', class_='main_image')

#soup.find returns None when no tag matches; fail with a clear message instead
#of an opaque "'NoneType' object is not subscriptable" error
if main_img is None:
    raise ValueError("No <img class='main_image'> tag found in the scraped JPL page HTML")

#Extract the source link for the image
main_img_url = main_img['src']

#Build the full URL to the full size featured image
main_img_url_full = 'https://www.jpl.nasa.gov'+main_img_url

#Check result
main_img_url_full

## Mars Weather
Get the latest Mars Weather tweet from Twitter

In [None]:
#Use Selenium because Twitter tweets are populated by JS
url_mars_tweet = 'https://twitter.com/marswxreport?lang=en'

#XPath of the text span of the first tweet on the page
xpath_first_tweet = '//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div[1]/div/div/div/div/div[2]/section/div/div/div/div[1]/div/div/div/div/article/div/div[2]/div[2]/div[2]/div[1]/div/span'

driver = webdriver.Chrome()
try:
    driver.get(url_mars_tweet)
    time.sleep(1)

    #Get the first Mars weather tweet using XPath in Selenium.
    #find_elements('xpath', ...) is used instead of find_elements_by_xpath because
    #the latter was removed in Selenium 4.3; this form works on Selenium 3 and 4.
    mars_weather_tweet_obj = driver.find_elements('xpath', xpath_first_tweet)

    #An empty result means the page had not rendered yet or its layout changed
    if not mars_weather_tweet_obj:
        raise RuntimeError('No tweet found at the expected XPath on ' + url_mars_tweet)

    #Extract the text of the tweet and replace line breaks
    mars_weather_tweet = mars_weather_tweet_obj[0].text.replace('\n',', ')
finally:
    #Close browser, even when the scrape fails
    driver.quit()

#Check result
mars_weather_tweet

## Mars Facts
Get table of Mars facts using Pandas

In [107]:
#Have Pandas parse every HTML table found at the Mars facts URL
mars_facts_url = 'https://space-facts.com/mars/'
mars_facts = pd.read_html(mars_facts_url)

#Take the first table in the returned list and give it column headings
fact_columns = ['Parameter', 'Fact']
mars_facts_df = mars_facts[0]
mars_facts_df.columns = fact_columns

#Save the table to an HTML file, leaving out the DataFrame index
mars_facts_df.to_html(buf='mars_facts_table.html', index=False)

## Mars Hemispheres

Create a list of dictionaries containing the URLs and titles for images of Mars' hemispheres.

In [None]:
#Dump the prettified HTML of the parsed page to output.txt for inspection.
#Write through a context manager rather than rebinding sys.stdout: rebinding it
#would send the output of every later cell into the file and never close it.
with open('output.txt', 'wt') as output_file:
    print(soup.prettify(), file=output_file)
