# Mission to Mars: Web Scraping

In [1]:
# Dependencies
from bs4 import BeautifulSoup 
import requests
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [2]:
# Set up Splinter: resolve the chromedriver binary via webdriver_manager and
# open a visible (headless=False) Chrome window that Splinter controls
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)




[WDM] - Current google-chrome version is 102.0.5005
[WDM] - Get LATEST chromedriver version for 102.0.5005 google-chrome
[WDM] - Driver [/Users/jasonnoble/.wdm/drivers/chromedriver/mac64/102.0.5005.61/chromedriver] found in cache


### NASA Mars News

* Scrape the [Mars News Site](https://redplanetscience.com/) and collect the latest News Title and Paragraph Text. Assign the text to variables that you can reference later.

In [3]:
# Mars News landing page to scrape
url = 'https://redplanetscience.com/'

# open the page in the Splinter-controlled Chrome window
# (the browser was launched with headless=False, so the window is visible)
browser.visit(url)

In [4]:
# Wait (up to 5 seconds) for at least one headline element to be present
# before capturing the page. On "Restart & Run All" this cell runs right
# after browser.visit(), and the article list may not be in the DOM yet
# (presumably it is filled in by script after the initial load -- confirm).
browser.is_element_present_by_css('div.content_title', wait_time=5)

# grab the html currently rendered in the browser
html = browser.html

# create the soup
soup = BeautifulSoup(html, 'html.parser')

# scrape the titles and the teaser paragraphs
titles = soup.find_all('div', class_='content_title')
pars = soup.find_all('div', class_='article_teaser_body')

# Fail with an actionable message instead of a bare "list index out of range".
# IndexError is kept so the exception type matches what plain indexing raised.
if not titles or not pars:
    raise IndexError(
        'No news articles found in the captured page: expected at least one '
        '<div class="content_title"> and one <div class="article_teaser_body">.'
    )

# grab latest (first listed) title and paragraph and assign to variables
news_title = titles[0].text
news_p = pars[0].text

### JPL Mars Space Images—Featured Image

* Visit the URL for the Featured Space Image site [here](https://spaceimages-mars.com).

* Use Splinter to navigate the site and find the image URL for the current Featured Mars Image, then assign the URL string to a variable called `featured_image_url`.

* Be sure to find the image URL to the full-sized `.jpg` image.

* Be sure to save a complete URL string for this image.

In [5]:
# Featured Space Image site to scrape (no trailing slash; the next cell
# joins this base with the image's src using '/')
url = 'https://spaceimages-mars.com'

# open the page in the Splinter-controlled Chrome window
# (the browser was launched with headless=False, so the window is visible)
browser.visit(url)

In [6]:
# snapshot the rendered page and parse it
html = browser.html
soup = BeautifulSoup(html, 'html.parser')

# first element carrying class 'headerimage' that also has a src attribute
featured_image = soup.find(class_='headerimage', src=True)

# the src is joined onto the site base url (which has no trailing slash)
# to form the complete image address
featured_image_url = f"{url}/{featured_image.get('src')}"

### Mars Facts

* Visit the [Mars Facts webpage](https://galaxyfacts-mars.com) and use Pandas to scrape the table containing facts about the planet, including diameter, mass, etc.

* Use Pandas to convert the data to an HTML table string.

In [7]:
# Mars Facts page whose HTML tables will be read with pandas
# (no browser visit is needed for this section; pandas fetches the URL itself)
url = 'https://galaxyfacts-mars.com'

In [8]:
# use pandas read_html to fetch the url and parse its HTML tables;
# the result is a list of DataFrames, one per <table> found on the page
tables = pd.read_html(url)

In [9]:
# assign desired table (the first one on the page) to 'df'
# NOTE: this binds df to the same object as tables[0]; no copy is made
df = tables[0]

In [10]:
# format df
# Rebuild from a copy of the raw scraped table on every run instead of
# mutating `df` in place. The in-place version breaks when the cell is
# re-run: after set_index only two columns remain, so assigning three column
# names raises ValueError. It also silently altered tables[0].
facts_raw = tables[0].copy()

# assign columns
facts_raw.columns = ['Description', 'Mars', 'Earth']

# drop the row labelled 0, which is no longer needed (presumably the page's
# own header row -- confirm), then set index to the 'Description' column
df = facts_raw.drop(index=0).set_index('Description')

### Mars Hemispheres

* Visit the [astrogeology site](https://marshemispheres.com/) to obtain high-resolution images for each hemisphere of Mars.

* You will need to click each of the links to the hemispheres in order to find the image URL to the full-resolution image.

* Save the image URL string for the full resolution hemisphere image and the hemisphere title containing the hemisphere name. Use a Python dictionary to store the data using the keys `img_url` and `title`.

* Append the dictionary with the image URL string and the hemisphere title to a list. This list will contain one dictionary for each hemisphere.

In [11]:
# base address of the hemispheres site (trailing slash is relied on below)
url = 'https://marshemispheres.com/'

# page stems, one per hemisphere; each maps to '<url><stem>.html'
hemispheres = ['cerberus', 'schiaparelli', 'syrtis', 'valles']

# nested <div> classes walked, outermost first, to reach the download links
download_path = ('wrapper', 'container', 'wide-image-wrapper', 'downloads')

# collect one {'title', 'img_url'} record per hemisphere page
hemisphere_image_urls = []
for hemisphere in hemispheres:

    # open the hemisphere's detail page and parse what the browser rendered
    browser.visit(f'{url}{hemisphere}.html')
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # hemisphere name as shown in the page heading
    title = soup.find('h2', class_='title').text

    # descend one <div> level at a time, taking the first match at each level
    node = soup
    for css_class in download_path:
        node = node.find('div', class_=css_class)

    # first anchor inside the downloads block; its href is relative to the site root
    link = node.find('a')
    img_url = url + link['href']

    # store the record for this hemisphere
    record = {'title': title, 'img_url': img_url}
    hemisphere_image_urls.append(record)

In [12]:
# close the Splinter-controlled browser (it was launched with headless=False,
# so this closes the visible Chrome window); all scraping is finished by now
browser.quit()

In [14]:
# display the collected records: one dict per hemisphere with keys 'title' and 'img_url'
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg'}]