Mission to Mars Challenge

Set up Dependencies and the URL executable path

In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

#using pandas for the .read_html() function
import pandas as pd

In [2]:
# Fetch (or reuse) a ChromeDriver binary, then launch a visible Chrome session
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 90.0.4430
Get LATEST driver version for 90.0.4430
Driver [C:\Users\phili\.wdm\drivers\chromedriver\win32\90.0.4430.24\chromedriver.exe] found in cache


###  Visit the NASA Mars News Site

Include the following optional delay line for two reasons:

browser.is_element_present_by_css('div.list_text', wait_time=1)

1. The search is for elements with a specific combination of tag (div) and class (list_text). As an example, ul.item_list would be found in HTML as <ul class="item_list">.

    
2. It tells the browser to allow up to one second for the element to appear before giving up. The optional delay is useful because dynamic pages sometimes take a little while to load, especially if they are image-heavy.

In [None]:
# Visit the Mars NASA news site
# class url replaced - url = 'https://redplanetscience.com'
url = 'https://data-class-mars.s3.amazonaws.com/Mars/index.html'
browser.visit(url)
# Optional delay for loading the page: check for a `div.list_text` element
# with wait_time=1; the boolean result is displayed as the cell output
browser.is_element_present_by_css('div.list_text', wait_time=1)

In [None]:
# Parse the page source currently loaded in the browser (the browser stays open)
html = browser.html
news_soup = soup(html, 'html.parser')

# Keep the first <div class="list_text"> as the parent element for later lookups
slide_elem = news_soup.find('div', class_='list_text')

`slide_elem` now holds the parent element that contains all the data to be searched.

Find the `content_title` element inside the `slide_elem` variable.

In [None]:
# Show the first <div class="content_title"> inside the parent element
slide_elem.select_one('div.content_title')

Remove the HTML markup so that only the text remains.

Do this by chaining .get_text() to the .find() method; then only the text of the element is returned.

Note:
There are two methods used to find tags and attributes with BeautifulSoup:

- .find() is used when we want only the first element that matches the tag and attribute we've specified.


- .find_all() is used when we want to retrieve all of the elements that match the tag and attribute.


For example, if we were to use .find_all() instead of .find() when pulling the summary, we would retrieve all of the summaries on the page instead of just the first one.


In [None]:
# Use the parent element to find the first `content_title` div and save its text as `news_title`
news_title = slide_elem.select_one('div.content_title').get_text()
news_title

In [None]:
# Use the parent element to find the first `article_teaser_body` div and save its text as `news_p`
news_p = slide_elem.select_one('div.article_teaser_body').get_text()
news_p

### Featured Images


In [None]:
# Visit the JPL space images page (this rebinds `url`, replacing the news-site URL)
# old class url - url = 'https://spaceimages-mars.com'
url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
browser.visit(url)

In [None]:
# Find and click the full image button
# NOTE(review): index [1] selects the second <button> on the page — presumably
# the "full image" button; confirm against the page markup if the site changes
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

Parse the newly opened page

In [None]:
# Parse the resulting html with soup (this rebinds `html` to the image page source)
html = browser.html
img_soup = soup(html, 'html.parser')
# NOTE(review): displaying the whole parsed document produces a very large cell output
img_soup

Find the relative URL of the image. It is relative because the image changes with each new story. The returned URL is only for the image, not the site.

In [None]:
# Read the `src` of the first <img class="fancybox-image">: the relative image url
img_url_rel = img_soup.select_one('img.fancybox-image').get('src')
img_url_rel

Use an f string to add the base URL to the relative url derived above to get the complete url of the pictures to be scraped.

In [None]:
# Use the base URL to create an absolute URL by prefixing the relative image path
# old f string - img_url = f'https://spaceimages-mars.com/{img_url_rel}'
img_url = f'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{img_url_rel}'
img_url

###  Scrape Mars Data: Just the Facts Ma'am

Pull data from table format 

Use pandas to read the html code and set it into a dataframe.

The index [0] selects only the first table found on the page.

In [None]:
# Read every HTML table on the facts page and keep only the first one ([0])
# old class data - df = pd.read_html('https://galaxyfacts-mars.com')[0]
df = pd.read_html('https://data-class-mars-facts.s3.amazonaws.com/Mars_Facts/index.html')[0]
# Preview the first rows of the table
df.head()


In [None]:
# Assign column headers and turn the `description` column into the index.
# The guard makes the cell safe to re-run: once `description` is the index the
# frame has only two columns left, so re-assigning three headers would raise
# a ValueError (length mismatch).
if df.index.name != 'description':
    df.columns = ['description', 'Mars', 'Earth']
    # set_index returns a new frame; rebinding `df` avoids in-place mutation
    df = df.set_index('description')
df

In [None]:
# Convert the DataFrame to an HTML table string (displayed as the cell output)
df.to_html()

### D1: Scrape High-Resolution Mars’ Hemisphere Images and Titles

#### Hemispheres

In [None]:
# 1. Use browser to visit the URL
# NOTE(review): the following cell starts with this same visit, so this cell is redundant
url = 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/index.html'

browser.visit(url)

In [6]:
# 1. Use browser to visit the URL
# Relative image links found on the detail pages are joined onto this base.
base_url = 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/'
url = base_url + 'index.html'

browser.visit(url)

# 2. Create a list to hold the images and titles.
hemisphere_image_urls = []

# 3. Retrieve the image url and title for each hemisphere.
for i in range(4):
    # Look the thumbnails up again on every pass: the loop navigates away and
    # back, so the i-th thumbnail is selected from the freshly loaded index page.
    browser.find_by_css("a.product-item img")[i].click()

    # Parse the hemisphere detail page that the click opened
    html = browser.html
    img_soup = soup(html, 'html.parser')
    try:
        title_elem = img_soup.find("h2", class_="title").get_text()
        # `string=` is the current name of the text filter (`text=` is deprecated)
        sample_elem = img_soup.find("a", string="Sample").get("href")
        sample_elem = base_url + sample_elem
    except AttributeError:
        # A missing title or "Sample" link makes .find() return None; store None
        # for both fields for better front-end handling
        title_elem = None
        sample_elem = None

    hemisphere_image_urls.append({"title": title_elem, "img_url": sample_elem})

    # Return to the index page before handling the next hemisphere
    browser.back()



In [7]:
# 4. Print the list that holds the dictionary of each image url and title.
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere Enhanced',
  'img_url': 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/images/full.jpg'},
 {'title': 'Schiaparelli Hemisphere Enhanced',
  'img_url': 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/images/schiaparelli_enhanced-full.jpg'},
 {'title': 'Syrtis Major Hemisphere Enhanced',
  'img_url': 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/images/syrtis_major_enhanced-full.jpg'},
 {'title': 'Valles Marineris Hemisphere Enhanced',
  'img_url': 'https://data-class-mars-hemispheres.s3.amazonaws.com/Mars_Hemispheres/images/valles_marineris_enhanced-full.jpg'}]

In [None]:
# 5. Quit the browser to close the automated Chrome session
browser.quit()