# This notebook scrapes Mars news, a featured image, a facts table, and hemisphere images from four websites

In [17]:
# All the imports
from bs4 import BeautifulSoup as bs
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [18]:
# Start a visible (non-headless) Chrome session driven by splinter, using the
# chromedriver binary that webdriver_manager installs
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Downloading: 100%|████████████████████████████████████████████████████████| 6.68M/6.68M [00:00<00:00, 95.6MB/s]


This grabs the first headline and teaser from the redplanetscience.com website.

In [5]:
# Opens the Mars news site in the splinter-controlled browser
url = 'https://redplanetscience.com/'
browser.visit(url)
# The article list may not be in the DOM the instant visit() returns, so wait
# (up to 5 seconds) for the list_text div before any later cell reads
# browser.html. Evaluates to True once the element is present.
browser.is_element_present_by_css('div.list_text', wait_time=5)

True

In [6]:
# Parse the HTML currently loaded in the browser into a BeautifulSoup tree
html = browser.html
soup = bs(html, features='html.parser')

In [7]:
# Pretty-print the parsed HTML to locate the elements that hold the title and
# paragraph text
print(soup.prettify())

<html>
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link crossorigin="anonymous" href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.0-beta1/dist/css/bootstrap.min.css" integrity="sha384-giJF6kkoqNQ00vy+HMDP7azOuL0xtbfIcaT9wjKHr8RbDVddVHyTfAAsrekwKmP1" rel="stylesheet"/>
  <link href="css/font.css" rel="stylesheet" type="text/css"/>
  <link href="css/app.css" rel="stylesheet" type="text/css"/>
  <link crossorigin="anonymous" href="https://pro.fontawesome.com/releases/v5.10.0/css/all.css" integrity="sha384-AYmEC3Yw5cVb3ZcuHtOA93w35dYTsvhLPVnYs9eStHfGJvOvKxVfELGroGkvsg+p" rel="stylesheet"/>
  <title>
   News - Mars Exploration Program
  </title>
 </head>
 <body>
  <div class="col-md-12">
   <div class="row">
    <nav class="navbar navbar-expand-lg navbar-light fixed-top">
     <div class="container-fluid">
      <a class="navbar-brand" href="#">
       <img src="image/nasa.png" width="80"/>
       <span class="logo">
        MA

In [8]:
# The headline and teaser both sit inside the first <div class="list_text">
slide_element = soup.find('div', class_='list_text')

In [10]:
# Show the <div> that wraps the first headline
slide_element.select_one('div.content_title')

<div class="content_title">Mars 2020 Unwrapped and Ready for More Testing</div>

In [11]:
# Keep only the text of the headline <div> as the first article's title
news_title = slide_element.select_one('div.content_title').text
news_title

'Mars 2020 Unwrapped and Ready for More Testing'

In [12]:
# Keep only the text of the teaser <div> as the first article's summary
news_p = slide_element.select_one('div.article_teaser_body').text
news_p

"In time-lapse video, bunny-suited engineers remove the inner layer of protective foil on NASA's Mars 2020 rover after it was relocated for testing."

This grabs the featured image from the https://spaceimages-mars.com/ website.

In [19]:
# Opens the space-images site in the browser. `url` is kept as the base
# address because a later cell prepends it to the image's relative src.
url = 'https://spaceimages-mars.com/'
browser.visit(url)

In [20]:
# Clicks the "full image" button so the full-size image is loaded into the page.
# Index 1 selects the second <button> on the page — presumably the first is the
# navbar toggle; confirm against the page if the layout changes.
full_image_link = browser.find_by_tag('button')[1]
full_image_link.click()

In [21]:
# Parse the page as it stands after the click into a BeautifulSoup tree
image_html = browser.html
image_soup = bs(image_html, features='html.parser')

In [23]:
# Pretty-print the page HTML to find the tag that holds the full-size image
print(image_soup.prettify())

<html class="fancybox-margin fancybox-lock">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0/css/bootstrap.min.css" rel="stylesheet"/>
  <!-- <link rel="stylesheet" type="text/css" href="css/font.css"> -->
  <link href="css/app.css" rel="stylesheet" type="text/css"/>
  <link href="https://stackpath.bootstrapcdn.com/font-awesome/4.7.0/css/font-awesome.min.css" rel="stylesheet" type="text/css"/>
  <title>
   Space Image
  </title>
  <style type="text/css">
   .fancybox-margin{margin-right:16px;}
  </style>
 </head>
 <body>
  <div class="header">
   <nav class="navbar navbar-expand-lg">
    <a class="navbar-brand" href="#">
     <img id="logo" src="image/nasa.png"/>
     <span class="logo">
      Jet Propulsion Laboratory
     </span>
     <span class="logo1">
      California Institute of Technology
     </span>
    </a>
    <button aria-controls="navbarNav" aria-expand

In [24]:
# Find the first <img> carrying the fancybox-image class in the parsed page
featured_image_tag = image_soup.find('img', class_='fancybox-image')
# Prepend the base URL to the tag's relative src to get the full image URL.
# Relies on `url` still holding the address assigned when this site was visited.
featured_image_url = f"{url}{featured_image_tag['src']}"
featured_image_url

'https://spaceimages-mars.com/image/featured/mars2.jpg'

# Mars Facts: This grabs the table from this URL: https://galaxyfacts-mars.com/

In [26]:
# Reads every <table> on the Mars facts page. pd.read_html returns a list of
# DataFrames (one per table), so MarsDF is a list at this point.
MarsDF = pd.read_html('https://galaxyfacts-mars.com/')
MarsDF

[                         0                1                2
 0  Mars - Earth Comparison             Mars            Earth
 1                Diameter:         6,779 km        12,742 km
 2                    Mass:  6.39 × 10^23 kg  5.97 × 10^24 kg
 3                   Moons:                2                1
 4       Distance from Sun:   227,943,824 km   149,598,262 km
 5          Length of Year:   687 Earth days      365.24 days
 6             Temperature:     -87 to -5 °C      -88 to 58°C,
                       0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
 3                Moons:          2 ( Phobos & Deimos )
 4       Orbit Distance:       227,943,824 km (1.38 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                   -87 to -5 °C
 7         First Record:              2nd millennium BC

In [28]:
# There are two tables on the page and we want the first one, index 0.
# Guarded so that re-running this cell is a no-op: without the check, a second
# run would index into the DataFrame itself (picking a column or raising
# KeyError) instead of into the list of tables.
if isinstance(MarsDF, list):
    MarsDF = MarsDF[0]
MarsDF.head()

Unnamed: 0,0,1,2
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"


In [33]:
# Clean up the table to match the screenshot from the assignment: name the
# three columns and use the Description column as the index.
# Guarded so the cell can be re-run: once Description is the index only two
# columns remain, and assigning three names again would raise a ValueError.
if MarsDF.index.name != 'Description':
    MarsDF.columns = ['Description', 'Mars', 'Earth']
    # Non-inplace set_index returns a new frame that replaces the old binding
    MarsDF = MarsDF.set_index('Description')
MarsDF.head()

Unnamed: 0_level_0,Mars,Earth
Description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"


In [34]:
# Render the cleaned facts table as an HTML <table> string
MarsDF.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Mars</th>\n      <th>Earth</th>\n    </tr>\n    <tr>\n      <th>Description</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Mars - Earth Comparison</th>\n      <td>Mars</td>\n      <td>Earth</td>\n    </tr>\n    <tr>\n      <th>Diameter:</th>\n      <td>6,779 km</td>\n      <td>12,742 km</td>\n    </tr>\n    <tr>\n      <th>Mass:</th>\n      <td>6.39 × 10^23 kg</td>\n      <td>5.97 × 10^24 kg</td>\n    </tr>\n    <tr>\n      <th>Moons:</th>\n      <td>2</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>Distance from Sun:</th>\n      <td>227,943,824 km</td>\n      <td>149,598,262 km</td>\n    </tr>\n    <tr>\n      <th>Length of Year:</th>\n      <td>687 Earth days</td>\n      <td>365.24 days</td>\n    </tr>\n    <tr>\n      <th>Temperature:</th>\n      <td>-87 to -5 °C</td>\n      <td>-88 to 58°C</td>\n    </tr>\n  </tbody>

# Pull the hemisphere pictures from https://marshemispheres.com/

In [35]:
# Opens the Mars hemispheres site in the browser
url = 'https://marshemispheres.com/'
browser.visit(url)

In [36]:
# Parse the hemispheres landing page into a BeautifulSoup tree
hemi_html = browser.html
hemi_soup = bs(hemi_html, features='html.parser')

In [37]:
# Collect every <div class="item"> on the page; each one wraps a hemisphere's
# thumbnail, title, and link to its detail page
hemi_soup2 = hemi_soup.select('div.item')
hemi_soup2

[<div class="item">
 <a class="itemLink product-item" href="cerberus.html"><img alt="Cerberus Hemisphere Enhanced thumbnail" class="thumb" src="images/39d3266553462198bd2fbc4d18fbed17_cerberus_enhanced.tif_thumb.png"/></a>
 <div class="description">
 <a class="itemLink product-item" href="cerberus.html">
 <h3>Cerberus Hemisphere Enhanced</h3>
 </a>
 <span class="subtitle" style="float:left">image/tiff 21 MB</span><span class="pubDate" style="float:right"></span><br/>
 <p>Mosaic of the Cerberus hemisphere of Mars projected into point perspective, a view similar to that which one would see from a spacecraft. This mosaic is composed of 104 Viking Orbiter images acquired…</p>
 </div>
 <!-- end description -->
 </div>,
 <div class="item">
 <a class="itemLink product-item" href="schiaparelli.html"><img alt="Schiaparelli Hemisphere Enhanced thumbnail" class="thumb" src="images/08eac6e22c07fb1fe72223a79252de20_schiaparelli_enhanced.tif_thumb.png"/></a>
 <div class="description">
 <a class="ite

In [42]:
# Holds one {'title', 'img_url'} record per hemisphere
hemisphere_dict = []

# Walk the thumbnail items, open each hemisphere's detail page, and record the
# link to its full-size image
for item in hemi_soup2:

    # The hemisphere name is the <h3> text shown on the main page
    page_title = item.find('h3').text
    # Relative address of this hemisphere's detail page
    detail_page = item.find("a", class_="itemLink product-item")['href']
    browser.visit(f"https://marshemispheres.com/{detail_page}")
    # On the detail page, the element whose text is "Sample" carries the image
    # address in its href
    image_url = browser.find_by_text('Sample')['href']
    # Store the title/URL pair for this hemisphere
    record = {'title': page_title, 'img_url': image_url}
    hemisphere_dict.append(record)

print(hemisphere_dict)

[{'title': 'Cerberus Hemisphere Enhanced', 'img_url': 'https://marshemispheres.com/images/full.jpg'}, {'title': 'Schiaparelli Hemisphere Enhanced', 'img_url': 'https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg'}, {'title': 'Syrtis Major Hemisphere Enhanced', 'img_url': 'https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg'}, {'title': 'Valles Marineris Hemisphere Enhanced', 'img_url': 'https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg'}]


In [43]:
# Close the browser session now that scraping is finished
browser.quit()