In [1]:
# Dependencies
from bs4 import BeautifulSoup as bs
import pymongo
import requests
import pandas as pd
import re
from splinter import Browser

In [2]:
# Entry-point URLs for the four sites scraped in this notebook.
# Some are fetched as-is; others only serve as a starting point.

# NASA Mars news -- note this is the address of one specific article.
nasa_url = "https://mars.nasa.gov/news/8744/nasa-engineers-checking-insights-weather-sensors/"
# JPL space-images gallery, filtered to the Mars category.
jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
# Space-Facts page holding the Mars facts table.
mars_facts_url = "https://space-facts.com/mars/"
# USGS Astrogeology search results for the enhanced Mars hemisphere images.
astro_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"

## Basic Soup Scrape for Title and Paragraph - NASA Page

In [8]:
# Helper that downloads a page and parses it into a BeautifulSoup tree.
def soup_scrape(url, db_name=None, timeout=30):
    """Fetch ``url`` over HTTP and return the parsed HTML.

    Parameters
    ----------
    url : str
        Address of the page to download.
    db_name : str, optional
        Not used by this function; kept only so existing two-argument
        calls keep working.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``lxml`` parser.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        For connection failures and timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of silently parsing an error page.
    response.raise_for_status()
    return bs(response.text, 'lxml')

In [9]:
# First scrape: download and parse the NASA Mars news page referenced by
# nasa_url so its title and intro text can be pulled out below.

nasa_scrape = soup_scrape(url=nasa_url, db_name='nasa_db')

In [10]:
# Save the page title and the intro paragraph text.

nasa_title = nasa_scrape.title.text

# The intro paragraphs on the article are wrapped in <i> tags, so only the <i>
# elements are collected instead of every <p>.
# find_all() returns a list of tags, so getText() has to be called on each tag
# individually; getText() also strips the HTML markup.
#
# The text of ALL <i> tags is kept, one per line. Previously the accumulator was
# re-initialised inside the loop, so only the last tag's text survived and the
# name was left undefined when the page had no <i> tags at all.
nasa_paragraphs = [tag.getText() for tag in nasa_scrape.find_all('i')]
nasa_paragraph = "\n".join(nasa_paragraphs)
print(nasa_paragraph)


:
An electronics issue is suspected to be preventing the sensors from sharing their data about Mars weather with the spacecraft.
On Sept. 6, 2020, InSight's weather sensors (collectively called the Auxiliary Payload Sensor Suite, or APSS) were reset. They appear to be operating nominally again, gathering data on wind speed and direction, air temperature and pressure, and magnetic fields. Although the issue that required APSS to be reset has not been determined, the team will continue to carefully monitor the situation.


## Splinter Scrape for Images - JPL Mars Images

In [3]:
# Splinter drives a real Chrome window through chromedriver. The chromedriver
# binary lives in the same folder as this notebook, so a bare file name is
# enough for the executable path.
# This cell only opens the browser; the following cells visit the page and scrape it.

executable_path = {"executable_path": 'chromedriver.exe'}
jpl_browser = Browser('chrome', headless=False, **executable_path)

In [4]:
# Point the browser at the JPL Mars image gallery.
# `url` is kept as an alias of jpl_url.
url = jpl_url
jpl_browser.visit(jpl_url)

In [5]:
# This was the hardest scrape. The full-size featured image only appears as a
# CSS `background-image: url(...)` inside the `style` attribute of the carousel
# <article>, so it is not directly available as a tag attribute such as `src`.
# The approach: locate the carousel <div> with BeautifulSoup, turn it into a
# string, and let a later cell pull the wallpaper path out with a regex.

try:
    html = jpl_browser.html
finally:
    # Always close the browser, even if reading the page source fails,
    # so no Chrome/chromedriver process is left running.
    jpl_browser.quit()

soup = bs(html, 'html.parser')

# Search by CSS class explicitly. (A third positional argument to find() is the
# `recursive` flag, not an attribute name, so it must not be passed here.)
carousel_html = soup.find('div', class_='carousel_items')
if carousel_html is None:
    raise ValueError("Could not find the 'carousel_items' <div> in the JPL page")
carousel_string = str(carousel_html)

<div class="carousel_items">
<article alt="Sun Shines in High-Energy X-rays" class="carousel_item" style="background-image: url('/spaceimages/images/wallpaper/PIA18906-1920x1200.jpg');">
<div class="default floating_text_area ms-layer">
<h2 class="category_title">
</h2>
<h2 class="brand_title">
				  FEATURED IMAGE
				</h2>
<h1 class="media_feature_title">
				  Sun Shines in High-Energy X-rays				</h1>
<div class="description">
</div>
<footer>
<a class="button fancybox" data-description="X-rays stream off the sun in this first picture of the sun, overlaid on a picture taken by NASA's Solar Dynamics Observatory, taken by NuSTAR." data-fancybox-group="images" data-fancybox-href="/spaceimages/images/mediumsize/PIA18906_ip.jpg" data-link="/spaceimages/details.php?id=PIA18906" data-title="Sun Shines in High-Energy X-rays" id="full_image">
					FULL IMAGE
				  </a>
</footer>
</div>
<div class="gradient_container_top"></div>
<div class="gradient_container_bottom"></div>
</article>
</div>


In [14]:
# Pull the image path out of the carousel HTML.
# The lookbehind/lookahead keep only the text BETWEEN "spaceimages" and ".jpg",
# e.g. "/images/wallpaper/<id>-1920x1200"; the prefix and extension are added
# back when the full URL is assembled.
# Raw string + escaped dot so that only a literal ".jpg" ends the match.
search_pattern = r"(?<=spaceimages).*?(?=\.jpg)"
url_match = re.search(search_pattern, carousel_string)
if url_match is None:
    # Explicit error instead of an opaque AttributeError on `None.group`.
    raise ValueError("No 'spaceimages/... .jpg' path found in the carousel HTML")
url_img = url_match.group(0)
print(url_img)

/images/wallpaper/PIA18906-1920x1200


In [15]:
# Example of what a relative wallpaper path looks like on the JPL site.
# It is not used when building mars_image below.
relative_image_path = '/spaceimages/images/wallpaper/PIA08097-1920x1200.jpg'

# Re-attach the host, the "/spaceimages" prefix and the ".jpg" extension that
# the regex left out of url_img to get the absolute image URL.
mars_image = "".join(("https://www.jpl.nasa.gov", "/spaceimages", url_img, ".jpg"))
mars_image

'https://www.jpl.nasa.gov/spaceimages/images/wallpaper/PIA18906-1920x1200.jpg'

## Basic Pandas Scrape for Tabular Data - Mars Facts Website

In [77]:
# The Mars facts table is the easiest scrape: pandas reads every <table> on the
# page straight from the HTML, and the first one is the facts table.
# The table is then converted to a list of {"Fact": ..., "Value": ...} records
# so the individual entries can be rendered in the index.html template.
# (An alternative would be DataFrame.to_html() and injecting that markup directly.)

mars_table = pd.read_html(mars_facts_url)      # list of DataFrames, one per <table>
mars_facts = pd.DataFrame(mars_table[0])
mars_facts.columns = ["Fact", "Value"]
# "Fact" is deliberately left as a regular column: orient='records' does not
# include the index, so moving it to the index would drop it from each record.
# 'records' is the full orient name; the abbreviated 'record' is rejected by
# newer pandas versions.
mars_facts_dict = mars_facts.to_dict('records')
mars_facts_dict

[{'Fact': 'Equatorial Diameter:', 'Value': '6,792 km'},
 {'Fact': 'Polar Diameter:', 'Value': '6,752 km'},
 {'Fact': 'Mass:', 'Value': '6.39 × 10^23 kg (0.11 Earths)'},
 {'Fact': 'Moons:', 'Value': '2 (Phobos & Deimos)'},
 {'Fact': 'Orbit Distance:', 'Value': '227,943,824 km (1.38 AU)'},
 {'Fact': 'Orbit Period:', 'Value': '687 days (1.9 years)'},
 {'Fact': 'Surface Temperature:', 'Value': '-87 to -5 °C'},
 {'Fact': 'First Record:', 'Value': '2nd millennium BC'},
 {'Fact': 'Recorded By:', 'Value': 'Egyptian astronomers'}]

## Mars Hemisphere Images

In [11]:
# Final task: record where each hemisphere's full-size image is stored and
# prepare an (initially empty) dictionary that will carry them to the
# index.html template.
def _hemisphere_jpg(slug):
    """Return the astropedia download URL for the given hemisphere slug."""
    return ('https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/'
            + slug + '_enhanced.tif/full.jpg')


cerberus_url = _hemisphere_jpg('cerberus')
schiaparelli_url = _hemisphere_jpg('schiaparelli')
syrtis_major_url = _hemisphere_jpg('syrtis_major')
valles_marineris_url = _hemisphere_jpg('valles_marineris')

# Filled in by the next cell.
mars_hemi_images = dict()

In [12]:
# One {title, img_url} record per hemisphere.
cerberus = {
    'title': 'Cerberus',
    'img_url': cerberus_url,
}
syrtis_mjr = {
    'title': 'Syrtis Major',
    'img_url': syrtis_major_url,
}
vales_marineris = {
    'title': 'Valles Marineris',
    'img_url': valles_marineris_url,
}
schiaparelli = {
    'title': 'Schiaparelli',
    'img_url': schiaparelli_url,
}

# Register every record under its key in a single call; the keys are inserted
# in the order listed here.
mars_hemi_images.update({
    'cerberus': cerberus,
    'syrtis_mjr': syrtis_mjr,
    'vales_marineris': vales_marineris,
    'schiaparelli': schiaparelli,
})
mars_hemi_images

{'cerberus': {'title': 'Cerberus',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 'syrtis_mjr': {'title': 'Syrtis Major',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 'vales_marineris': {'title': 'Valles Marineris',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'},
 'schiaparelli': {'title': 'Schiaparelli',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}}

In [13]:
# Sanity check: plain-text dump of the whole hemisphere dictionary.
print(mars_hemi_images)

{'cerberus': {'title': 'Cerberus', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'}, 'syrtis_mjr': {'title': 'Syrtis Major', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'}, 'vales_marineris': {'title': 'Valles Marineris', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}, 'schiaparelli': {'title': 'Schiaparelli', 'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'}}


In [14]:
# Spot check: look up one hemisphere's image URL by its two keys.
mars_hemi_images['cerberus']['img_url']

'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'

In [133]:
# Assemble the single dictionary that gets handed to the Flask app.
# The hemisphere records are unpacked at the top level (one key per
# hemisphere), followed by the facts table and the NASA title/paragraph.
# NOTE(review): mars_image (the JPL featured-image URL built earlier) is not
# added here -- confirm whether the Flask template needs it.

mars_dict = {
    **mars_hemi_images,
    'mars_facts_dict': mars_facts_dict,
    'nasa_title': nasa_title,
    'nasa_paragraph': nasa_paragraph,
}
mars_dict

{'cerberus': {'title': 'Cerberus',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 'syrtis_mjr': {'title': 'Syrtis Major',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 'vales_marineris': {'title': 'Valles Marineris',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'},
 'schiaparelli': {'title': 'Schiaparelli',
  'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 'mars_facts_dict': [{'Fact': 'Equatorial Diameter:', 'Value': '6,792 km'},
  {'Fact': 'Polar Diameter:', 'Value': '6,752 km'},
  {'Fact': 'Mass:', 'Value': '6.39 × 10^23 kg (0.11 Earths)'},
  {'Fact': 'Moons:', 'Value': '2 (Phobos & Deimos)'},
  {'Fact': 'Orbit Distance:', 'Value': '227,943,824 km (1.38 AU)'},
  {'Fact': 'Orbit Period:', 'Value': '687 days (1.9 years)'},
