In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
import pymongo
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from splinter import Browser

In [2]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [3]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/99.0.4844.51/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\stade\.wdm\drivers\chromedriver\win32\99.0.4844.51]


In [4]:
#Define Database and collections
db = client.mars_db
collection = db.lastest

In [5]:
# URL for latest articles to be scrape
url = "https://redplanetscience.com/"

In [6]:
# Dynamically loaded page so it needs to use driver to pull the full source code.  With just soup
# it only pulls the static page information.

browser.visit(url)
html = browser.html


In [7]:
# Use driver html and parse ; parse with 'html.parser'
soup = BeautifulSoup(html, 'html.parser')


In [8]:
# Find the par tof the page that holds the information we need.
# div class = list_text             Main div 
# div class = content_title         sub div title
# div class = article_teaser_body   sub div paragraph

results = soup.find_all('div', class_='list_text')
print(len(results))

15


In [9]:
# Loop over the data that was collected and pull title and paragraph infromation from it

for result in results:
    try:
        # scrape article title
        news_title = result.find('div', class_='content_title').text
        # Scrape the div sub paragraph
        news_p = result.find('div', class_='article_teaser_body').text
        print( news_title)
        print(news_p)
        print("/n/n")

        data = {
            'title': news_title,
            'news_p': news_p
        }

    except Exception as e:
        print(e)

NASA's Mars 2020 Heads Into the Test Chamber
In this time-lapse video taken at JPL, engineers move the Mars 2020 rover into a large vacuum chamber for testing in Mars-like environmental conditions.
/n/n
The Extraordinary Sample-Gathering System of NASA's Perseverance Mars Rover
Two astronauts collected Moon rocks on Apollo 11. It will take three robotic systems working together to gather up the first Mars rock samples for return to Earth.
/n/n
Media Get a Close-Up of NASA's Mars 2020 Rover
The clean room at NASA's Jet Propulsion Laboratory was open to the media to see NASA's next Mars explorer before it leaves for Florida in preparation for a summertime launch.
/n/n
NASA's Mars 2020 Rover Tests Descent-Stage Separation
A crane lifts the rocket-powered descent stage away from NASA's Mars 2020 rover after technicians tested the pyrotechnic charges that separate the two spacecraft.
/n/n
Independent Review Indicates NASA Prepared for Mars Sample Return Campaign
NASA released an independent

In [10]:
browser.quit()

## Part Two
### JPL Mars Space Images - Featured Image

In [11]:
url_jpl = "https://spaceimages-mars.com"

In [12]:
# Setup splinter
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)
browser.visit(url_jpl)
html2 = browser.html
soup_jpl = BeautifulSoup(html2, 'html.parser')



Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/99.0.4844.51/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\stade\.wdm\drivers\chromedriver\win32\99.0.4844.51]


In [13]:
# browser.links.find_by_partial_text('FULL IMAGE').click()
results_jpl = soup_jpl.findAll('img', class_='headerimage fade-in')
for image in results_jpl:
    part_url = image['src']

featured_image_url = url_jpl + '/' + part_url
print(featured_image_url)


https://spaceimages-mars.com/image/featured/mars1.jpg


In [14]:
browser.quit()

## Part 3
### Mars Facts

In [15]:
# Input the URL for mars fact table
url_facts = 'https://galaxyfacts-mars.com/'

In [16]:
# Pandas needs to read the table and then verify output.

tables = pd.read_html(url_facts)


In [17]:
# Confirm that data was pulled
df = tables[0]
df.head()

Unnamed: 0,0,1,2
0,Mars - Earth Comparison,Mars,Earth
1,Diameter:,"6,779 km","12,742 km"
2,Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
3,Moons:,2,1
4,Distance from Sun:,"227,943,824 km","149,598,262 km"


In [18]:
# Drop unneeded columns
df.drop(2, axis=1)

Unnamed: 0,0,1
0,Mars - Earth Comparison,Mars
1,Diameter:,"6,779 km"
2,Mass:,6.39 × 10^23 kg
3,Moons:,2
4,Distance from Sun:,"227,943,824 km"
5,Length of Year:,687 Earth days
6,Temperature:,-87 to -5 °C


## Part 4
### Mars Hemispheres

In [53]:
# Url to inspect
url_hemi = 'https://marshemispheres.com'
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)




Current google-chrome version is 99.0.4844
Get LATEST chromedriver version for 99.0.4844 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/99.0.4844.51/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\stade\.wdm\drivers\chromedriver\win32\99.0.4844.51]


In [54]:
# Open URL so we can click links
# Need to click on the object 
# <a href="cerberus.html" class="itemLink product-item"><h3>Cerberus Hemisphere Enhanced</h3> </a>
# Then the open button on the next page - neeed a loop
browser.visit(url_hemi)
# html_list = browser.html
# soup_list = BeautifulSoup(html_list, 'html.parser')
# find_list = soup.find_all('a', class_='itemLink product-item')
# for link in find_list:
#     print(link.href)

In [55]:
# For loop for the 4 links we need to click on for the first page
import time
for x in range(4):
    # html = browser.html
    # soup = BeautifulSoup(html, 'html.parser')
    time.sleep(.5)
    browser.links.find_by_partial_text('Enhanced')[x].click()
    html_temp = browser.html
    soup = BeautifulSoup(html_temp, 'html.parser')
    time.sleep(.5)
    find_link = browser.find_link_by_partial_text('Sample')
    print(find_link['href'] + " " + find_link['title'])
    # browser.links.find_by_partial_text('Sample').click()
    # browser.windows[1].close()

    browser.back()



https://marshemispheres.com/images/full.jpg 
https://marshemispheres.com/images/schiaparelli_enhanced-full.jpg 
https://marshemispheres.com/images/syrtis_major_enhanced-full.jpg 
https://marshemispheres.com/images/valles_marineris_enhanced-full.jpg 


In [56]:
browser.quit()