In [78]:
# Libraries
from bs4 import BeautifulSoup as bs
import pandas as pd
import pymongo
import re
import requests
from splinter import Browser

In [2]:
# Set up Splinter browser
cd_fp = "C:/Users/norma/Anaconda3/chromedriver.exe"
browser = Browser('chrome', cd_fp, headless=True)

In [3]:
# Establish a Mongo database connection
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
db = client.MarsDB

## NASA Mars News

In [12]:
# Visit the site
url = 'https://mars.nasa.gov/news/'
browser.visit(url)

# Make Soup
soup = bs(browser.html, 'html.parser')

# Write out a pretty version of the body, for visual inspection.
body = soup.find('body')
with open('news2.html','w') as f:
    f.write(body.prettify())

In [13]:
# Find desired content
first_article = soup.find('li', class_ = 'slide')
#print(first_article)
nasa_title = first_article.find('h3').text.strip()
print(nasa_title)
nasa_abstract = first_article.find('div', class_ = 'article_teaser_body').text.strip()
print(nasa_abstract)

How NASA's Perseverance Mars Team Adjusted to Work in the Time of Coronavirus
Like much of the rest of the world, the Mars rover team is pushing forward with its mission-critical work while putting the health and safety of their colleagues and community first.


#### Experiment:  Would the request library be a more straightforward way to get the HTML?

In [None]:
# Persist the desired content

# I could have used "replace", but:
#    On the first round, I would have to manually insert a document.
#    Who knows what has tranpired in my database since last used it?
result = db.News.drop()
result = db.News.insert_one({'title':nasa_title,'abstract':nasa_abstract})

## JPL Mars Space Images - Featured Image

In [18]:
# Visit the site
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)

# I sadly went down the path of using soup to find anchors, buttons, or hrefs, and trying to "click()" soup elements.
# By trial and error and error and error, and with a code sample from Jeab, I realized that clicking links was a
# feature of the Splinter browser, not BeautifulSoup.

# soup = bs(browser.html, 'html.parser')
# body = soup.find('body')
# with open('jpl_body.html','w') as f:
#     f.write(body.prettify())

In [20]:
# By a considerable amount of poking and inspecting in the human way, 
# I found that this pattern of clicks led to the large image.

browser.links.find_by_partial_text('FULL IMAGE').click()
    
browser.links.find_by_partial_text('more info').click()

# Make Soup
soup = bs(browser.html, 'html.parser')
# body = soup3.find('body')
# with open('jpl_more_info.html','w') as f:
#     f.write(body.prettify())
    
featured_image_url = 'https://www.jpl.nasa.gov' + soup.find('figure').find('a')['href']

In [21]:
# Persist the desired content
result = db.Image.drop()
result = db.Image.insert_one({'url':featured_image_url})

## Mars Weather

In [101]:
# Visit the site
url = 'http://twitter.com/marswxreport?lang=en'
browser.visit(url)

In [102]:
# Make Soup
soup = bs(browser.html, 'html.parser', from_encoding="utf-8")

# Struggled with the embedded emoji character, u'\U0001f602'
body = soup.find('body')
#with open('twitter.html','w') as f:
#     f.write(str(body)[0:78200])

# Struggled with a surgical or precise way to find the weather
# Find desired content
mars_weather = ''
spans = body.find_all('span')
for span in spans:
    if 'InSight sol' in span.text:
        mars_weather = span.text.strip()
        break
        
print(mars_weather)

InSight sol 504 (2020-04-27) low -93.6ºC (-136.5ºF) high -6.7ºC (20.0ºF)
winds from the SW at 4.6 m/s (10.3 mph) gusting to 15.0 m/s (33.6 mph)
pressure at 6.80 hPa


In [103]:
# Persist the desired content
result = db.Weather.drop()
result = db.Weather.insert_one({'conditions':mars_weather})

## Mars Facts

In [88]:
# Pull tables from the site
url = 'https://space-facts.com/mars/'
dfs = pd.read_html(url)

# Try the first table
df_mars_facts = dfs[0]
df_mars_facts.columns = ['Attribute','Value']
df_mars_facts

Unnamed: 0,Attribute,Value
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.39 × 10^23 kg (0.11 Earths)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.38 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-87 to -5 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [89]:
# Produce table HTML
html = df_mars_facts.to_html(index=False, justify='center', classes=['table','table-sm','table-striped'])
my_re = re.compile('\n\s+')
html_cleaned = my_re.sub('', html)

In [90]:
# Persist the desired content
result = db.Facts.drop()
result = db.Facts.insert_one({'table_html':html_cleaned})

## Mars Hemispheres

In [91]:
# Visit the site
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

In [92]:
soup = bs(browser.html,'html.parser')
# body = soup.find('body')
# with open('hemispheres.html','w') as f:
#     f.write(body.prettify())


In [93]:
# Refinement of Jeab's code
# divs = soup.find_all('div', class_='item')
# for div in divs:
#     key_word = div.find('h3').text
#     browser.find_link_by_partial_text(key_word).click()
#     img_url = browser.find_link_by_text('Sample').first['href']
#     print(img_url)
#     browser.back()

In [94]:
# Find the desired content
hemisphere_image_urls = list()
divs = soup.find_all('div', class_='description')
for div in divs:
    key_word = div.find('h3').text
#    print(key_word)
    browser.links.find_by_partial_text(key_word).click()
    
    # Now our browser is on the linked page
    soup2 = bs(browser.html,'html.parser')
#     body = soup.find('body')
#     with open(f'{key_word}.html','w') as f:
#         f.write(body.prettify())
    
    downloads = soup2.find('div', class_='downloads')
    img_url = downloads.find('a')['href']
    
    hemisphere_image_urls.append({'title':key_word.replace(' Enhanced',''),'img_url':img_url})

    # Go back to the original page
    browser.back()
    
hemisphere_image_urls

[{'title': 'Cerberus Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg'},
 {'title': 'Schiaparelli Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg'},
 {'title': 'Syrtis Major Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg'},
 {'title': 'Valles Marineris Hemisphere',
  'img_url': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}]

In [95]:
# Persist the desired content
result = db.Hemispheres.drop()
result = db.Hemispheres.insert_one({'title_and_image_list':hemisphere_image_urls})

## Experiment with the Mongo Database
### Work within a database named MarsDB, with this content layout; Collection Name, followed by dictionary keys

In [99]:
x = db.News.find()
print(type(x))
print(x.next)
print(x[0])

# for y in x:
#     print(y)

<class 'pymongo.cursor.Cursor'>
<bound method Cursor.next of <pymongo.cursor.Cursor object at 0x0000022978016588>>
{'_id': ObjectId('5ea5ce471c775750a4d16b74'), 'title': "How NASA's Perseverance Mars Team Adjusted to Work in the Time of Coronavirus", 'abstract': 'Like much of the rest of the world, the Mars rover team is pushing forward with its mission-critical work while putting the health and safety of their colleagues and community first.'}


In [97]:
mars_info = {'News' : db.News.find_one({}, {'_id':False}),
             'Image' : db.Image.find_one({}, {'_id':False}),
             'Weather' : db.Weather.find_one({}, {'_id':False}),
             'Facts' : db.Facts.find_one({}, {'_id':False}),
             'Hemispheres' : db.Hemispheres.find_one({}, {'_id':False})
            }
mars_info

{'News': {'title': "How NASA's Perseverance Mars Team Adjusted to Work in the Time of Coronavirus",
  'abstract': 'Like much of the rest of the world, the Mars rover team is pushing forward with its mission-critical work while putting the health and safety of their colleagues and community first.'},
 'Image': {'url': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA19643_hires.jpg'},
 'Weather': {'conditions': ''},
 'Facts': {'table_html': '<table border="1" class="dataframe table table-sm table-striped"><thead><tr style="text-align: center;"><th>Attribute</th><th>Value</th></tr></thead><tbody><tr><td>Equatorial Diameter:</td><td>6,792 km</td></tr><tr><td>Polar Diameter:</td><td>6,752 km</td></tr><tr><td>Mass:</td><td>6.39 × 10^23 kg (0.11 Earths)</td></tr><tr><td>Moons:</td><td>2 (Phobos &amp; Deimos)</td></tr><tr><td>Orbit Distance:</td><td>227,943,824 km (1.38 AU)</td></tr><tr><td>Orbit Period:</td><td>687 days (1.9 years)</td></tr><tr><td>Surface Temperature:</td><td>-87 t