In [1]:
#import dependencies
from bs4 import BeautifulSoup
import pandas as pd
from splinter import Browser
import time
import lxml
import pymongo

In [2]:
# https://splinter.readthedocs.io/en/latest/drivers/chrome.html
!which chromedriver

/usr/local/bin/chromedriver


In [3]:
#set up executable path and create splinter browser to load pages
executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
browser = Browser("chrome", **executable_path, headless=False)

In [4]:
#visit nasa's mars news site
url = "https://mars.nasa.gov/news/"
browser.visit(url)
time.sleep(1)

In [5]:
#scrape latest article title/description
html = browser.html
soup = BeautifulSoup(html,'html.parser')

article_list = soup.find("ul",class_="item_list")
news_title = article_list.find("div",class_="content_title").text
news_p = article_list.find("div",class_="article_teaser_body").text
print(news_title)
print(news_p)

After a Reset, Curiosity Is Operating Normally
NASA's Mars rover Curiosity is in good health but takes a short break while engineers diagnose why it reset its computer. 


In [6]:
#visit jpl's mars images site and navigate to featured image
url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url)
time.sleep(1)
browser.find_by_css('div.carousel_items a.fancybox').click()
time.sleep(1)

In [7]:
#scrape featured image url
html = browser.html
soup = BeautifulSoup(html,'html.parser')
featured_image = soup.find('img',class_='fancybox-image')
featured_image_url = "https://www.jpl.nasa.gov" + str(featured_image).split(" ")[2].split('"')[1]
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA18640_ip.jpg'

In [8]:
#scrape tweet from mars weather twitter account
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url)
time.sleep(1)
mars_weather = browser.find_by_css('div.js-tweet-text-container p.tweet-text').text
mars_weather

'Curiosity is again operating normally following a boot problem first experienced last Friday. Look for more Gale Crater weather conditions soon.\nhttps://www.jpl.nasa.gov/news/news.php?feature=7339 …'

In [9]:
#scrape table from mars facts
url = "https://space-facts.com/mars/"
browser.visit(url)
time.sleep(1)
html = browser.html
soup = BeautifulSoup(html,'lxml')
table = soup.find_all('table')[0]

mars_facts_df_list = pd.read_html(str(table))

for dictionary in mars_facts_df_list:
    dictionary["description"] = dictionary.pop(0)
    dictionary["value"] = dictionary.pop(1)
mars_facts_dict_list = mars_facts_df_list[0].to_json(orient='records')

In [10]:
url = "https://space-facts.com/mars/"
mars_facts_df = pd.read_html(url)[0]
mars_facts_df = mars_facts_df.rename(index=str, columns={0: "Description", 1: "Value"})
mars_facts_html = mars_facts_df.to_html(index='False',index_names='False')
mars_facts_html

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Description</th>\n      <th>Value</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>2nd millennium

In [11]:
mars = {
    "news_title": news_title,
    "news_p": news_p,
    "featured_image_url": featured_image_url,
    "mars_weather": mars_weather,
    "mars_facts": mars_facts_html
}

In [12]:
client = pymongo.MongoClient()
db = client.mars_db
collection = db.mars_collection

In [14]:
#clear out contents of db before inserting
db.collection.delete_many({})

<pymongo.results.DeleteResult at 0x11db6c1c8>

In [15]:
db.collection.insert_one(mars)

<pymongo.results.InsertOneResult at 0x11ca94e48>

In [16]:
print(list(db.collection.find()))

[{'_id': ObjectId('5c7357b17e531b03796c6222'), 'news_title': 'After a Reset, Curiosity Is Operating Normally', 'news_p': "NASA's Mars rover Curiosity is in good health but takes a short break while engineers diagnose why it reset its computer. ", 'featured_image_url': 'https://www.jpl.nasa.gov/spaceimages/images/mediumsize/PIA18640_ip.jpg', 'mars_weather': 'Curiosity is again operating normally following a boot problem first experienced last Friday. Look for more Gale Crater weather conditions soon.\nhttps://www.jpl.nasa.gov/news/news.php?feature=7339 …', 'mars_facts': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Description</th>\n      <th>Value</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>M