# In this project we scrape data on Mars from NASA.

In [1]:
import os

from bs4 import BeautifulSoup as bs
import requests


import pandas as pd

## In this section we scrape the NASA Mars news site:
https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest

In [2]:
nasa_mars_file_path = os.path.join("NewsNASAMarsExplorationProgram", "News_NASA_Mars_Exploration_Program.html")
nasa_mars_html = open(nasa_mars_file_path, "r").read()

In [3]:
nasa_news_site_soup = bs(nasa_mars_html, "html.parser")

In [4]:
nasa_news_article_titles_html = nasa_news_site_soup.find_all("div", class_="content_title")
nasa_news_article_paragraphs_html = nasa_news_site_soup.find_all("div", class_="article_teaser_body")

In [5]:
nasa_news_article_titles_list = []
nasa_news_article_paragraphs_list = []

In [6]:
# for nasa_news_article_title, nasa_news_article_paragraph in nasa_news_article_titles_html, nasa_news_article_paragraphs_html:
#     try:
#         nasa_news_article_titles.append(nasa_news_article_title.find("a").text.strip())
#         nasa_news_article_paragraphs.append(nasa_news_article_paragraph.text.replace("\n", ""))
        
#     except:
#         print("Error")

In [7]:
for nasa_news_article_title in nasa_news_article_titles_html:
    try:
        nasa_news_article_titles_list.append(nasa_news_article_title.find("a").text.strip())

        
    except:
        print("Error")

In [8]:
# nasa_news_article_titles_list

In [9]:
for nasa_news_article_paragraph in nasa_news_article_paragraphs_html:
    try:
        nasa_news_article_paragraphs_list.append(nasa_news_article_paragraph.text.replace("\n", ""))
        
    except:
        print("Error")

In [10]:
nasa_news_article_paragraphs_list

["On Nov. 26, NASA's InSight spacecraft will blaze through the Martian atmosphere and set a lander gently on the surface in less time than it takes to cook a hard-boiled egg.",
 "NASA's Mars InSight spacecraft is on track for a soft touchdown on the surface of the Red Planet on Nov. 26.",
 "In studying Mars, NASA's InSight will reveal what makes one planet more or less suitable for life than another.",
 'After a five-year search, NASA has chosen Jezero Crater as the landing site for its upcoming Mars 2020 rover mission.',
 'NASA will host a media teleconference at 9 a.m. PST (noon EST) Monday, Nov. 19, to provide details about the Mars 2020 roverâ€™s landing site on the Red Planet.',
 'On Nov. 26, engineers will look for a combination of signals to determine whether the next spacecraft to Mars lands safely.',
 "NASA's InSight lander is scheduled to touch down on the Red Planet at approximately noon PST  on Nov. 26, with a new suite of instruments to probe below the Martian surface.",
 

In [11]:
dict(zip(nasa_news_article_titles_list, nasa_news_article_paragraphs_list))

{'NASA InSight Landing on Mars: Milestones': "On Nov. 26, NASA's InSight spacecraft will blaze through the Martian atmosphere and set a lander gently on the surface in less time than it takes to cook a hard-boiled egg.",
 'NASA InSight Team on Course for Mars Touchdown': "NASA's Mars InSight spacecraft is on track for a soft touchdown on the surface of the Red Planet on Nov. 26.",
 'What Two Planetary Siblings Can Teach Us About Life': "In studying Mars, NASA's InSight will reveal what makes one planet more or less suitable for life than another.",
 'NASA Announces Landing Site for Mars 2020 Rover': 'After a five-year search, NASA has chosen Jezero Crater as the landing site for its upcoming Mars 2020 rover mission.',
 'NASA to Host Media Call on Next Mars Landing Site': 'NASA will host a media teleconference at 9 a.m. PST (noon EST) Monday, Nov. 19, to provide details about the Mars 2020 roverâ€™s landing site on the Red Planet.',
 'How NASA Will Know When InSight Touches Down': 'On N

## In this section we scrape the URL for a featured image

In [40]:
site_with_featured_image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
featured_image_site = requests.get(featured_image_url)
featured_image_soup = bs(featured_image_site.text, "html.parser")

In [13]:
featured_image_soup

<!DOCTYPE html>

<!--[if IE 9]> <html class="no-js ie ie9" lang="en"> <![endif]-->
<!--[if IE 8]> <html class="no-js ie ie8" lang="en"> <![endif]-->
<html>
<!-- START HEADER: "DEFAULT" -->
<head>
<meta charset="utf-8"/>
<!-- Always force latest IE rendering engine or request Chrome Frame -->
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/> <title>Space Images</title>
<link href="/assets/stylesheets/manifest.css" media="all" rel="stylesheet" type="text/css"/>
<link href="/assets/stylesheets/print.css" media="print" rel="stylesheet" type="text/css"/>
<script src="/assets/javascripts/public_manifest.js" type="text/javascript"></script>
<script src="/assets/javascripts/vendor/jquery.fancybox.js" type="text/javascript"></script>
<script src="/assets/javascripts/vendor/jquery.fancybox-thumbs.js" type="text/javascript"></script>
</head>
<body class="dark_background logge

In [14]:
featured_image_url_extension = featured_image_soup.find("a", class_=["button", "fancybox"]).get("data-fancybox-href")
featured_image_url_extension

'/spaceimages/images/mediumsize/PIA19168_ip.jpg'

In [41]:
featured_image_url = site_with_featured_image_url + featured_image_url_extension
featured_image_url

'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars/spaceimages/images/mediumsize/PIA19168_ip.jpg'

## In this section we scrape tweets from the @MarsWxReport Twitter account: https://twitter.com/marswxreport?lang=en

In [15]:
mars_twitter_page = requests.get("https://twitter.com/marswxreport?lang=en")
mars_twitter_page

<Response [200]>

In [16]:
mars_twitter_page_soup = bs(mars_twitter_page.text, "html.parser")

In [17]:
mars_twitter_page_latest_tweet = mars_twitter_page_soup.find("p", class_="TweetTextSize").text
mars_twitter_page_latest_tweet

'Sol 2242 (2018-11-26), high -2C/28F, low -70C/-93F, pressure at 8.48 hPa, daylight 06:29-18:45'

## In this section we use pandas to directly scrape data from the Mars Facts website: https://space-facts.com/mars/

In [18]:
mars_space_facts_url = "https://space-facts.com/mars/"

In [19]:
mars_space_facts_tables = pd.read_html(mars_space_facts_url)

In [20]:
mars_space_facts_tables

[                      0                              1
 0  Equatorial Diameter:                       6,792 km
 1       Polar Diameter:                       6,752 km
 2                 Mass:  6.42 x 10^23 kg (10.7% Earth)
 3                Moons:            2 (Phobos & Deimos)
 4       Orbit Distance:       227,943,824 km (1.52 AU)
 5         Orbit Period:           687 days (1.9 years)
 6  Surface Temperature:                  -153 to 20 °C
 7         First Record:              2nd millennium BC
 8          Recorded By:           Egyptian astronomers]

In [21]:
type(mars_space_facts_tables)

list

In [22]:
len(mars_space_facts_tables)

1

In [23]:
mars_space_facts_tables[0]

Unnamed: 0,0,1
0,Equatorial Diameter:,"6,792 km"
1,Polar Diameter:,"6,752 km"
2,Mass:,6.42 x 10^23 kg (10.7% Earth)
3,Moons:,2 (Phobos & Deimos)
4,Orbit Distance:,"227,943,824 km (1.52 AU)"
5,Orbit Period:,687 days (1.9 years)
6,Surface Temperature:,-153 to 20 °C
7,First Record:,2nd millennium BC
8,Recorded By:,Egyptian astronomers


In [24]:
mars_space_facts_html_table_string = mars_space_facts_tables[0].to_html()

In [25]:
mars_space_facts_html_table_string

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>0</th>\n      <th>1</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.42 x 10^23 kg (10.7% Earth)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>227,943,824 km (1.52 AU)</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>Orbit Period:</td>\n      <td>687 days (1.9 years)</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>Surface Temperature:</td>\n      <td>-153 to 20 °C</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>First Record:</td>\n      <td>2nd millennium BC</td>\n    

## In this section we visit the USGS Astrogeology site to obtain high resolution images for each of Mars's hemispheres.

In [26]:
cerebrus_hemisphere_site = requests.get("https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced")
cerebrus_hemisphere_site

<Response [200]>

In [27]:
cerebrus_hemisphere_soup = bs(cerebrus_hemisphere_site.text, "html.parser")

In [28]:
schiaparelli_hemisphere_site = requests.get("https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced")
schiaparelli_hemisphere_site

<Response [200]>

In [29]:
schiaparelli_hemisphere_soup = bs(schiaparelli_hemisphere_site.text, "html.parser")

In [30]:
syrtis_major_hemisphere_site = requests.get("https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced")
syrtis_major_hemisphere_site

<Response [200]>

In [31]:
syrtis_major_hemisphere_soup = bs(syrtis_major_hemisphere_site.text, "html.parser")

In [32]:
valles_marineris_hemisphere_site = requests.get("https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced")
valles_marineris_hemisphere_site

<Response [200]>

In [33]:
valles_marineris_hemisphere_soup = bs(valles_marineris_hemisphere_site.text, "html.parser")

In [34]:
mars_hemisphere_soups_list = [cerebrus_hemisphere_soup, schiaparelli_hemisphere_soup, syrtis_major_hemisphere_soup, valles_marineris_hemisphere_soup]

In [35]:
mars_hemisphere_images_list = [mars_hemisphere.find("div", class_="downloads") for mars_hemisphere in mars_hemisphere_soups_list]

In [36]:
mars_hemisphere_image_urls_list = [mars_hemisphere_image.find("a").get("href") for mars_hemisphere_image in mars_hemisphere_images_list]
mars_hemisphere_image_urls_list

['http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg']

In [37]:
mars_hemisphere_names_list = ["Cerberus", "Schiaparelli", "Syrtis_Major", "Valles_Marineris"] 

In [38]:
mars_hemisphere_name_and_image_urls_dictionary = dict(zip(mars_hemisphere_names_list, mars_hemisphere_image_urls_list))
mars_hemisphere_name_and_image_urls_dictionary

{'Cerberus': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg',
 'Schiaparelli': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg',
 'Syrtis_Major': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg',
 'Valles_Marineris': 'http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg'}

In [39]:
# mars_hemisphere_name_and_image_urls_dictionary = {"Name": hemisphere_name, "image_url": hemisphere_image_url \
#                                                   for hemisphere_name, hemisphere_image_url \
#                                                   in mars_hemisphere_names_list, mars_hemisphere_image_urls_list}