In [1]:
from bs4 import BeautifulSoup as bs
from splinter import Browser
import requests, sys, io, re
import pandas as pd

# Scraping

In [38]:
# Landing page that the news-scraping cells below load and parse.
nasa_news_url = "https://mars.nasa.gov/news/"

In [39]:
# Start a headless Chrome session and load the news page.
browser = Browser('chrome', headless=True)
browser.visit(nasa_news_url)

# NOTE(review): the article list appears to be rendered client-side (a real
# browser is used here instead of requests) — confirm. When the notebook is
# run top to bottom, the next cell can read browser.html before any
# "div.list_text" entry exists, so wait for the first one and fail with a
# clear message rather than a later AttributeError on None.
if not browser.is_element_present_by_css("div.list_text", wait_time=10):
    raise RuntimeError(
        "NASA news page did not show any 'div.list_text' entry within 10 seconds"
    )

In [40]:
# Snapshot the markup of the page currently loaded in the browser session.
html = browser.html

In [41]:
# Parse the captured news-page markup with the lxml parser.
soup = bs(html, features="lxml")

In [42]:
# find() returns the first match in document order, i.e. the first
# "list_text" entry on the page.
latest_list_item = soup.find("div", attrs={"class": "list_text"})

In [43]:
# Locate the headline and teaser containers inside the selected entry, then
# keep only their text.
title_div = latest_list_item.find("div", class_="content_title")
teaser_div = latest_list_item.find("div", class_="article_teaser_body")
news_title = title_div.get_text()
news_p = teaser_div.get_text()

print(f"{news_title} : {news_p}")

Robotic Toolkit Added to NASA's Mars 2020 Rover : The bit carousel, which lies at the heart of the rover's Sample Caching System, is now aboard NASA's newest rover. 


# JPL Mars Space Images - Featured Image

In [44]:
# Target page: https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars
# base_url is kept separate because the links scraped further down are
# site-relative and get prefixed with it.
base_url = "https://www.jpl.nasa.gov"
mars_images_link = base_url + "/spaceimages/?search=&category=Mars"

In [45]:
# Reuse the headless Chrome session opened for the news page instead of
# launching a second one. Rebinding `browser` to a new instance would orphan
# the first Chrome process: the single browser.quit() at the end of the
# notebook only closes whatever `browser` points to at that moment.
browser.visit(mars_images_link)

In [46]:
# Snapshot the markup of the space-images page now loaded in the browser.
html = browser.html

In [47]:
# Parse the captured space-images markup with the lxml parser.
soup = bs(html, features="lxml")

In [48]:
# The first <footer> element holds the link that the next cell reads.
result = soup.find(name="footer")

In [49]:
# The first anchor in the footer carries the site-relative details-page path
# in its "data-link" attribute.
footer_anchor = result.find("a")
relative_img_url = footer_anchor["data-link"]
print(relative_img_url)

/spaceimages/details.php?id=PIA16726


In [50]:
# Turn the site-relative path into an absolute URL for the details page.
data_link = base_url + relative_img_url
print(data_link)

https://www.jpl.nasa.gov/spaceimages/details.php?id=PIA16726


In [51]:
# Load the image details page in the browser session that is already open.
browser.visit(data_link)

In [52]:
# Capture and parse the details page that was just loaded.
html = browser.html
soup = bs(html, features="lxml")

In [53]:
# Walk <article> -> <figure class="lede"> -> first <a>; its href is the
# site-relative path of the image, e.g.
# /spaceimages/images/largesize/<image id>_hires.jpg
article = soup.find("article")
lede_figure = article.find("figure", class_="lede")
lede_anchor = lede_figure.find("a")
hires_image_url = lede_anchor["href"]


In [54]:
# Absolute URL of the featured image.
featured_image_url = base_url + hires_image_url
print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16726_hires.jpg


# Mars Weather

In [2]:
# Timeline page that the weather cells below download and parse.
twitter_link = (
    "https://twitter.com/marswxreport"
    "?lang=en"
)

In [3]:
# Download the timeline page. requests has no default timeout, so without one
# a stalled connection would hang this cell indefinitely.
response = requests.get(twitter_link, timeout=30)
# Stop here on an HTTP error instead of parsing an error page further down.
response.raise_for_status()

In [4]:
# Show the response object (its repr includes the HTTP status code).
print(response)

<Response [200]>


In [5]:
# Parse the downloaded timeline markup with the lxml parser.
soup = bs(response.text, features="lxml")

In [6]:
# Default used when no weather tweet is found by the loop in the next cell.
mars_weather = ""
# Every <div class="content"> on the page is a candidate tweet container.
results = soup.find_all("div", attrs={"class": "content"})

In [7]:
# Walk the tweet containers in page order and keep the text of the first one
# that mentions an InSight weather report.
for result in results:
    if "InSight sol" not in result.text:
        continue

    tweet_text = result.find("p", class_="tweet-text")
    if tweet_text is None:
        # The marker appeared somewhere in the container but there is no
        # tweet body to read; keep looking.
        continue

    # Store the report whether or not the tweet carries a link. Previously a
    # matching tweet without a link was only printed, which left mars_weather
    # empty.
    mars_weather = tweet_text.text

    tweet_link = result.find("a", class_="twitter-timeline-link")
    if tweet_link is not None:
        # The link's text is embedded in the tweet text; drop it.
        mars_weather = mars_weather.replace(tweet_link.text, "")
    break

print(mars_weather)

InSight sol 258 (2019-08-18) low -100.0ºC (-148.1ºF) high -26.2ºC (-15.2ºF)
winds from the SSE at 5.3 m/s (11.9 mph) gusting to 16.8 m/s (37.6 mph)
pressure at 7.60 hPa


# Mars Facts

In [8]:
# Page whose HTML tables are read with pandas in the next cell.
mars_facts_url = "https://" + "space-facts.com" + "/mars/"

In [9]:
# read_html downloads the page and returns one DataFrame per <table> found.
tables = pd.read_html(io=mars_facts_url)
print(type(tables))

<class 'list'>


In [10]:
# Take the second parsed table and rename the columns labelled 0 and 1.
column_labels = {0: "Mars Statistic Type", 1: "Mars Statistic Value"}
df = tables[1]
df.rename(columns=column_labels, inplace=True)

In [11]:
# In-memory text buffer that receives the rendered HTML table.
html_buf = io.StringIO()

In [29]:
# Start from an empty buffer every time this cell runs. StringIO appends, so
# rendering into the buffer left over from a previous run would stack
# duplicate copies of the table, and the next cell writes the whole buffer to
# disk.
html_buf = io.StringIO()
df.to_html(
    buf=html_buf,
    classes="table table-bordered table-hover",
    table_id="id_table_data",
    justify="inherit",
)

In [30]:
# Write the rendered table to disk. The context manager closes the file even
# if the write raises, and the explicit encoding keeps the output independent
# of the platform's default.
with open("mars_statistics_data.html", "w", encoding="utf-8") as data_html_fp:
    data_html_fp.write(html_buf.getvalue())

# Mars Hemispheres

In [31]:
# Web-archive snapshot of the USGS search results for the Mars hemispheres.
mars_hemispheres_pics_url = (
    "https://web.archive.org/web/20181114171728/"
    "https://astrogeology.usgs.gov/search/results"
    "?q=hemisphere+enhanced&k1=target&v1=Mars"
)

In [32]:
# Download the archived search-results page. requests has no default timeout,
# so without one a stalled connection would hang this cell indefinitely.
response = requests.get(mars_hemispheres_pics_url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page further down.
response.raise_for_status()

In [33]:
# Parse the downloaded search-results markup with the lxml parser.
soup = bs(response.text, features="lxml")

In [34]:
# Maps each hemisphere title to the URL of its "Original" image download.
hemisphere_image_urls = {}
# Greedy match from the start of the string up to and including the LAST
# occurrence of "https".
regex = re.compile("^.+https")

for hemi_div in soup.find_all("div", class_="item"):

    ### The hemisphere page link is the href of the item's <a> tag.
    ### Because the source page is a web-archive URL, an extra archive prefix
    ### sits in front of the actual link. The regex removes everything up to
    ### and including the last "https", and the scheme is then put back.

    hemi_a = hemi_div.find("a")
    if hemi_a is None or not hemi_a.get("href"):
        # Item without a usable link: there is no page to follow.
        continue
    hemi_full_page_url = "https" + regex.sub("", hemi_a["href"])

    ### Fetch the hemisphere page and parse it to get the hemisphere title
    ### and the image link. The timeout keeps a stalled connection from
    ### hanging the loop; an HTTP error stops the run instead of letting an
    ### error page be parsed.

    response = requests.get(hemi_full_page_url, timeout=30)
    response.raise_for_status()
    hemi_soup = bs(response.text, 'lxml')

    title = hemi_soup.find("h2").text

    ### The image location is available in two places: in the downloads
    ### section, as a link whose text is "Original", and in the page
    ### contents. The first <a> whose text is "Original" is used, and the
    ### scan stops as soon as it is found.

    for links in hemi_soup.find_all("a"):
        if links.text == "Original":
            hemisphere_image_urls[title] = links["href"]
            break

In [35]:
# Print every collected hemisphere title with its image URL.
for title in hemisphere_image_urls:
    url = hemisphere_image_urls[title]
    print(f"Title = {title}\nURL = {url}\n\n")

Title = Cerberus Hemisphere Enhanced
URL = http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif


Title = Schiaparelli Hemisphere Enhanced
URL = http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif


Title = Syrtis Major Hemisphere Enhanced
URL = http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif


Title = Valles Marineris Hemisphere Enhanced
URL = http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif




In [36]:
# Close the browser session that `browser` currently refers to.
browser.quit()