In [1]:
from bs4 import BeautifulSoup as bs
from splinter import Browser
import requests, sys, io, re
import pandas as pd

# Scraping

In [2]:
# NASA Mars news listing page; the news title and teaser are scraped from it.
nasa_news_url = "https://mars.nasa.gov/news/"

In [3]:
# Start a headless Chrome session and load the NASA Mars news page.
browser = Browser('chrome', headless=True)
browser.visit(nasa_news_url)

In [4]:
# Grab the HTML of the page currently loaded in the browser session.
html = browser.html

In [5]:
# Parse the captured HTML with BeautifulSoup using the lxml parser.
soup=bs(html,'lxml')

In [6]:
# find() returns only the first match, so this is the first div.list_text
# on the news page.
latest_list_item = soup.find("div", class_="list_text")

In [7]:
# Pull the headline and teaser text out of the selected news list item.
title_div = latest_list_item.find("div", class_="content_title")
teaser_div = latest_list_item.find("div", class_="article_teaser_body")
news_title = title_div.text
news_p = teaser_div.text

print(f"{news_title} : {news_p}")

Robotic Toolkit Added to NASA's Mars 2020 Rover : The bit carousel, which lies at the heart of the rover's Sample Caching System, is now aboard NASA's newest rover. 


In [8]:
# Close the Chrome session that was opened for the news page.
browser.quit()

# JPL Mars Space Images - Featured Image

In [9]:
# Image listing page: https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars
# base_url is kept separate because the scraped image paths are site-relative.
base_url = "https://www.jpl.nasa.gov"
mars_images_link = base_url + "/spaceimages/?search=&category=Mars"

In [10]:
# Open a new headless Chrome session and load the JPL Mars image listing page.
browser = Browser('chrome', headless=True)
browser.visit(mars_images_link)

In [11]:
# Grab the HTML of the image listing page from the browser session.
html = browser.html

In [12]:
# Parse the listing page HTML with BeautifulSoup using the lxml parser.
soup=bs(html,'lxml')

In [13]:
# Select the first <footer> element of the page; the featured image link is
# read from an anchor inside it.
result = soup.find("footer")

In [14]:
# The first anchor in the footer carries the site-relative detail page path
# in its data-link attribute.
footer_anchor = result.find("a")
relative_img_url = footer_anchor["data-link"]
print(relative_img_url)

/spaceimages/details.php?id=PIA19643


In [15]:
# Turn the site-relative detail path into an absolute URL.
data_link = base_url + relative_img_url
print(data_link)

https://www.jpl.nasa.gov/spaceimages/details.php?id=PIA19643


In [16]:
# Reuse the Chrome session opened for the image listing page to load the
# image detail page.
browser.visit(data_link)

In [17]:
# Capture and parse the HTML of the image detail page.
html = browser.html
soup = bs(html, 'lxml')

In [18]:
# Drill down article -> figure.lede -> first anchor; its href is the
# site-relative path of the large image,
# e.g. /spaceimages/images/largesize/PIA17175_hires.jpg
article_tag = soup.find("article")
lede_figure = article_tag.find("figure", class_="lede")
hires_image_url = lede_figure.find("a")["href"]


In [19]:
# Absolute URL of the featured image.
featured_image_url = base_url + hires_image_url
print(featured_image_url)

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA19643_hires.jpg


# Mars Weather

In [20]:
# Twitter timeline that the Mars weather report is scraped from.
twitter_link = "https://twitter.com/" + "marswxreport?lang=en"

In [21]:
# Fetch the timeline page with a plain HTTP GET (no browser involved).
# NOTE(review): no timeout is passed, so this call can block indefinitely
# if the server never answers.
response = requests.get(twitter_link)

In [22]:
# Show the response object to confirm the HTTP status of the request.
print(response)

<Response [200]>


In [23]:
# Parse the response body with BeautifulSoup using the lxml parser.
soup = bs(response.text, 'lxml')

In [24]:
# Every div.content element is a candidate tweet container.
results = soup.find_all('div', class_="content")
# Holds the weather report text; stays empty if no matching tweet is found.
mars_weather=""

In [25]:
# Walk the scraped tweet containers in page order and keep the first one that
# carries an InSight weather report ("InSight sol ..."). The report text is
# stored in mars_weather with the text of any embedded timeline link removed.
mars_weather = ""  # reset so re-running this cell never shows a stale value
for result in results:
    if "InSight sol" not in result.text:
        continue
    tweet_text = result.find("p", class_="tweet-text")
    if tweet_text is None:
        # The container mentions a report but has no tweet body; keep looking.
        continue
    # Store the report whether or not a link is attached; previously the
    # no-link case only printed the tweet and left mars_weather empty.
    mars_weather = tweet_text.text
    tweet_link = result.find("a", class_="twitter-timeline-link")
    if tweet_link:
        mars_weather = mars_weather.replace(tweet_link.text, "")
    break
print(mars_weather)

InSight sol 261 (2019-08-21) low -102.4ºC (-152.4ºF) high -26.6ºC (-15.8ºF)
winds from the SSE at 4.9 m/s (11.0 mph) gusting to 16.0 m/s (35.8 mph)
pressure at 7.70 hPa


# Mars Facts

In [26]:
# Page whose HTML tables hold the Mars facts.
mars_facts_url = "https://" + "space-facts.com/mars/"

In [27]:
# Let pandas fetch the page and parse the tables it finds; the printed type
# shows that the result is a list.
tables = pd.read_html(mars_facts_url)
print(type(tables))

<class 'list'>


In [28]:
# Work with the second parsed table and give its two positional columns
# readable labels. The rename happens in place, so tables[1] is relabelled too.
column_labels = {0: "Mars Statistic Type", 1: "Mars Statistic Value"}
df = tables[1]
df.rename(columns=column_labels, inplace=True)

In [29]:
# In-memory text buffer that receives the table's HTML markup.
html_buf = io.StringIO()

In [30]:
# Render the facts table as HTML into the in-memory buffer, tagging the
# <table> with CSS classes and an id.
df.to_html(
    buf=html_buf,
    classes="table table-bordered table-hover",
    table_id="id_table_data",
    justify="inherit",
)

In [31]:
# Persist the rendered table markup to disk. The context manager closes the
# file even if the write raises, which the manual open()/close() pair did not.
with open("mars_statistics_data.html", "w") as data_html_fp:
    data_html_fp.write(html_buf.getvalue())

# Mars Hemispheres

In [32]:
# USGS Astrogeology search for the enhanced Mars hemisphere images.
# Note: this rebinds base_url, which held the JPL site in the earlier cells.
base_url = "https://astrogeology.usgs.gov"
search_hemi_path = "/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
mars_hemispheres_pics_url = f"{base_url}{search_hemi_path}"

In [33]:
# Fetch the hemisphere search results page with a plain HTTP GET.
# NOTE(review): no timeout is passed, so this call can block indefinitely
# if the server never answers.
response = requests.get(mars_hemispheres_pics_url)

In [34]:
# Show the response object to confirm the HTTP status of the request.
print(response)

<Response [200]>


In [35]:
# Parse the search results page with BeautifulSoup using the lxml parser.
soup = bs(response.text, 'lxml')

In [36]:
# Maps each hemisphere title to the URL of its full-size ("Original") image.
hemisphere_image_urls = {}

for hemi_div in soup.find_all("div", class_="item"):

    ### The first anchor of each search result holds the site-relative path
    ### of the hemisphere's detail page; prefix it with base_url to get the
    ### full detail page URL.

    hemi_a = hemi_div.find("a")
    hemi_full_page_url = base_url + hemi_a["href"]
    print(hemi_full_page_url)

    ### Fetch the detail page and parse it to get the hemisphere title and
    ### the hemisphere image link. A separate name is used for this response
    ### so the search page response held in `response` is not overwritten.

    hemi_response = requests.get(hemi_full_page_url)
    hemi_soup = bs(hemi_response.text, 'lxml')

    title = hemi_soup.find("h2").text

    ### There are 2 places on the page that give the image location:
    ### one in the downloads section with link text "Original",
    ### and a second one in the page contents.
    ### This code scans the anchors and takes the href of the first one
    ### whose link text is "Original", then stops scanning that page.

    for links in hemi_soup.find_all("a"):
        if links.text == "Original":
            hemisphere_image_urls[title] = links["href"]
            break

https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced
https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced
https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced
https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced


In [37]:
# List every collected hemisphere title together with its image URL.
for hemisphere_title in hemisphere_image_urls:
    image_url = hemisphere_image_urls[hemisphere_title]
    print(f"Title = {hemisphere_title}\nURL = {image_url}\n\n")

Title = Cerberus Hemisphere Enhanced
URL = http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif


Title = Schiaparelli Hemisphere Enhanced
URL = http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif


Title = Syrtis Major Hemisphere Enhanced
URL = http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif


Title = Valles Marineris Hemisphere Enhanced
URL = http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif




In [38]:
# Close the Chrome session that was opened for the JPL image pages; the
# requests-based hemisphere scraping above does not use it.
browser.quit()

# Alternate way of scraping the image links

In [39]:
# Result list for the browser-based approach: one {"name", "image_link"}
# record is appended per hemisphere. image_record starts out empty and is
# rebuilt for each hemisphere later on.
alt_hemisphere_image_urls = list()
image_record = dict()

In [40]:
# Open a dedicated headless Chrome session on the hemisphere search page.
hemi_browser=Browser('chrome', headless = True)
hemi_browser.visit(mars_hemispheres_pics_url)

In [41]:
# Capture and parse the HTML of the hemisphere search page.
hemi_html = hemi_browser.html
hemi_soup = bs(hemi_html, 'lxml')

In [42]:
#### Work out the detail page URL of every hemisphere listed on the search
#### page, open each one in its own headless browser instance, and keep the
#### browser handles in url_link for the parsing step that follows.

url_link = []
for result_item in hemi_soup.find_all("div", class_="item"):
    detail_path = result_item.find("a")["href"]
    individual_hemi_url = base_url + detail_path
    print(f"Hemisphere Detail URL ==> {individual_hemi_url}")
    child_browser = Browser("chrome", headless=True)
    child_browser.visit(individual_hemi_url)
    url_link.append(child_browser)

Hemisphere Detail URL ==> https://astrogeology.usgs.gov/search/map/Mars/Viking/cerberus_enhanced
Hemisphere Detail URL ==> https://astrogeology.usgs.gov/search/map/Mars/Viking/schiaparelli_enhanced
Hemisphere Detail URL ==> https://astrogeology.usgs.gov/search/map/Mars/Viking/syrtis_major_enhanced
Hemisphere Detail URL ==> https://astrogeology.usgs.gov/search/map/Mars/Viking/valles_marineris_enhanced


In [43]:
#### url_link holds one browser instance per hemisphere, each already showing
#### that hemisphere's detail page.

#### For each browser, parse the page HTML to get the hemisphere title and the
#### href of the first anchor whose text is "Original" (the full-size image),
#### and append them as a record to alt_hemisphere_image_urls.

#### The browsers are closed in a finally block so that every one of them is
#### shut down even if parsing one of the pages raises.

try:
    for child_browser in url_link:
        child_soup = bs(child_browser.html, 'lxml')
        child_title = child_soup.find("h2").text
        for a_records in child_soup.find_all("a"):
            if a_records.text == "Original":
                image_record = {
                    "name": child_title,
                    "image_link": a_records["href"],
                }
                alt_hemisphere_image_urls.append(image_record)
                break
finally:
    for child_browser in url_link:
        child_browser.quit()

In [44]:
# List every collected hemisphere name together with its image link.
for hemisphere in alt_hemisphere_image_urls:
    name, link = hemisphere["name"], hemisphere["image_link"]
    print(f"{name} ==>> {link}")

Cerberus Hemisphere Enhanced ==>> http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif
Schiaparelli Hemisphere Enhanced ==>> http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif
Syrtis Major Hemisphere Enhanced ==>> http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif
Valles Marineris Hemisphere Enhanced ==>> http://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif


In [45]:
# Close the Chrome session that was opened on the hemisphere search page.
hemi_browser.quit()