In [1]:
# Scraping techniques

In [2]:
## Scraping interactive webpages

In [3]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install splinter

Note: you may need to restart the kernel to use updated packages.


In [5]:
from webdriver_manager.chrome import ChromeDriverManager
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd

In [6]:
# Let webdriver_manager locate the ChromeDriver binary and hand its path to
# splinter; headless=False asks for a regular (non-headless) Chrome session.
driver_path = ChromeDriverManager().install()
executable_path = dict(executable_path=driver_path)
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280


 


[WDM] - Driver [/Users/june/.wdm/drivers/chromedriver/mac64/87.0.4280.88/chromedriver] found in cache


In [7]:
## Scrape the most recent news article's title and content

In [8]:
# Every value scraped by the cells below is collected in this dictionary
scrape_data = dict()

# Open the NASA Mars news page and echo its title to confirm the page loaded
url = 'https://mars.nasa.gov/news/'
browser.visit(url)
print(browser.title)

News – NASA’s Mars Exploration Program


In [9]:
# Optional delay: wait (wait_time=1) for a news slide to be present inside the
# item list before reading the page. The argument is a CSS selector, so the
# CSS-based presence check is used; the tag-based check expects a bare tag
# name such as "ul".
browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

True

In [10]:
# Take the page source currently held by the browser and parse it with
# Python's built-in HTML parser
html = browser.html
soup = BeautifulSoup(markup=html, features='html.parser')

In [11]:
# Select the first news item on the page. "ul.item_list li.slide" is a
# descendant selector, so this returns the first <li class="slide"> inside the
# item list; a comma-separated selector would match either element and return
# the enclosing <ul> instead.
first_element = soup.select_one("ul.item_list li.slide")

In [12]:
# Headline of the most recent article: text of its "content_title" <div>
title_div = first_element.find('div', class_="content_title")
first_element_title = title_div.get_text()
print(first_element_title)
scrape_data['title'] = first_element_title

NASA Moves Forward With Campaign to Return Mars Samples to Earth


In [13]:
# Teaser paragraph of the most recent article: text of its
# "article_teaser_body" <div>
teaser_div = first_element.find('div', class_="article_teaser_body")
first_element_content = teaser_div.get_text()
print(first_element_content)
scrape_data['content'] = first_element_content

During this next phase, the program will mature critical technologies and make critical design decisions as well as assess industry partnerships.


In [14]:
## Scrape the FULL-IMAGE URL from the given link

In [15]:
# Open the JPL space-images page filtered to the Mars category
url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
browser.visit(url)
# Click the element whose id is "full_image"
full_image= browser.find_by_id("full_image")
full_image.click()
# Give the page a moment (wait_time=1) for the 'more info' text to show up;
# the boolean result is not checked
browser.is_element_present_by_text('more info', wait_time=1)
# Follow the link whose text contains 'more info'
more_info = browser.links.find_by_partial_text('more info')
more_info.click()

In [16]:
# Re-read the page source now shown in the browser and parse it again with
# Python's built-in HTML parser
html = browser.html
soup = BeautifulSoup(markup=html, features='html.parser')

In [17]:
# Read the src attribute of the image inside the lede figure's link
lede_image = soup.select_one('figure.lede a img')
img_url_rel = lede_image.get("src")
# Prefix the scraped path with the JPL host to obtain the full image URL
img_url = 'https://www.jpl.nasa.gov' + img_url_rel
scrape_data['image_url'] = img_url
print(img_url)

https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16220_hires.jpg


In [18]:
## Scrape the Mars facts table as an HTML table from the given link

In [19]:
# Let pandas parse the tables found at the URL and keep the first one
url = 'http://space-facts.com/mars/'
tables = pd.read_html(url)
df = tables[0]
# Name the two columns of the facts table
df.columns = ['Description', 'Value']
print(df)
# Render the frame as an HTML <table> string and store it with the other results
html_table = df.to_html()
scrape_data['table'] = html_table

            Description                          Value
0  Equatorial Diameter:                       6,792 km
1       Polar Diameter:                       6,752 km
2                 Mass:  6.39 × 10^23 kg (0.11 Earths)
3                Moons:            2 (Phobos & Deimos)
4       Orbit Distance:       227,943,824 km (1.38 AU)
5         Orbit Period:           687 days (1.9 years)
6  Surface Temperature:                   -87 to -5 °C
7         First Record:              2nd millennium BC
8          Recorded By:           Egyptian astronomers


In [20]:
## Scrape the Mars hemisphere URLs from the given link

In [21]:
# Open the USGS Astrogeology search results for "hemisphere enhanced" with target Mars
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

In [22]:
# Build one {'img_url', 'title'} record per hemisphere heading on the results page
hemisphere_info = []
links = browser.find_by_css("a.product-item h3")
hemisphere_count = len(links)
for position in range(hemisphere_count):
    # The headings are looked up again on every pass because the browser has
    # navigated away and back since the previous lookup
    browser.find_by_css("a.product-item h3")[position].click()
    # On the detail page, the 'Sample' link's href is stored as the image URL
    sample_link = browser.links.find_by_text('Sample').first
    hemisphere_info.append({
        'img_url': sample_link['href'],
        'title': browser.find_by_css("h2.title").text,
    })
    # Go back to the results page before handling the next hemisphere
    browser.back()
scrape_data['hemisphere_info'] = hemisphere_info
print(hemisphere_info)

[{'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/cerberus_enhanced.tif/full.jpg', 'title': 'Cerberus Hemisphere Enhanced'}, {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/schiaparelli_enhanced.tif/full.jpg', 'title': 'Schiaparelli Hemisphere Enhanced'}, {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/syrtis_major_enhanced.tif/full.jpg', 'title': 'Syrtis Major Hemisphere Enhanced'}, {'img_url': 'https://astropedia.astrogeology.usgs.gov/download/Mars/Viking/valles_marineris_enhanced.tif/full.jpg', 'title': 'Valles Marineris Hemisphere Enhanced'}]


In [23]:
# Show everything collected in the scrape_data dictionary
print(scrape_data)

{'title': 'NASA Moves Forward With Campaign to Return Mars Samples to Earth', 'content': 'During this next phase, the program will mature critical technologies and make critical design decisions as well as assess industry partnerships.', 'image_url': 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16220_hires.jpg', 'table': '<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Description</th>\n      <th>Value</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Equatorial Diameter:</td>\n      <td>6,792 km</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Polar Diameter:</td>\n      <td>6,752 km</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>Mass:</td>\n      <td>6.39 × 10^23 kg (0.11 Earths)</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>Moons:</td>\n      <td>2 (Phobos &amp; Deimos)</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>Orbit Distance:</td>\n      <td>

In [24]:
# Shut down the automated browser session now that scraping is finished
browser.quit()