# Set up Splinter and Beautiful Soup


In [1]:
# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

# Import Pandas
import pandas as pd

## set up Chrome 

In [2]:
# Start the Splinter-driven Chrome session used by the scraping cells that follow.
# headless=False means that all of the browser's actions will be displayed
# in a Chrome window so we can see them.

# ChromeDriverManager().install() provides the chromedriver location that is
# handed to Browser through the 'executable_path' keyword.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=False)

[WDM] - 

[WDM] - Current google-chrome version is 95.0.4638
[WDM] - Get LATEST driver version for 95.0.4638
[WDM] - Driver [C:\Users\szieg\.wdm\drivers\chromedriver\win32\95.0.4638.54\chromedriver.exe] found in cache


## set up site to scrape

In [3]:
# With the following line, browser.is_element_present_by_css('div.list_text', wait_time=1), 
# we are accomplishing two things.

# One is that we're searching for elements with a specific combination
# of tag (div) and class (list_text). As an example,
# ul.item_list would be found in HTML as <ul class="item_list">.

# Secondly, we're also telling our browser to wait up to one second for the element to appear.
# The optional wait is useful because sometimes dynamic pages take a little while to load,
# especially if they are image-heavy.

In [4]:
# Visit the Mars NASA news site
url = 'https://redplanetscience.com'
browser.visit(url)
# Optional wait for the page to load: check for a <div class="list_text">
# element, allowing up to one second (wait_time=1) for it to appear.
# As the last expression in the cell, the boolean result is displayed.
browser.is_element_present_by_css('div.list_text', wait_time=1)

True

## Set up HTML parser/slide_elem

In [5]:
# Take the HTML of the page currently loaded in the browser and parse it
# with BeautifulSoup (imported as `soup`) using the 'html.parser' parser.
html = browser.html
news_soup = soup(html, 'html.parser')
# slide_elem is the first element matching 'div.list_text'; it serves as the
# parent element for the title and summary lookups.
slide_elem = news_soup.select_one('div.list_text')

In [6]:
# Notice how we've assigned slide_elem as the variable to look for the <div /> tag
# and its descendants (the other tags within the <div /> element)?
# This is our parent element. This means that this element holds all of the other elements
# within it, and we'll reference it when we want to filter search results even further.
# The . is used for selecting classes, such as list_text, so the code 'div.list_text' 
# pinpoints the <div /> tag with the class of list_text. 

# CSS selectors are read from right to left: the last part of the selector names
# the element that is actually returned. With select_one, only the first matching
# element is returned -- here the first <div /> element with a class of list_text
# and all nested elements within it.



# Create Variables for Title and Summary Text

In [7]:
# Start from slide_elem (the parent element selected with 'div.list_text'),
# then look inside it for the <div /> that holds the content title.

# In other words: "This variable holds a ton of information, so look inside of
# that information to find this specific data." The data we're looking for is
# the content title, which we've specified by saying,
# "The specific data is in a <div /> with a class of 'content_title'."

slide_elem.find('div', class_='content_title')


<div class="content_title">NASA's Mars 2020 Comes Full Circle</div>

In [8]:
# The output from the lookup above gives us the title wrapped in its HTML tag,
# but we only want the text of the title.
# Use the .get_text() method to retrieve only the text.

# Use the parent element to find the first <div /> with a class of
# 'content_title' and save its text as `news_title`
news_title = slide_elem.find('div', class_='content_title').get_text()
news_title

"NASA's Mars 2020 Comes Full Circle"

In [9]:
# NOTE ON PULLING IN THE SUMMARY OF THE ARTICLE
# IT IS OKAY THAT THE CLASS IS IN THE HTML MULTIPLE TIMES
# If we dug deeper into the code selections, we would return a SPECIFIC
# article, but we want to retrieve the FIRST article


## .find( ) vs .find_all ( )

In [10]:
# There are two methods used to find tags and attributes with BeautifulSoup:

# .find() is used when we want only the first tag that matches the class and attribute we've specified.
# .find_all() is used when we want to retrieve all of the tags that match.

# For example, if we were to use .find_all() instead of .find() when pulling the summary,
# we would retrieve all of the summaries on the page instead of just the first one.

In [11]:
# Same as finding the title, except the lookup uses a different class.
# Use the parent element to find the paragraph (summary) text: the first
# <div /> with a class of 'article_teaser_body'.
news_p = slide_elem.find('div', class_='article_teaser_body').get_text()
news_p

"Aiming to pinpoint the Martian vehicle's center of gravity, engineers took NASA's 2,300-pound Mars 2020 rover for a spin in the clean room at JPL. "

# Retrieve Images

In [12]:
# Visit the space images site
url = 'https://spaceimages-mars.com'
browser.visit(url)

In [13]:
# The variable `full_image_elem` holds the element returned by find_by_tag.
# Find the elements with the tag 'button' and take the one at index 1
# (the second match), which is treated here as the Full Image button.
# Find and click the full image button
full_image_elem = browser.find_by_tag('button')[1]
full_image_elem.click()

In [14]:
# Parse the resulting html with soup.
# browser.html is read again here so that img_soup is built from the page
# as it stands after the button click.
html = browser.html
img_soup = soup(html, 'html.parser')

In [15]:
# Find the relative image url for this image. The image will be different each time
# the page is loaded, so we want to retrieve
# whichever image is displayed (relative image url).

# Tell BeautifulSoup to look for an <img /> tag with a class of
# fancybox-image and read its 'src' attribute. Basically we're saying,
# "This is where the image we want lives—use the link that's inside this tag."

img_url_rel = img_soup.find('img', class_='fancybox-image').get('src')
img_url_rel

'image/featured/mars2.jpg'

In [16]:
# The relative url alone is not all that's needed. We also need the 'base url'.
# Base URL = url that directs to the site

# img_url is the text of the base url plus the relative path held in
# img_url_rel.

# Use the base URL to create an absolute URL
img_url = f'https://spaceimages-mars.com/{img_url_rel}'
img_url

'https://spaceimages-mars.com/image/featured/mars2.jpg'

# Collection of facts from a table

In [17]:
# Visit the Mars facts site
url = 'https://galaxyfacts-mars.com/'
browser.visit(url)

In [18]:
#NOTES on RETRIEVING a TABLE
# TAG <table/> is the main container for the table
# TAG <tbody /> is the body of the table (headers, columns, rows)
# TAG <tr /> is the tag for each table ROW
# TAG <td /> is a table data cell: the information in one COLUMN of a row

# Use Pandas' .read_html() function to read the ENTIRE TABLE
# Don't forget to import pandas as pd

![NoteTableElementsHTML.png](attachment:NoteTableElementsHTML.png)

## Create dataframe for table

In [25]:
# pd.read_html('https://...') reads the tables on the site that contains the
# data and returns them as DataFrames; [0] selects the first one.
# df.columns = ['description', 'Mars', 'Earth'] assigns the column names.
# df.set_index('description', inplace=True) makes description the DataFrame index.
# inplace=True means that the updated index will remain in place,
# without having to reassign the DataFrame to a new variable.

df = pd.read_html('https://galaxyfacts-mars.com')[0]
df.columns=['description', 'Mars', 'Earth']
df.set_index('description', inplace=True)
df


Unnamed: 0_level_0,Mars,Earth
description,Unnamed: 1_level_1,Unnamed: 2_level_1
Mars - Earth Comparison,Mars,Earth
Diameter:,"6,779 km","12,742 km"
Mass:,6.39 × 10^23 kg,5.97 × 10^24 kg
Moons:,2,1
Distance from Sun:,"227,943,824 km","149,598,262 km"
Length of Year:,687 Earth days,365.24 days
Temperature:,-87 to -5 °C,-88 to 58°C


In [26]:
# Shut down the automated browser session now that the scraping is finished.
browser.quit()

# Convert to a Python file (.py)

In [None]:
# Jupyter Notebook can be used when developing the web scraping code;
# however, it cannot be used to automate the process.
# Therefore, we must convert the code to a .py file.