In [1]:
# Import Splinter and Beautiful Soup.
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Set up Splinter.
executable_path = {"executable_path" : ChromeDriverManager().install()}
browser = Browser("chrome", **executable_path, headless = False)

[WDM] - Downloading: 100%|████████████████████████████████████████████████████████| 6.46M/6.46M [00:00<00:00, 12.6MB/s]


In [3]:
# Visit the Mars News site for Scraping.
url = "https://redplanetscience.com"
browser.visit(url)

# Optional delay for loading the page.
browser.is_element_present_by_css("div.list_text", wait_time = 1)

True

In [4]:
# Parse the HTML.
html = browser.html
news_soup = soup(html, "html.parser")

In [5]:
# Scrape the articles on the landing page.
articles = news_soup.find_all("div", class_ = "list_text")

In [9]:
# Set up the article dictionary and initialize the list of articles.
mars_article = {"title" : "", "preview" : ""}
article_list = []

# Put the article titles and preview texts together into a dictionary.
for article in articles:
    # Get the article title.
    title = article.find("div", class_ = "content_title").text
    mars_article["title"] = title
    
    # Get the article preview.
    preview = article.find("div", class_ = "article_teaser_body").text
    mars_article["preview"] = preview
    
    # Add this dictionary into the list
    article_list.append(mars_article)

In [10]:
# Print the results.
for x in article_list:
    print(x)

{'title': "NASA's MAVEN Maps Winds in the Martian Upper Atmosphere that Mirror the Terrain Below and Gives Clues to Martian Climate", 'preview': 'Researchers have created the first map of wind circulation in the upper atmosphere of a planet besides Earth, using data from NASA’s MAVEN spacecraft that were collected during the last two years.'}
{'title': "NASA's MAVEN Maps Winds in the Martian Upper Atmosphere that Mirror the Terrain Below and Gives Clues to Martian Climate", 'preview': 'Researchers have created the first map of wind circulation in the upper atmosphere of a planet besides Earth, using data from NASA’s MAVEN spacecraft that were collected during the last two years.'}
{'title': "NASA's MAVEN Maps Winds in the Martian Upper Atmosphere that Mirror the Terrain Below and Gives Clues to Martian Climate", 'preview': 'Researchers have created the first map of wind circulation in the upper atmosphere of a planet besides Earth, using data from NASA’s MAVEN spacecraft that were coll

In [11]:
# Close the Splinter session.
browser.quit()

In [None]:
# Export the list of dictionaries into a JSON file.
import json

jsonString = json.dumps(article_list)
jsonFile = open("article_list.json", "w")
jsonFile.write(jsonString)
jsonFile.close()

The list of dictionaries can be imported into a Mongo database collection using two different methods. The first method uses the Mongo CLI to import the recently created JSON file into the Mongo database collection.

1. Start Mongo by running `mongod` for Windows, or `brew services start mongodb/brew/mongodb-community` for Mac. (This will need to be done regardless of the method used.)
2. In the terminal, use `cd` to navigate to the resources folder that contains the file named `article_list.json`.
3. Import this file to a Mongo database using this command:

`mongoimport --type json -d mars_news -c article_list --drop --jsonArray article_list.json`

This command tells Mongo that it needs to:

    * import a json file (`--type json`)
    * to a database called \"mars_news\" (`-d mars_news`)
    * in a collection called \"article_list\" (`-c article_list`)
    * treat the input source as a json array (`--array`)
    * removing the existing \"article_list\" collection (`--drop`), if it exists, before adding the new documents from the json file.

The other method inserts the list of dictionaries into the Mongo database collection using the script in the next cells below.

In [None]:
# Create an instance of MongoClient, using the port number 27017.
from pymongo import MongoClient

mongo = MongoClient(port = 27017)

In [None]:
# Set up a database named "mars_news."
db = mongo["mars_news"]

# Set up a collection named "article_list."
collect = db["article_list"]

# Insert the list of dictionaries.
collect.insert_many(article_list)

Regardless of the method used, there should now be a Mongo database named `mars_news` with a collection named `article_list`.

In [None]:
# Verify existence of the database.
print(mongo.list_database_names())

In [None]:
# Verify existence of the collection.
db = mongo["mars_news"]

print(db.list_collection_names())

In [None]:
# Verify that all documents are accounted for.
collect = db["article_list"]

for result in collect.find():
    print(result)

OPTIONAL: When finished, clean up everything.

In [None]:
# Delete the collection.
db.drop_collection("article_list")
db.list_collection_names()

In [None]:
# Delete the database.
mongo.drop_database(db)
mongo.list_database_names()