In [1]:
###############################################################################
##################### CODE FOR THE BCCP WEB SCRAPING COURSE ###################
############################## JUNE 24 TO 26, 2019 ############################
######################### SECTION ON BROWSER AUTOMATION #######################
###############################################################################

### Machine-specific locations: adjust these three paths before running
# Chrome executable that selenium should drive
browser_app = "C:/Program Files (x86)/Google/Chrome/Application/chrome.exe"
# chromedriver executable belonging to that Chrome installation
browser_driver = (
    "C:/Users/kevin/Dropbox/Coding_Templates/Python/selenium/chromedriver.exe"
)

### Target file for the scraped events (written at the end of the notebook)
savefile = (
    "C:/Users/kevin/Documents/GitHub/web_scraping_course/results/bera_events.csv"
)

###############################################################################
############################## LOAD NEEDED MODULES ############################
###############################################################################

# Show everything in Jupyter notebooks (not just last result)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Load different functions/classes from selenium
# webdriver to start an instance of a webdriver
from selenium import webdriver
# Options to set Chrome options
from selenium.webdriver.chrome.options import Options
# BeautifulSoup to turn source code into navigable Python object
from bs4 import BeautifulSoup
# Pandas to convert to DataFrame
import pandas as pd
# ActionChains to interact with a website through selenium
from selenium.webdriver.common.action_chains import ActionChains
# To use Explicit Waits
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


<h1>Approach</h1>
<ol>
    <li> Load events page
    <li> Loop through elements
        <ol>
            <li> Save date
            <li> Save events details
            <li> Click 'Next' button (If no 'Next' button, exit)
        </ol>
     <li> Turn to DataFrame and save
</ol>

<h2>1. Loading the events page</h2>
<a href="https://www.berlin-econ.de/events">https://www.berlin-econ.de/events</a>

In [2]:
###############################################################################
########################## 1. START THE BROWSER ###############################
###############################################################################
### Configure Chrome
chrome_options = Options()
# Tell selenium which Chrome executable to launch
chrome_options.binary_location = browser_app
### Optional settings (uncomment as needed):
## Fixed window size
# chrome_options.add_argument("--window-size=1200,900")
## Headless mode (no visible browser window)
# chrome_options.add_argument("--headless")

### Launch Chrome through the driver executable
# This should open an empty browser window
driver = webdriver.Chrome(
    executable_path=browser_driver,
    options=chrome_options,
)

In [11]:
### Point the browser at the events page
url = "https://www.berlin-econ.de/events"
driver.get(url)

### Hand the page over to BeautifulSoup
# page_source is the HTML as the browser holds it at this moment
html = driver.page_source
# Parse it with the lxml parser into a navigable soup object
soup = BeautifulSoup(html, "lxml")

<h2>2. Loop through the results elements</h2>

In [12]:
### Walk through the direct children of the results container.
# A child is either a text node (the whitespace between two tags) or one
# of three kinds of <div>:
#   A date:            <div class='event-date-separator'>
#   A list of events:  <div class='ui segments'>
#   The page buttons:  <div class='ui pagination menu'>
results = soup.find("div", class_="event-results")
# Collected events, keyed by a running number
resdict = {}
# Date that applies to the events currently being read. It is set by each
# date separator; starting from None guarantees that a value left over from
# an earlier run of this cell can never end up in the results.
date = None
for el in results.children:
    # Text nodes are subclasses of str in BeautifulSoup and carry no event
    # data, so skip them (explicit check instead of provoking a TypeError)
    if isinstance(el, str):
        continue

    # The class attribute tells us which kind of <div> this is.
    # A tag without class ends up in the final else branch below.
    divclass = " ".join(el.get("class", []))

    ### Date element
    if divclass == "event-date-separator":
        # Take the value in <span class='mobile hidden tablet hidden'>
        date = el.find(
            "span", class_="mobile hidden tablet hidden"
        ).text.strip()
    ### Element containing all events for this date
    elif divclass == "ui segments":
        # Every <div class='ui segment'> is one event
        for event in el.find_all("div", class_="ui segment"):
            # Location
            location = event.find(
                "div", class_="ui red ribbon label"
            ).text.strip()
            # Speaker (some don't have one -> stays None)
            speaker = event.find("div", class_="speaker")
            if speaker is not None:
                speaker = speaker.text.strip()
            # Link and title are both attributes of the same <a> tag
            anchor = event.find("div", class_="content").find("a")
            link = anchor["href"].strip()
            title = anchor["title"].strip()
            # Other details (some don't have one -> stays None)
            desc = event.find("div", class_="description mobile hidden")
            if desc is not None:
                desc = desc.text.strip()
            # Event type is stored in the title attribute of the label
            evtype = event.find(
                "div", class_="ui bottom right attached label"
            )["title"].strip()

            # Save in dict under the next free running number
            resdict[len(resdict)] = {
                "date": date,
                "location": location,
                "speaker": speaker,
                "link": link,
                "title": title,
                "desc": desc,
                "evtype": evtype,
            }
    ### If arrived at buttons, press next
    elif divclass == "ui pagination menu":
        # This now requires selenium.
        # Stop here on purpose the first time and see how to do it; the
        # loop variable `el` is left pointing at the pagination menu.
        raise Exception("Stop here for now.")
    ### If there is any other type, raise an Exception
    else:
        errmsg = "Undefined HTML element: %s" % el
        raise AssertionError(errmsg)

Exception: Stop here for now.

<h3>2.3 Loading the next page</h3>
<p>Let's interrupt the code here and take a look at how to best
    load the next page of events.</p>

In [13]:
### Find which one will be the next page
# Look the pagination menu up explicitly among the children of the results
# container instead of relying on whatever a loop variable of another cell
# was last bound to
pagination = results.find(
    "div", class_="ui pagination menu", recursive=False
)
# None means "there is no next page": either the last page was just loaded
# or the results have no pagination menu at all
next_page = None
if pagination is not None:
    # Take the very last button and check if its text contains "Next".
    # If so, its "data-request-data" attribute names the page to load.
    buttons = pagination.find_all("a", class_="item")
    if buttons and "Next" in buttons[-1].text:
        next_page = buttons[-1]["data-request-data"]
next_page

'page:2'

In [14]:
### Locate the button for the next page in the live browser
# XPath to the link inside the pagination menu whose data-request-data
# attribute equals the page determined before
xpathfind = "//div[@class='ui pagination menu']/" \
    "a[@data-request-data='%s']" % next_page
# find_element(By.XPATH, ...) rather than find_element_by_xpath(): the
# find_element_by_* shortcuts are deprecated and no longer exist in
# Selenium 4, while this form works in old and new versions alike
element = driver.find_element(By.XPATH, xpathfind)
# Start ActionChain to control the browser
actions = ActionChains(driver)

In [15]:
# Scroll into view: queue a mouse move onto `element` on the ActionChains
# object and execute the queued actions with perform()
actions.move_to_element(element).perform()

In [16]:
# Click on the button: queue a click on `element` and execute the queue
# NOTE(review): `actions` is the same ActionChains object as in the scroll
# step, so this perform() presumably replays the queued move before the
# click -- confirm for the Selenium version in use
actions.click(element).perform()

<h4>ActionChains</h4>
I split the scrolling and clicking into two steps for
didactic purposes, but the same could have been achieved
in one line (which also makes the name Action<i>Chain</i>
more intuitive):
    <p style="font-family:monospace">actions.move_to_element(element).click(element).perform()</p>

<h4>Explicit Waits</h4>
Sometimes the content can load too slowly and the script would
continue before it is loaded, which can cause problems.
To be on the safe side, you can include an 
<i>explicit wait</i> to wait for one particular HTML element to load
before continuing.

In [17]:
##### Block until the pagination shows the newly loaded page as active,
# which also confirms that the right page was loaded

### Step 1: XPath of the button we expect to appear
# An <a> tag with class == "item active"
# and data-request-data == next_page
xpath = "//a[@class='item active' and @data-request-data='%s']" % next_page

### Step 2: poll for up to 10 seconds until that button is in the DOM
locator = (By.XPATH, xpath)
button_present = EC.presence_of_element_located(locator)
element = WebDriverWait(driver, 10).until(button_present)

<h3>Creating the loop</h3>
<p>Now we can put all this inside a loop that runs until
there is no 'Next' button left.</p>
<p>Note that after each click, once the new content has loaded,
we need to grab the source code again and convert it into a new soup.</p>
  
<h4>No Explicit Wait</h4>
<p>
    For illustration, let us first run the loop without the explicit wait.
    Sometimes this causes errors, such as the
    StaleElementReferenceException shown below.
</p>

In [18]:
### Reload the events page
url = "https://www.berlin-econ.de/events"
driver.get(url)

# Collected events, keyed by a running number
resdict = {}
# Date that applies to the events currently being read. It is set by each
# date separator and deliberately kept from one page to the next; starting
# from None keeps values of earlier cells out of the results.
date = None
# Keep going page by page until a page offers no 'Next' button
while True:
    ### Parse current state of source code
    # Get source code
    html = driver.page_source
    # Turn source code to soup
    soup = BeautifulSoup(html, "lxml")

    # Page to load after this one. Reset for every page, so that a page
    # without pagination menu ends the loop instead of reusing a value from
    # a previous page or a previous cell.
    next_page = None

    ### Loop through results elements
    # A child of the results container is either a text node (whitespace
    # between tags) or one of three kinds of <div>:
    #   A date:            <div class='event-date-separator'>
    #   A list of events:  <div class='ui segments'>
    #   The page buttons:  <div class='ui pagination menu'>
    results = soup.find("div", class_="event-results")
    for el in results.children:
        # Text nodes are subclasses of str in BeautifulSoup and carry no
        # event data, so skip them
        if isinstance(el, str):
            continue

        # The class attribute tells us which kind of <div> this is.
        # A tag without class ends up in the final else branch below.
        divclass = " ".join(el.get("class", []))

        ### Date element
        if divclass == "event-date-separator":
            # Take the value in <span class='mobile hidden tablet hidden'>
            date = el.find(
                "span", class_="mobile hidden tablet hidden"
            ).text.strip()
        ### Element containing all events for this date
        elif divclass == "ui segments":
            # Every <div class='ui segment'> is one event
            for event in el.find_all("div", class_="ui segment"):
                # Location
                location = event.find(
                    "div", class_="ui red ribbon label"
                ).text.strip()
                # Speaker (some don't have one -> stays None)
                speaker = event.find("div", class_="speaker")
                if speaker is not None:
                    speaker = speaker.text.strip()
                # Link and title are both attributes of the same <a> tag
                anchor = event.find("div", class_="content").find("a")
                link = anchor["href"].strip()
                title = anchor["title"].strip()
                # Other details (some don't have one -> stays None)
                desc = event.find("div", class_="description mobile hidden")
                if desc is not None:
                    desc = desc.text.strip()
                # Event type is stored in the title attribute of the label
                evtype = event.find(
                    "div", class_="ui bottom right attached label"
                )["title"].strip()

                # Save in dict under the next free running number
                resdict[len(resdict)] = {
                    "date": date,
                    "location": location,
                    "speaker": speaker,
                    "link": link,
                    "title": title,
                    "desc": desc,
                    "evtype": evtype,
                }
        ### If arrived at buttons, press next
        elif divclass == "ui pagination menu":
            # Take the very last button and check if its text contains
            # "Next". If it does not, the last page was just loaded and
            # next_page stays None.
            buttons = el.find_all("a", class_="item")
            if buttons and "Next" in buttons[-1].text:
                next_page = buttons[-1]["data-request-data"]

                ### Scroll the button into view and click it
                # Find the element for the next page in selenium via XPath.
                # find_element(By.XPATH, ...) works in old and new Selenium
                # versions; find_element_by_xpath() is gone in Selenium 4.
                xpathfind = "//div[@class='ui pagination menu']/" \
                    "a[@data-request-data='%s']" % next_page
                element = driver.find_element(By.XPATH, xpathfind)
                # Start ActionChain to control the browser
                actions = ActionChains(driver)
                # Scroll into view and click
                actions.move_to_element(element).click(element).perform()
                # NOTE: deliberately no wait here -- this cell illustrates
                # what can happen when the script does not wait for the
                # new content
            # This should be the last non-whitespace element in
            # results.children and the loop should exit here
        ### If there is any other type, raise an Exception
        else:
            errmsg = "Undefined HTML element: %s" % el
            raise AssertionError(errmsg)
    # If next_page is None, there was no Next button and we are done
    if next_page is None:
        break
    # Else, the infinite loop will restart here
    # Else, the infinite loop will restart here

StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=75.0.3770.100)
  (Driver info: chromedriver=74.0.3729.6 (255758eccf3d244491b8a1317aa76e1ce10d57e9-refs/branch-heads/3729@{#29}),platform=Windows NT 10.0.17763 x86_64)


<h4>Including the Explicit Wait</h4>

In [19]:
### Reload the events page
url = "https://www.berlin-econ.de/events"
driver.get(url)

# Collected events, keyed by a running number
resdict = {}
# Date that applies to the events currently being read. It is set by each
# date separator and deliberately kept from one page to the next; starting
# from None keeps values of earlier cells out of the results.
date = None
# Keep going page by page until a page offers no 'Next' button
while True:
    ### Parse current state of source code
    # Get source code
    html = driver.page_source
    # Turn source code to soup
    soup = BeautifulSoup(html, "lxml")

    # Page to load after this one. Reset for every page, so that a page
    # without pagination menu ends the loop instead of reusing a value from
    # a previous page or a previous cell.
    next_page = None

    ### Loop through results elements
    # A child of the results container is either a text node (whitespace
    # between tags) or one of three kinds of <div>:
    #   A date:            <div class='event-date-separator'>
    #   A list of events:  <div class='ui segments'>
    #   The page buttons:  <div class='ui pagination menu'>
    results = soup.find("div", class_="event-results")
    for el in results.children:
        # Text nodes are subclasses of str in BeautifulSoup and carry no
        # event data, so skip them
        if isinstance(el, str):
            continue

        # The class attribute tells us which kind of <div> this is.
        # A tag without class ends up in the final else branch below.
        divclass = " ".join(el.get("class", []))

        ### Date element
        if divclass == "event-date-separator":
            # Take the value in <span class='mobile hidden tablet hidden'>
            date = el.find(
                "span", class_="mobile hidden tablet hidden"
            ).text.strip()
        ### Element containing all events for this date
        elif divclass == "ui segments":
            # Every <div class='ui segment'> is one event
            for event in el.find_all("div", class_="ui segment"):
                # Location
                location = event.find(
                    "div", class_="ui red ribbon label"
                ).text.strip()
                # Speaker (some don't have one -> stays None)
                speaker = event.find("div", class_="speaker")
                if speaker is not None:
                    speaker = speaker.text.strip()
                # Link and title are both attributes of the same <a> tag
                anchor = event.find("div", class_="content").find("a")
                link = anchor["href"].strip()
                title = anchor["title"].strip()
                # Other details (some don't have one -> stays None)
                desc = event.find("div", class_="description mobile hidden")
                if desc is not None:
                    desc = desc.text.strip()
                # Event type is stored in the title attribute of the label
                evtype = event.find(
                    "div", class_="ui bottom right attached label"
                )["title"].strip()

                # Save in dict under the next free running number
                resdict[len(resdict)] = {
                    "date": date,
                    "location": location,
                    "speaker": speaker,
                    "link": link,
                    "title": title,
                    "desc": desc,
                    "evtype": evtype,
                }
        ### If arrived at buttons, press next
        elif divclass == "ui pagination menu":
            # Take the very last button and check if its text contains
            # "Next". If it does not, the last page was just loaded and
            # next_page stays None.
            buttons = el.find_all("a", class_="item")
            if buttons and "Next" in buttons[-1].text:
                next_page = buttons[-1]["data-request-data"]

                ### Scroll the button into view and click it
                # Find the element for the next page in selenium via XPath.
                # find_element(By.XPATH, ...) works in old and new Selenium
                # versions; find_element_by_xpath() is gone in Selenium 4.
                xpathfind = "//div[@class='ui pagination menu']/" \
                    "a[@data-request-data='%s']" % next_page
                element = driver.find_element(By.XPATH, xpathfind)
                # Start ActionChain to control the browser
                actions = ActionChains(driver)
                # Scroll into view and click
                actions.move_to_element(element).click(element).perform()

                ### Wait for the button of the new page to load
                # It should have data-request-data == next_page
                # and class == "item active"
                xpath = "//a[@class='item active' and " \
                    "@data-request-data='%s']" % next_page
                # Poll for up to 10 seconds until it is present in the DOM
                element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, xpath))
                )
            # This should be the last non-whitespace element in
            # results.children and the loop should exit here
        ### If there is any other type, raise an Exception
        else:
            errmsg = "Undefined HTML element: %s" % el
            raise AssertionError(errmsg)
    # If next_page is None, there was no Next button and we are done
    if next_page is None:
        break
    # Else, the infinite loop will restart here

In [20]:
### Close the browser
# quit() shuts down the whole WebDriver session started earlier in this
# notebook; the scraped events are held in resdict, not in the driver
driver.quit()

<h1>3. Turn to DataFrame and save</h1>

In [21]:
### Build a table from the collected events and look at it
# resdict maps a running number to one event record, so the frame built
# from it has one column per event; transposing turns the events into rows
df = pd.DataFrame(resdict).transpose()
# Show the result
df


Unnamed: 0,date,desc,evtype,link,location,speaker,title
0,"Wednesday, 26. June 2019",Germany’s persistent current account surplus r...,DIW Seminar on Macroeconomics and Financial Ma...,https://www.berlin-econ.de/event/growing-like-...,Joan Robinson Room,"Mathias Hoffmann, University of Zurich","Growing Like Germany: Local Public Debt, Local..."
1,"Wednesday, 26. June 2019",,Brown Bag Seminar HU Berlin,https://www.berlin-econ.de/event/to-be-announc...,HU Berlin,"Adrian Ochs, BDPEMS",To be announced
2,"Wednesday, 26. June 2019",,Mathematical Statistics Seminar,https://www.berlin-econ.de/event/to-be-announc...,WIAS Berlin,"Alexandra Suvorikova, Universität Potsdam",To be announced
3,"Wednesday, 26. June 2019",,Berliner Forschungskolloquium Wirtschafts- und...,https://www.berlin-econ.de/event/walking-the-l...,HU Berlin,"Benjamin Zenner, University of Luxembourg",Walking the Line between Supervision and Promo...
4,"Thursday, 27. June 2019",,Finance-Accounting Research Seminar,https://www.berlin-econ.de/event/to-be-announc...,HU Berlin,"Russel Wermers, University of Maryland",To be announced
5,"Thursday, 27. June 2019",,FU Research Seminar in Economics,https://www.berlin-econ.de/event/to-be-announc...,FU Berlin,"Evgeny Yakovlev, New Economic School Moscow",To be announced
6,"Thursday, 27. June 2019",,Berlin Behavioral Economics Colloquium and Sem...,https://www.berlin-econ.de/event/to-be-announc...,WZB Berlin,"Felix Holzmeister, University of Innsbruck",Delegated Decision Making in Finance
7,"Friday, 28. June 2019",(joint with André Stenzel and Peter Schmidt)\n...,Brown Bag Seminar Cluster Industrial Economics,https://www.berlin-econ.de/event/modeling-spat...,Anna J. Schwartz Room,,Consumer Rating Dynamics
8,"Monday, 01. July 2019",Fully booked.,Graduate Center Short Course,https://www.berlin-econ.de/event/macroeconomic...,Anna J. Schwartz Room,"Per Krusell and Kurt Mitman, Stockholm University",Macroeconomics with Heterogenous Agents
9,"Monday, 01. July 2019",,Berlin Applied Micro Seminar (BAMS),https://www.berlin-econ.de/event/to-be-announc...,HU Berlin,"Bettina Siflinger, Tilburg University",To be announced


In [22]:
### Write the events table to the file configured at the top
# Semicolon as field separator; the "utf-8-sig" encoding writes UTF-8
# with a leading byte order mark
df.to_csv(
    savefile,
    encoding="utf-8-sig",
    sep=";",
)

<h1>4. Playing around with some more selenium actions</h1>

In [33]:
###############################################################################
################ START A NEW BROWSER AND LOAD THE EVENTS PAGE #################
###############################################################################
### Configure Chrome
chrome_options = Options()
# Tell selenium which Chrome executable to launch
chrome_options.binary_location = browser_app
### Optional settings (uncomment as needed):
## Fixed window size
# chrome_options.add_argument("--window-size=1200,900")
## Headless mode (no visible browser window)
# chrome_options.add_argument("--headless")

### Launch Chrome through the driver executable
# This should open an empty browser window
driver = webdriver.Chrome(
    executable_path=browser_driver,
    options=chrome_options,
)

### Point the new browser at the events page
url = "https://www.berlin-econ.de/events"
driver.get(url)

In [34]:
# To send special keys
from selenium.webdriver.common.keys import Keys

### Search the events for "theory"
# Find the search input element.
# find_element(By.XPATH, ...) rather than find_element_by_xpath(): the
# find_element_by_* shortcuts are deprecated and no longer exist in
# Selenium 4, while this form works in old and new versions alike
xpathfind = "//div[@class='ui icon input']/input"
element = driver.find_element(By.XPATH, xpathfind)
# Start ActionChain to control the browser
actions = ActionChains(driver)
# Type the search term into the input field, then send the Return key
actions.send_keys_to_element(element, "theory").send_keys(Keys.RETURN) \
    .perform()

In [35]:
# Close the browser: quit() shuts down the WebDriver session started above
driver.quit()