In [1]:
###############################################################################
##################### CODE FOR THE BCCP WEB SCRAPING COURSE ###################
############################## JUNE 24 TO 26, 2019 ############################
############################ SECTION ON HTML PARSING ##########################
###############################################################################

### Path of the CSV results file
# NOTE: this is an absolute, machine-specific path -- adjust it before
# running the notebook on another computer.
savefile = (
    "C:/Users/kevin/Documents/GitHub/web_scraping_course/results/bccp_events.csv"
)

###############################################################################
############################## LOAD NEEDED MODULES ############################
###############################################################################

# Show everything in Jupyter notebooks (not just last result)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# requests to load URLs
import requests
# BeautifulSoup to turn source code into navigable Python object
from bs4 import BeautifulSoup
# Pandas to convert to DataFrame
import pandas as pd


<h1>Approach</h1>
<ol>
    <li>Load page with list of events
    <li>Find individual events in source code
    <li>Loop through events and save details available
    <li>Turn to DataFrame
    <li>Loop through events, load detailed event pages, and save additional information
</ol>

<h2>1. Load page with list of events</h2>
Basic steps:
<ol>
    <li>Load page and get the source code
    <li>Parse the source code to get a soup object
</ol>

<h3>1.1 Getting the source code</h3>

In [2]:

# URL to BCCP events page
url = "http://www.bccp-berlin.de/events/all-events"
# Load URL
# The timeout (in seconds) makes the call raise an exception instead of
# blocking the notebook forever if the server never answers.
r = requests.get(url, timeout = 30)


In [3]:
# Can check if successful (Code 200 means it worked without errors)
r

<Response [200]>

In [4]:
# Get source code: r.text is the body of the response as a string
srccode = r.text
# This is now a string containing the entire source code of the page
# (the whole string is displayed below):
srccode

'<!DOCTYPE html>\n<!--[if class="no-js">  <html class="ie ie6 lte9 lte8 lte7 no-js" lang="de"> <![endif]-->\n<!--[if IE 7]>     <html class="ie ie7 lte9 lte8 lte7 no-js" lang="de"> <![endif]-->\n<!--[if IE 8]>     <html class="ie ie8 lte9 lte8 no-js" lang="de"> <![endif]-->\n<!--[if IE 9]>     <html class="ie ie9 lte9 no-js" lang="de"> <![endif]-->\n<!--[if gt IE 9]>  <html class="no-js" lang="de"> <![endif]-->\n<!--[if !IE]><!--> <html class="no-js" lang="de"><!--<![endif]-->\n<head>\n\n<meta charset="utf-8" />\n<!-- \n\tCPS-IT GmbH http://www.cps-it.de/\n\n\tThis website is powered by TYPO3 - inspiring people to share!\n\tTYPO3 is a free open source Content Management Framework initially created by Kasper Skaarhoj and licensed under GNU/GPL.\n\tTYPO3 is copyright 1998-2016 of Kasper Skaarhoj. Extensions are copyright of their respective owners.\n\tInformation and contribution at http://typo3.org/\n-->\n\n<base href="http://www.bccp-berlin.de/" />\n<link rel="shortcut icon" href="/Tem

<h3>1.2 Parsing the source code to get soup</h3>

In [5]:
# Parse the raw source code with BeautifulSoup 4 (using the "lxml" parser)
# to obtain an object that can be searched and navigated
soup = BeautifulSoup(srccode, features = "lxml")
# Display the parsed document:
soup

<!DOCTYPE html>
<!--[if class="no-js">  <html class="ie ie6 lte9 lte8 lte7 no-js" lang="de"> <![endif]--><!--[if IE 7]>     <html class="ie ie7 lte9 lte8 lte7 no-js" lang="de"> <![endif]--><!--[if IE 8]>     <html class="ie ie8 lte9 lte8 no-js" lang="de"> <![endif]--><!--[if IE 9]>     <html class="ie ie9 lte9 no-js" lang="de"> <![endif]--><!--[if gt IE 9]>  <html class="no-js" lang="de"> <![endif]--><!--[if !IE]><!--><html class="no-js" lang="de"><!--<![endif]-->
<head>
<meta charset="utf-8"/>
<!-- 
	CPS-IT GmbH http://www.cps-it.de/

	This website is powered by TYPO3 - inspiring people to share!
	TYPO3 is a free open source Content Management Framework initially created by Kasper Skaarhoj and licensed under GNU/GPL.
	TYPO3 is copyright 1998-2016 of Kasper Skaarhoj. Extensions are copyright of their respective owners.
	Information and contribution at http://typo3.org/
-->
<base href="http://www.bccp-berlin.de/"/>
<link href="/Templates/Master/Resources/Public/Images/favicon.ico" rel="

In [6]:
# Pretty print
print(soup.prettify())

<!DOCTYPE html>
<!--[if class="no-js">  <html class="ie ie6 lte9 lte8 lte7 no-js" lang="de"> <![endif]-->
<!--[if IE 7]>     <html class="ie ie7 lte9 lte8 lte7 no-js" lang="de"> <![endif]-->
<!--[if IE 8]>     <html class="ie ie8 lte9 lte8 no-js" lang="de"> <![endif]-->
<!--[if IE 9]>     <html class="ie ie9 lte9 no-js" lang="de"> <![endif]-->
<!--[if gt IE 9]>  <html class="no-js" lang="de"> <![endif]-->
<!--[if !IE]><!-->
<html class="no-js" lang="de">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <!-- 
	CPS-IT GmbH http://www.cps-it.de/

	This website is powered by TYPO3 - inspiring people to share!
	TYPO3 is a free open source Content Management Framework initially created by Kasper Skaarhoj and licensed under GNU/GPL.
	TYPO3 is copyright 1998-2016 of Kasper Skaarhoj. Extensions are copyright of their respective owners.
	Information and contribution at http://typo3.org/
-->
  <base href="http://www.bccp-berlin.de/"/>
  <link href="/Templates/Master/Resources/Public/Images/f

<h2>2. Search/navigate the soup to find individual events</h2>

In [7]:
# Collect all "div" tags whose "class" attribute contains "event-list-item".
# The filter is given here as a dictionary of attributes.
divs = soup.find_all("div", attrs = {"class": "event-list-item"})
# Note: normally an attribute can simply be passed as a keyword argument,
# e.g. if it were "id": soup.find_all("div", id = "event-list-item").
# "class", however, is a reserved word in Python, so the keyword version
# has to be written with a trailing underscore:
# divs = soup.find_all("div", class_ = "event-list-item")

# The result behaves like a list holding every matching div element.
# Number of events found:
len(divs)

20

<h3>2.1 Let's look at an example first</h3>

In [8]:
# Take a look at the first element in the list
# (`div` serves as the running example in the cells of this section)
div = divs[0]
div

<div class="event-list-item event-type2">
<div class="top-bar">
<span class="date range" title="">June 24, 2019 - June 26, 2019</span>
<span class="b-events__item__type">Conference &amp; Events</span>
</div>
<div class="b-events__item__inner">
<div class="content">
<div class="genres">
						
							Other events
						
					</div>
<h2 class="eventHeader">
<a href="/events/all-events/events-detail/julian-harke-and-kevin-tran/">
					DIW Graduate Center/BCCP Short Course on Web Scraping
				</a>
</h2>
<div class="teaser"></div>
<div class="date">
<strong class="label">Date</strong>
<span>June 25, 2019</span>
</div>
<div class="location">
<strong class="label">Location</strong>
<div class="address">
<span class="name">DIW</span>
<span class="address">Mohrenstr. 58, Room Anna J. Schwartz</span>
<span class="zip">10117</span>
<span class="place">Berlin</span>
</div>
</div>
<div class="time">
<strong class="label">Time</strong>
<span>09:30–12:30</span>
</div>
<div class="date">
<strong class

In [9]:
# Dictionary that collects the details of the example event
divdict = dict()
# Date:
date = div.find("span", class_ = "date")
# Note: .find() returns only the first element that matches the search.
# When it is not certain that exactly one element matches, it is safer to
# collect all matches first ...
date_matches = div.find_all("span", class_ = "date")
# ... and to verify that there is exactly one of them. Otherwise the
# assertion stops the cell with an AssertionError.
n_matches = len(date_matches)
assert n_matches == 1, "%d results for //span[@class='date']" % n_matches
# Exactly one match: take it out of the result list and keep its text
# without surrounding whitespace
date = date_matches[0].text.strip()

# Store in the dictionary
divdict["date"] = date

# Show the extracted date:
date

'June 24, 2019 - June 26, 2019'

In [10]:
# Event type: text of the "span" with class "b-events__item__type"
type_tag = div.find("span", class_ = "b-events__item__type")
evtype = type_tag.get_text().strip()
divdict["event_type"] = evtype
evtype

'Conference & Events'

In [12]:
# Seminar series: text of the "div" with class "genres"
# (strip() removes the surrounding tabs and line breaks)
genres_tag = div.find("div", class_ = "genres")
series = genres_tag.get_text().strip()
divdict["event_series"] = series
series

'Other events'

In [13]:
# The header element holds both the link to the detail page and the title
header = div.find("h2", attrs = {"class": "eventHeader"})
header

<h2 class="eventHeader">
<a href="/events/all-events/events-detail/julian-harke-and-kevin-tran/">
					DIW Graduate Center/BCCP Short Course on Web Scraping
				</a>
</h2>

In [14]:
# Get URL
# The link target is not part of the tag's text; it is stored as the value
# of the "href" attribute of the "a" tag
link_tag = header.find("a")
url = link_tag["href"]
url
# The href is relative, so prepend the base URL of the site
url = "".join(["http://www.bccp-berlin.de", url])
url

'/events/all-events/events-detail/julian-harke-and-kevin-tran/'

'http://www.bccp-berlin.de/events/all-events/events-detail/julian-harke-and-kevin-tran/'

In [15]:
# Title: text of the header without surrounding whitespace
title = header.get_text().strip()
title
# Store URL and title (in this order)
divdict.update({"url": url, "title": title})

'DIW Graduate Center/BCCP Short Course on Web Scraping'

In [48]:
# Topic: text of the "div" with class "teaser" (may be empty)
teaser_tag = div.find("div", class_ = "teaser")
topic = teaser_tag.get_text().strip()
topic
divdict["topic"] = topic

''

In [18]:
# Element that groups the address parts of the event location
addtag = div.find("div", attrs = {"class": "address"})
addtag

<div class="address">
<span class="name">DIW</span>
<span class="address">Mohrenstr. 58, Room Anna J. Schwartz</span>
<span class="zip">10117</span>
<span class="place">Berlin</span>
</div>

In [21]:
# Store every "span" inside the address element
for span in addtag.find_all("span"):
    # Variable name: prefix plus the (first) class of the span
    varname = "loc_%s" % span["class"][0]
    # Value: text content of the span
    val = span.get_text()
    divdict[varname] = val
divdict
# Note: building the variable names from the class names automates the
# creation of variables and keeps the code flexible (e.g. when events
# carry different address elements).
# The price of this flexibility is that errors are more easily missed
# (e.g. if a listing was not loaded correctly, the loop may simply create
# no variable and carry on).

{'date': 'June 24, 2019 - June 26, 2019',
 'event_type': 'Conference & Events',
 'event_series': 'Other events',
 'url': 'http://www.bccp-berlin.de/events/all-events/events-detail/julian-harke-and-kevin-tran/',
 'title': 'DIW Graduate Center/BCCP Short Course on Web Scraping',
 'loc_name': 'DIW',
 'loc_address': 'Mohrenstr. 58, Room Anna J. Schwartz',
 'loc_zip': '10117',
 'loc_place': 'Berlin'}

In [22]:
# Time: text of the "span" inside the "div" with class "time"
time_box = div.find("div", class_ = "time")
time = time_box.find("span").get_text().strip()
time
divdict["time"] = time

'09:30–12:30'

In [23]:
# Look at divdict
divdict

{'date': 'June 24, 2019 - June 26, 2019',
 'event_type': 'Conference & Events',
 'event_series': 'Other events',
 'url': 'http://www.bccp-berlin.de/events/all-events/events-detail/julian-harke-and-kevin-tran/',
 'title': 'DIW Graduate Center/BCCP Short Course on Web Scraping',
 'loc_name': 'DIW',
 'loc_address': 'Mohrenstr. 58, Room Anna J. Schwartz',
 'loc_zip': '10117',
 'loc_place': 'Berlin',
 'time': '09:30–12:30'}

<h2>3. Loop through events and save details</h2>
We can combine the previous code snippets and place them inside a loop that runs over all the
events we found.

In [24]:

# Now bundle the steps from the example into a function and apply it to
# every event found on the overview page

def parse_event_listing(div):
    """Extract the details of one event from its entry on the overview page.

    Parameters
    ----------
    div : bs4.element.Tag
        One "div" element with class "event-list-item".

    Returns
    -------
    dict
        Entries "date", "event_type", "event_series", "url", "title",
        "topic", one "loc_<class>" entry per "span" of the address element,
        and "time".

    Raises
    ------
    AssertionError
        If the listing does not contain exactly one "span" of class "date".
    AttributeError, TypeError
        If one of the other expected elements is missing (.find() then
        returns None).
    """
    details = {}

    # Date: collect all matches and make sure the result is unique
    date_tags = div.find_all("span", class_ = "date")
    assert len(date_tags) == 1, "%d results for //span[@class='date']" \
        % len(date_tags)
    # Exactly one result: take it out of the list returned by .find_all()
    details["date"] = date_tags[0].text.strip()

    # Event type
    details["event_type"] = \
        div.find("span", class_ = "b-events__item__type").text.strip()

    # Seminar series
    details["event_series"] = div.find("div", class_ = "genres").text.strip()

    # URL and title are both taken from the header element
    header = div.find("h2", class_ = "eventHeader")
    # The URL is not saved as content but as the value of the "href"
    # attribute; it is relative, so the base URL is added in front
    details["url"] = "http://www.bccp-berlin.de" + header.find("a")["href"]
    details["title"] = header.text.strip()

    # Topic
    details["topic"] = div.find("div", class_ = "teaser").text.strip()

    # Address: one entry per sub "span" element, named after its class
    addtag = div.find("div", class_ = "address")
    for span in addtag.find_all("span"):
        details["loc_" + span["class"][0]] = span.text

    # Time
    details["time"] = div.find("div", class_ = "time").find("span").text.strip()

    return details


# Loop through events and save details in dictionary
resdict = {}
for div in divs:
    # Save as new entry in resdict (keys are the running numbers 0, 1, 2, ...)
    resdict[len(resdict)] = parse_event_listing(div)

# Look at resdict
resdict

{0: {'date': 'June 24, 2019 - June 26, 2019',
  'event_type': 'Conference & Events',
  'event_series': 'Other events',
  'url': 'http://www.bccp-berlin.de/events/all-events/events-detail/julian-harke-and-kevin-tran/',
  'title': 'DIW Graduate Center/BCCP Short Course on Web Scraping',
  'topic': '',
  'loc_name': 'DIW',
  'loc_address': 'Mohrenstr. 58, Room Anna J. Schwartz',
  'loc_zip': '10117',
  'loc_place': 'Berlin',
  'time': '09:30–12:30'},
 1: {'date': 'June 27, 2019',
  'event_type': 'Seminar',
  'event_series': 'Berlin Behavioral Economics Seminar',
  'url': 'http://www.bccp-berlin.de/events/all-events/events-detail/felix-holzmeister-university-of-innsbruck/',
  'title': 'Felix Holzmeister (University of Innsbruck)',
  'topic': 'Delegated decision making in finance',
  'loc_name': 'WZB',
  'loc_address': 'Reichpietschufer 50, Room B001',
  'loc_zip': '10785',
  'loc_place': 'Berlin',
  'time': '16:45–18:00'},
 2: {'date': 'July 01, 2019',
  'event_type': 'Seminar',
  'event_s

<h2>4. Convert the dictionary to a DataFrame</h2>

In [25]:
# Build a DataFrame from the nested dictionary. The outer keys (one per
# event) become columns first, so the table is transposed afterwards to
# get one row per event.
df = pd.DataFrame(resdict).transpose()
# Display the result
df

Unnamed: 0,date,event_series,event_type,loc_address,loc_name,loc_place,loc_zip,time,title,topic,url
0,"June 24, 2019 - June 26, 2019",Other events,Conference & Events,"Mohrenstr. 58, Room Anna J. Schwartz",DIW,Berlin,10117.0,09:30–12:30,DIW Graduate Center/BCCP Short Course on Web S...,,http://www.bccp-berlin.de/events/all-events/ev...
1,"June 27, 2019",Berlin Behavioral Economics Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,16:45–18:00,Felix Holzmeister (University of Innsbruck),Delegated decision making in finance,http://www.bccp-berlin.de/events/all-events/ev...
2,"July 01, 2019",Berlin Applied Micro Seminar,Seminar,"Spandauer Str. 1, Room 22",Humboldt-Universität zu Berlin,Berlin,10178.0,16:00–17:15,Bettina Siflinger (Tilburg University),TBA,http://www.bccp-berlin.de/events/all-events/ev...
3,"July 01, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Ariel Rubinstein (Tel Aviv University),Normative Equilibrium:The permissible and the ...,http://www.bccp-berlin.de/events/all-events/ev...
4,"July 08, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Antonio Rosato (UT Sydney),Projection of Private Values in Auctions,http://www.bccp-berlin.de/events/all-events/ev...
5,"September 20, 2019",Berlin IO Day,Conference & Events,Straße des 17. Juni 135,Technische Universität Berlin,,,09:00–18:00,12th Berlin IO Day,,http://www.bccp-berlin.de/events/all-events/ev...
6,"October 21, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Jörgen Weibull (Stockholm School of Economics),TBA,http://www.bccp-berlin.de/events/all-events/ev...
7,"October 24, 2019",Berlin Behavioral Economics Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,16:45–18:00,Tom Chang (University of Southern California),TBA,http://www.bccp-berlin.de/events/all-events/ev...
8,"October 28, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B 002/003",WZB,Berlin,10785.0,17:15–18:30,Jidong Zhou (Yale),TBA,http://www.bccp-berlin.de/events/all-events/ev...
9,"October 31, 2019",Berlin Behavioral Economics Seminar,Seminar,"Ernst-Reuter-Platz 1, Room BH-N 243",Technische Universität Berlin,Berlin,10587.0,16:45–18:00,Tim Cason (Purdue University),TBA,http://www.bccp-berlin.de/events/all-events/ev...


<h2>5. Loop over the events, load the event detail pages, and save additional info</h2>
We saved the URLs to the detailed event pages in the "url" variable.
Now loop through the listings and save all additional details
we can find on the details page.

<h3>5.1 Let's look at an example first</h3>

In [26]:
# Load url and turn to soup
url = df["url"].values[0]

# The timeout (in seconds) keeps a hanging server from blocking the notebook
r = requests.get(url, timeout = 30)
# Stop with an HTTPError on 4xx/5xx responses instead of parsing an error page
r.raise_for_status()
soup = BeautifulSoup(r.text, "lxml")

In [27]:
soup

<!DOCTYPE html>
<!--[if class="no-js">  <html class="ie ie6 lte9 lte8 lte7 no-js" lang="de"> <![endif]--><!--[if IE 7]>     <html class="ie ie7 lte9 lte8 lte7 no-js" lang="de"> <![endif]--><!--[if IE 8]>     <html class="ie ie8 lte9 lte8 no-js" lang="de"> <![endif]--><!--[if IE 9]>     <html class="ie ie9 lte9 no-js" lang="de"> <![endif]--><!--[if gt IE 9]>  <html class="no-js" lang="de"> <![endif]--><!--[if !IE]><!--><html class="no-js" lang="de"><!--<![endif]-->
<head>
<meta charset="utf-8"/>
<!-- 
	CPS-IT GmbH http://www.cps-it.de/

	This website is powered by TYPO3 - inspiring people to share!
	TYPO3 is a free open source Content Management Framework initially created by Kasper Skaarhoj and licensed under GNU/GPL.
	TYPO3 is copyright 1998-2016 of Kasper Skaarhoj. Extensions are copyright of their respective owners.
	Information and contribution at http://typo3.org/
-->
<base href="http://www.bccp-berlin.de/"/>
<link href="/Templates/Master/Resources/Public/Images/favicon.ico" rel="

In [28]:
# Locate the info box of the detail page
# Its contents are read in a flexible way below, so that various types of
# fields are captured
infobox = soup.find("div", attrs = {"class": "info-box"})

In [30]:
# Direct children of the info box (tags and the strings between them)
list(infobox.children)

['\n', <div class="b-events__multi-performance">
 <strong class="label">Dates</strong>
 <span class="date range" title="">June 24, 2019 - June 26, 2019</span>
 </div>, '\n', <div class="b-events__multi-performance__label">
 <strong class="label">Date Details</strong>
 </div>, '\n', <div class="performance__details">
 <div class="date">
 			
 			June 25, 2019
 			<span class="performance__details--trigger">Open Details</span>
 </div>
 <div class="performance__content">
 <div class="time">
 <strong class="label">Time</strong>
 <span>09:30–12:30</span>
 </div>
 <div class="location">
 <strong class="label">Location</strong>
 <div class="address">
 <span class="name">DIW</span>
 <span class="address">Mohrenstr. 58, Room Anna J. Schwartz</span>
 <span class="zip">10117</span>
 <span class="place">Berlin</span>
 </div>
 </div>
 </div>
 </div>, '\n', <div class="performance__details">
 <div class="date">
 			
 			June 26, 2019
 			<span class="performance__details--trigger">Open Details</span

In [31]:
# Save in dictionary
event_dict = {}
for child in infobox.children:
    ##### Skip text nodes
    # .children yields tags as well as the strings between them (here:
    # whitespace such as line breaks). The strings of bs4 are subclasses
    # of str, so a type check separates them from tags explicitly.
    if isinstance(child, str):
        continue

    # From here on, child is a tag. Its first class serves as label; tags
    # without a "class" attribute fall back to their tag name (indexing
    # child["class"] directly would raise a KeyError for them).
    classes = child.get("class") or [child.name]
    label = classes[0]

    ### Special case for the "location" element: Take the children of it
    if label == "location":
        # Take all span elements with address info
        spans = child.find("div", class_ = "address").find_all("span")
        # Loop and save
        for span in spans:
            # Create variable name using class name
            varname = "address_" + span["class"][0]
            # Get value
            # Note: This removes multiple whitespaces and replaces them
            # by a single space
            value = " ".join(span.text.strip().split())
            # Save
            event_dict[varname] = value
    # Elements called "label" carry no information of their own: skip them
    elif label == "label":
        continue
    # Else, take the class as variable name and take the contents
    # as values
    # NOTE: children that share a class (e.g. several "performance__details"
    # blocks of a multi-day event) overwrite each other, so only the last
    # one is kept.
    else:
        varname = label
        value = " ".join(child.text.strip().split())
        event_dict[varname] = value
# Look at the event_dict
event_dict

{'b-events__multi-performance': 'Dates June 24, 2019 - June 26, 2019',
 'b-events__multi-performance__label': 'Date Details',
 'performance__details': 'June 24, 2019 Open Details Time 09:30–12:30 Location DIW Mohrenstr. 58, Room Anna J. Schwartz 10117 Berlin'}

In [32]:
# Next, take contents of main page
cont = soup.find("div", class_ = "content") \
    .find("div", class_ = "description")
# Loop through children
for child in cont.children:
    ##### Skip text nodes
    # .children yields tags as well as the strings between them (here:
    # whitespace such as line breaks). The strings of bs4 are subclasses
    # of str, so a type check separates them from tags explicitly.
    if isinstance(child, str):
        continue

    # From here on, child is a tag. Its first class serves as label; tags
    # without a "class" attribute fall back to their tag name (indexing
    # child["class"] directly would raise a KeyError for them).
    classes = child.get("class") or [child.name]
    label = classes[0]

    # Elements called "label" carry no information of their own: skip them
    if label == "label":
        continue

    # Take the class as variable name and the contents as value
    varname = label
    value = " ".join(child.text.strip().split())
    event_dict[varname] = value

    # Check if the element contains links, if so save them
    for link in child.find_all("a"):
        # "a" tags without an "href" attribute have no URL to save
        linkurl = link.get("href")
        if linkurl is None:
            continue
        # Variable name: "link_" plus the link text in lowercase, joined
        # by underscores (same naming as in the loop over all events).
        # A separate name is used for the link target so that the
        # variable `url` of the event page is not overwritten.
        varname = "link_" + "_".join(link.text.strip().lower().split())
        # Save
        event_dict[varname] = linkurl

In [33]:

# Look at the event_dict
event_dict

{'b-events__multi-performance': 'Dates June 24, 2019 - June 26, 2019',
 'b-events__multi-performance__label': 'Date Details',
 'performance__details': 'June 24, 2019 Open Details Time 09:30–12:30 Location DIW Mohrenstr. 58, Room Anna J. Schwartz 10117 Berlin',
 'headline--desktop': 'DIW Graduate Center/BCCP Short Course on Web Scraping',
 'description__bodytext': 'This short course by BCCP Doctoral Students Julian Harke and Kevin Tran is meant to give an overview of the most common web scraping techniques. The idea is to have an interactive course in which the participants get their hands on actual code and work with it.You can find the course syllabus here.Time and place:June 24, 2019, 09:30 - 12:30, DIW, Anna-Schwartz-Room 5th floor no. 5.2.010June 25, 2019, 09:30 - 12:30, DIW, Anna-Schwartz-Room 5th floor no. 5.2.010June 26, 2019, 14:00 - 17:00, DIW, Anna-Schwartz-Room 5th floor no. 5.2.010To ensure an effective course, the number of participants will be limited. If you are interest

<h3>5.2 Now loop over all events</h3>
As before, we can combine the previous code snippets to loop over all event pages and save the details.

In [34]:
### Now do this for all URLs

def parse_infobox(infobox):
    """Read the fields of the info box of an event detail page.

    Parameters
    ----------
    infobox : bs4.element.Tag
        The "div" with class "info-box".

    Returns
    -------
    dict
        One "address_<class>" entry per address "span" of a "location"
        child, and one entry per other child tag, named after the first
        class of the tag (or after the tag name if it has no class).
        Values are the texts with runs of whitespace reduced to one space.
    """
    details = {}
    for child in infobox.children:
        # .children also yields the strings between the tags (whitespace
        # such as line breaks); bs4 strings are subclasses of str
        if isinstance(child, str):
            continue
        # First class of the tag; fall back to the tag name if the tag has
        # no "class" attribute (child["class"] would raise a KeyError)
        label = (child.get("class") or [child.name])[0]

        ### Special case for the "location" element: Take the children of it
        if label == "location":
            # Take all span elements with address info
            spans = child.find("div", class_ = "address").find_all("span")
            for span in spans:
                # Create variable name using class name
                varname = "address_" + span["class"][0]
                # Replace multiple whitespaces by a single space
                details[varname] = " ".join(span.text.strip().split())
        # Elements called "label" carry no information of their own
        elif label == "label":
            continue
        # Else, take the class as variable name and the contents as value
        # NOTE: children that share a class overwrite each other, so only
        # the last one is kept.
        else:
            details[label] = " ".join(child.text.strip().split())
    return details


def parse_description(cont):
    """Read the main content ("description") of an event detail page.

    Parameters
    ----------
    cont : bs4.element.Tag
        The "div" with class "description".

    Returns
    -------
    dict
        One entry per child tag (named after its first class, or its tag
        name if it has no class) holding its text, plus one
        "link_<link text>" entry per link with an "href" inside the child.
    """
    details = {}
    for child in cont.children:
        # Skip the strings between the tags (see parse_infobox)
        if isinstance(child, str):
            continue
        label = (child.get("class") or [child.name])[0]
        # Elements called "label" carry no information of their own
        if label == "label":
            continue
        # Take the class as variable name and the contents as value
        details[label] = " ".join(child.text.strip().split())
        # Check if the element contains links, if so save them
        for link in child.find_all("a"):
            # "a" tags without an "href" attribute have no URL to save
            linkurl = link.get("href")
            if linkurl is None:
                continue
            # Take content in lowercase and with underscore as variable name
            varname = "link_" + "_".join(link.text.strip().lower().split())
            details[varname] = linkurl
    return details


# Save in large dictionary
resdict = {}
for url in df["url"].values:

    # Message to let us know where we are
    print("Loading %s" % url)

    # The timeout (in seconds) keeps a hanging server from blocking the loop
    r = requests.get(url, timeout = 30)
    # Stop with an HTTPError on 4xx/5xx responses instead of parsing an
    # error page
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")

    # Take contents of infobox
    infobox = soup.find("div", class_ = "info-box")
    event_dict = parse_infobox(infobox)

    # Next, take contents of main page
    cont = soup.find("div", class_ = "content") \
        .find("div", class_ = "description")
    event_dict.update(parse_description(cont))

    # Save in resdict and use url as index (we use this later to merge to
    # previously created df)
    resdict[url] = event_dict
# Create DataFrame
df_details = pd.DataFrame(resdict).T
# Look at it
df_details

Loading http://www.bccp-berlin.de/events/all-events/events-detail/julian-harke-and-kevin-tran/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/felix-holzmeister-university-of-innsbruck/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/bettina-siflinger-tilburg-university/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/ariel-rubinstein-tel-aviv-university-2/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/antonio-rosato-ut-sydney-1/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/12th-berlin-io-day/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/joergen-weibull-stockholm-school-of-economics-1/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/tom-chang-university-of-southern-california/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/jidong-zhou-yale-som-1/
Loading http://www.bccp-berlin.de/events/all-events/events-detail/tim-cason-purdue-univers

Unnamed: 0,address_address,address_name,address_place,address_zip,b-events__multi-performance,b-events__multi-performance__label,date,description__bodytext,headline--desktop,link_here,link_speaker_website,performance__details,teaser,time
http://www.bccp-berlin.de/events/all-events/events-detail/julian-harke-and-kevin-tran/,,,,,"Dates June 24, 2019 - June 26, 2019",Date Details,,This short course by BCCP Doctoral Students Ju...,DIW Graduate Center/BCCP Short Course on Web S...,/fileadmin/user_upload/syllabus_webscraping.pdf,,"June 24, 2019 Open Details Time 09:30–12:30 Lo...",,
http://www.bccp-berlin.de/events/all-events/events-detail/felix-holzmeister-university-of-innsbruck/,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,,,"June 27, 2019",Go to speaker website.,Felix Holzmeister (University of Innsbruck),,https://www.holzmeister.biz/,,Topic:Delegated decision making in finance,Time 16:45–18:00
http://www.bccp-berlin.de/events/all-events/events-detail/bettina-siflinger-tilburg-university/,"Spandauer Str. 1, Room 22",Humboldt-Universität zu Berlin,Berlin,10178.0,,,"July 01, 2019",Go to speaker website.,Bettina Siflinger (Tilburg University),,https://sites.google.com/site/bettinasiflinger/,,Topic:TBA,Time 16:00–17:15
http://www.bccp-berlin.de/events/all-events/events-detail/ariel-rubinstein-tel-aviv-university-2/,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,,,"July 01, 2019",Go to speaker website.,Ariel Rubinstein (Tel Aviv University),,https://en-social-sciences.tau.ac.il/profile/r...,,Topic:Normative Equilibrium:The permissible an...,Time 17:15–18:30
http://www.bccp-berlin.de/events/all-events/events-detail/antonio-rosato-ut-sydney-1/,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,,,"July 08, 2019",Go to speaker website.,Antonio Rosato (UT Sydney),,https://sites.google.com/site/rosatoeconomics/...,,Topic:Projection of Private Values in Auctions,Time 17:15–18:30
http://www.bccp-berlin.de/events/all-events/events-detail/12th-berlin-io-day/,Straße des 17. Juni 135,Technische Universität Berlin,,,,,"September 20, 2019",,12th Berlin IO Day,,,,,Time 09:00–18:00
http://www.bccp-berlin.de/events/all-events/events-detail/joergen-weibull-stockholm-school-of-economics-1/,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,,,"October 21, 2019",Go to speaker website.,Jörgen Weibull (Stockholm School of Economics),,https://sites.google.com/site/joergenweibull/,,Topic:TBA,Time 17:15–18:30
http://www.bccp-berlin.de/events/all-events/events-detail/tom-chang-university-of-southern-california/,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,,,"October 24, 2019",Go to speaker website.,Tom Chang (University of Southern California),,https://www.tomychang.com,,Topic:TBA,Time 16:45–18:00
http://www.bccp-berlin.de/events/all-events/events-detail/jidong-zhou-yale-som-1/,"Reichpietschufer 50, Room B 002/003",WZB,Berlin,10785.0,,,"October 28, 2019",Go to speaker website.,Jidong Zhou (Yale),,https://sites.google.com/site/jidongzhou77/,,Topic:TBA,Time 17:15–18:30
http://www.bccp-berlin.de/events/all-events/events-detail/tim-cason-purdue-university/,"Ernst-Reuter-Platz 1, Room BH-N 243",Technische Universität Berlin,Berlin,10587.0,,,"October 31, 2019",Go to speaker website.,Tim Cason (Purdue University),,https://krannert.purdue.edu/faculty/cason/home...,,Topic:TBA,Time 16:45–18:00


In [35]:
### Attach the detail-page information to the event list
# Join key: the "url" column of df against the index of df_details.
# A left join keeps every row of df; validate = "1:1" makes pandas raise
# if a key occurs more than once on either side.
# Columns of df_details whose name already exists in df (e.g. date) get
# the suffix "_details"; the columns of df keep their original names.
# Note: df is overwritten here, so re-running this cell would merge the
# details into the already merged frame.
df = df.merge(
    df_details,
    how = "left",
    left_on = "url",
    right_index = True,
    suffixes = ("", "_details"),
    validate = "1:1",
)

# Display the merged result
df

Unnamed: 0,date,event_series,event_type,loc_address,loc_name,loc_place,loc_zip,time,title,topic,...,b-events__multi-performance,b-events__multi-performance__label,date_details,description__bodytext,headline--desktop,link_here,link_speaker_website,performance__details,teaser,time_details
0,"June 24, 2019 - June 26, 2019",Other events,Conference & Events,"Mohrenstr. 58, Room Anna J. Schwartz",DIW,Berlin,10117.0,09:30–12:30,DIW Graduate Center/BCCP Short Course on Web S...,,...,"Dates June 24, 2019 - June 26, 2019",Date Details,,This short course by BCCP Doctoral Students Ju...,DIW Graduate Center/BCCP Short Course on Web S...,/fileadmin/user_upload/syllabus_webscraping.pdf,,"June 24, 2019 Open Details Time 09:30–12:30 Lo...",,
1,"June 27, 2019",Berlin Behavioral Economics Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,16:45–18:00,Felix Holzmeister (University of Innsbruck),Delegated decision making in finance,...,,,"June 27, 2019",Go to speaker website.,Felix Holzmeister (University of Innsbruck),,https://www.holzmeister.biz/,,Topic:Delegated decision making in finance,Time 16:45–18:00
2,"July 01, 2019",Berlin Applied Micro Seminar,Seminar,"Spandauer Str. 1, Room 22",Humboldt-Universität zu Berlin,Berlin,10178.0,16:00–17:15,Bettina Siflinger (Tilburg University),TBA,...,,,"July 01, 2019",Go to speaker website.,Bettina Siflinger (Tilburg University),,https://sites.google.com/site/bettinasiflinger/,,Topic:TBA,Time 16:00–17:15
3,"July 01, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Ariel Rubinstein (Tel Aviv University),Normative Equilibrium:The permissible and the ...,...,,,"July 01, 2019",Go to speaker website.,Ariel Rubinstein (Tel Aviv University),,https://en-social-sciences.tau.ac.il/profile/r...,,Topic:Normative Equilibrium:The permissible an...,Time 17:15–18:30
4,"July 08, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Antonio Rosato (UT Sydney),Projection of Private Values in Auctions,...,,,"July 08, 2019",Go to speaker website.,Antonio Rosato (UT Sydney),,https://sites.google.com/site/rosatoeconomics/...,,Topic:Projection of Private Values in Auctions,Time 17:15–18:30
5,"September 20, 2019",Berlin IO Day,Conference & Events,Straße des 17. Juni 135,Technische Universität Berlin,,,09:00–18:00,12th Berlin IO Day,,...,,,"September 20, 2019",,12th Berlin IO Day,,,,,Time 09:00–18:00
6,"October 21, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,17:15–18:30,Jörgen Weibull (Stockholm School of Economics),TBA,...,,,"October 21, 2019",Go to speaker website.,Jörgen Weibull (Stockholm School of Economics),,https://sites.google.com/site/joergenweibull/,,Topic:TBA,Time 17:15–18:30
7,"October 24, 2019",Berlin Behavioral Economics Seminar,Seminar,"Reichpietschufer 50, Room B001",WZB,Berlin,10785.0,16:45–18:00,Tom Chang (University of Southern California),TBA,...,,,"October 24, 2019",Go to speaker website.,Tom Chang (University of Southern California),,https://www.tomychang.com,,Topic:TBA,Time 16:45–18:00
8,"October 28, 2019",Berlin Micro Theory Seminar,Seminar,"Reichpietschufer 50, Room B 002/003",WZB,Berlin,10785.0,17:15–18:30,Jidong Zhou (Yale),TBA,...,,,"October 28, 2019",Go to speaker website.,Jidong Zhou (Yale),,https://sites.google.com/site/jidongzhou77/,,Topic:TBA,Time 17:15–18:30
9,"October 31, 2019",Berlin Behavioral Economics Seminar,Seminar,"Ernst-Reuter-Platz 1, Room BH-N 243",Technische Universität Berlin,Berlin,10587.0,16:45–18:00,Tim Cason (Purdue University),TBA,...,,,"October 31, 2019",Go to speaker website.,Tim Cason (Purdue University),,https://krannert.purdue.edu/faculty/cason/home...,,Topic:TBA,Time 16:45–18:00


In [36]:
###############################################################################
################################# 6. SAVE CSV #################################
###############################################################################

# Write the merged DataFrame to the path stored in savefile.
# Fields are separated by ";" and the file is encoded as UTF-8 with a
# byte order mark ("utf-8-sig").
df.to_csv(
    path_or_buf = savefile,
    sep = ";",
    encoding = "utf-8-sig",
)