# Data scraping notebook

To avoid manually downloading data from different websites, I used the Selenium WebDriver and Beautiful Soup packages to parse and process HTML into pandas DataFrames.

## Import libraries

In [1]:
from selenium import webdriver
import bs4
import numpy as np
import pandas as pd

In [2]:
# Start the Firefox webdriver that is used to load the pages to be scraped.
# The geckodriver binary is looked up by the relative path "geckodriver.exe".
# NOTE(review): newer Selenium releases appear to take the driver path through a
# Service object rather than `executable_path` — confirm against the installed version.
driver = webdriver.Firefox(executable_path=r"geckodriver.exe")

## Count recent election dates from various countries

In [3]:
def get_election_dates(soup, country):
    """
    Extract the elections listed in the sidebar table of a country page.

    Each table row is expected to hold the election year in its first cell and
    one cell per election type after it. A cell counts as "an election of that
    type took place" when it contains a link wrapping an image.

    Input:
        soup- BS4 soup element of the whole page; must contain a
              <div id="sidebar"> with a table and a tbody inside it
        country- string, country name (copied into every output record)
    Output:
        list- each list element is a list [Year, Election Type, Country]
    Raises:
        AttributeError- if the sidebar div, its table or its tbody is missing
    """
    # Column index -> election type. Column 0 holds the year, so it has no entry.
    election_types = {1: "Parlament", 2: "Presidential", 3: "Euro Parlament", 4: "Referendums"}

    _data_list = list()
    rows = soup.find("div", {"id": "sidebar"}).table.tbody.find_all("tr")
    for row in rows:
        cols = row.find_all("td")
        # Rows without data cells (e.g. header rows) carry no election info.
        if not cols:
            continue

        # The year is shared by every election found in this row; rows whose
        # first cell is not a number are skipped instead of aborting the scrape.
        try:
            e_year = int(cols[0].text)
        except ValueError:
            continue

        for i, e_type in election_types.items():
            # Tolerate rows that are shorter than the full set of columns.
            if i >= len(cols):
                continue
            link = cols[i].a
            # Require both the link and the image inside it: a link without an
            # image evaluates to None rather than raising, so it must be tested
            # explicitly.
            if link is not None and link.img is not None:
                _data_list.append([e_year, e_type, country])

    return _data_list

In [5]:
# Country name -> URL of that country's page in the European Election Database.
country_dict = {"France": "http://www.nsd.uib.no/european_election_database/country/france/",
               "Finland": "http://www.nsd.uib.no/european_election_database/country/finland/",
               "Lithuania": "http://www.nsd.uib.no/european_election_database/country/lithuania/",
               "Austria": "http://www.nsd.uib.no/european_election_database/country/austria/"}

data_list = list()
# iterate over the country dictionary, one page load per country
for country_name, country_url in country_dict.items():
    # load the necessary page in the browser
    driver.get(country_url)
    # Parse the rendered page. The parser is named explicitly so the result does
    # not depend on which optional parser libraries happen to be installed (and
    # so that BeautifulSoup does not warn about having to guess one).
    soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
    # read election years and types for this country
    data_list.extend(get_election_dates(soup, country_name))

# One row per (year, election type, country) record collected above.
df = pd.DataFrame(data_list, columns=["Year", "Election type", "Country"])
df.to_csv("election_years.csv")
df.head()

Unnamed: 0,Year,Election type,Country
0,1992,Referendums,France
1,1993,Parlament,France
2,1994,Euro Parlament,France
3,1995,Presidential,France
4,1997,Parlament,France


## Turn off webdriver

In [6]:
# Scraping is finished, so shut the webdriver down.
driver.quit()