# IGM Economic Experts Panel Data Collection
## Oliver Gladfelter
### Jan 6th, 2020

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import os
import re

## 1) Open http://www.igmchicago.org/igm-economic-experts-panel and save web page as an HTML file
## 2) Access HTML file and scrape each month's publish date, title, and url. Save csv file.

In [3]:
# Parse the locally saved copy of the IGM panel index page.
with open("IGM Economic Experts Panel _ IGM Forum.html", encoding='utf8') as f:
    contents = f.read()

bs = BeautifulSoup(contents, 'lxml')

# One <div> per survey; keep the divs carrying either of these two class strings.
divs = bs.find_all('div', {"class": ["poll-listing poll-results", "post poll-results"]})

# Within each survey div: <h6> holds the publish date, the first <a> the link
# to the survey page, and <h2> the survey title.
dateTime = [div.find('h6').text for div in divs]
url = [div.find('a')['href'] for div in divs]
title = [div.find('h2').text for div in divs]

df = pd.DataFrame({'date': dateTime, 'title': title, 'url': url})

# Build the output path with os.path.join rather than a hard-coded "\\" so the
# file lands inside the output folder on every OS (on POSIX a backslash is an
# ordinary filename character, not a separator). Create the folder if needed.
output_dir = "output-data"
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, "IGMPanelLinks.csv"), index=False)

## 3) All survey questions' web pages also need to be saved as HTML files. We stored them in the 'survey-html-files' folder. 
## 4) Open all files in the folder, scrape the answers data, export csv

In [5]:
# folder holding the saved survey HTML pages
path = 'survey-html-files'

# take a copy of the directory listing
filenames = list(os.listdir(path))

# notebook display: how many entries the folder contains
len(filenames)

196

In [8]:
data = []  # one DataFrame per survey file; concatenated after the loop

for filename in os.listdir(path):

    # Join with `path` (the folder being listed) instead of a hard-coded
    # "survey-html-files\\" prefix, which only resolves on Windows.
    with open(os.path.join(path, filename), encoding='utf8') as f:
        contents = f.read()

    bs = BeautifulSoup(contents, 'lxml')

    # parallel column lists for this file; one entry per (question, panelist) row
    questionList = []
    dateTimeList = []
    nameList = []
    voteList = []
    confidenceList = []

    # scrape the date (text of the first <h6> on the page)
    date = bs.find_all('h6')[0].text

    # scrape the questions
    questions = bs.find_all('h3', {'class': 'surveyQuestion'})

    # question text with new lines removed and leading/trailing spaces stripped
    questionText = [re.sub("\n", "", question.text).strip(" ") for question in questions]

    # look the tables up once; there should be one table per question, in the
    # same order (a table without a matching question raises IndexError below)
    tables = bs.find_all('table')

    # for each table...
    for tableIndex, table in enumerate(tables):

        # find all the rows containing data
        tableRows = table.find_all('tr', {"class": "parent-row"})

        # for each row...
        for row in tableRows:
            columns = row.find_all('td')  # select all columns

            # strip("\n\t") removes any mix of surrounding newlines and tabs;
            # chaining strip("\n").strip("\t") would leave e.g. a trailing "\n"
            # behind when the padding is interleaved ("\n\tX\n\t" -> "X\n")
            nameList.append(columns[0].text.strip("\n\t"))  # pull name
            voteList.append(columns[2].text.strip("\n\t"))  # pull vote
            confidenceList.append(columns[3].text.strip("\n\t"))  # pull confidence

            questionList.append(questionText[tableIndex])  # add corresponding question to list
            dateTimeList.append(date)

    # convert lists to dataframe and append to data list
    data.append(pd.DataFrame({
        'question': questionList,
        'date': dateTimeList,
        'name': nameList,
        'vote': voteList,
        'confidence': confidenceList,
    }))

# convert list of dataframes into one dataframe
data = pd.concat(data)

In [281]:
# Build output paths with os.path.join rather than a hard-coded "\\" so the
# files land inside the output folder on every OS (on POSIX a backslash is an
# ordinary filename character, not a separator). Create the folder if needed.
output_dir = "output-data"
os.makedirs(output_dir, exist_ok=True)

# long form: one row per (question, panelist) answer
data.to_csv(os.path.join(output_dir, "answers.csv"), index=False)

# saving data in pivot form: one row per name, one column per question, cells
# hold the vote. pivot() raises on repeated (name, question) pairs, so only the
# first occurrence of each pair is kept beforehand.
dataPivot = (
    data.drop_duplicates(['question', 'name'])
    .pivot(index='name', columns='question', values='vote')
    .reset_index()
)
dataPivot.to_csv(os.path.join(output_dir, "answersAsColumns.csv"), index=False)