# An Analysis of Trends in Economic Research

**Quentin Batista**

In this notebook, I scrape and analyze data from the National Bureau of Economic Research (NBER) working papers to identify trends in economic research. This analysis follows up on an [article](https://www.economist.com/finance-and-economics/2016/11/24/economists-are-prone-to-fads-and-the-latest-is-machine-learning) by The Economist which claimed that the use of Machine Learning techniques in Economics was a fad.

In [5]:
# Shell escape: print the current date/time so the notebook records when it was run
!date

2018年 10月 4日 木曜日 11時31分25秒 EDT


## Data Scraping

I fetch the data for NBER working paper abstracts through the program archive (http://www.nber.org/papersbyprog/) using `BeautifulSoup`. Each program archive contains two links: one for recent papers and one for older papers.

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen

program_url = 'http://www.nber.org/papersbyprog/'
programs_data = pd.DataFrame(columns=['program', 'main_url', 'archive_url', 'number_of_papers'])

# Fetch content of program's webpage.
# The context manager closes the HTTP response as soon as the body has been read.
with urlopen(program_url) as response:
    soup = BeautifulSoup(response.read(), 'html.parser')
table = soup.findAll('table', {'border': 0, 'cellspacing': 1, 'cellpadding': 4})[0]

# Get strings of text: even positions are used as program names, odd positions
# as the text from which the paper count is extracted.
strings = list(table.stripped_strings)
programs_data['program'] = strings[0::2]
# Raw string so that `\d` reaches the regex engine instead of being parsed as an
# (invalid) Python string escape.
programs_data['number_of_papers'] = (
    pd.Series(strings[1::2]).str.extract(r'(\d+)', expand=False).astype(int)
)

# Get url for program
for row in table.findAll('a'):
    # Rows whose program name equals the link text
    is_program = programs_data['program'] == row.text
    programs_data.loc[is_program, 'main_url'] = program_url + row['href']
    # The archive page shares the link's file name with '_archive' inserted before
    # the extension (e.g. 'AG.html' -> 'AG_archive.html').
    programs_data.loc[is_program, 'archive_url'] = program_url + row['href'].replace('.', '_archive.')
    

In [2]:
# Display the scraped program table (name, main/archive URLs, number of papers)
programs_data

Unnamed: 0,program,main_url,archive_url,number_of_papers
0,Aging,http://www.nber.org/papersbyprog/AG.html,http://www.nber.org/papersbyprog/AG_archive.html,1382
1,Asset Pricing,http://www.nber.org/papersbyprog/AP.html,http://www.nber.org/papersbyprog/AP_archive.html,2260
2,Corporate Finance,http://www.nber.org/papersbyprog/CF.html,http://www.nber.org/papersbyprog/CF_archive.html,1864
3,Children,http://www.nber.org/papersbyprog/CH.html,http://www.nber.org/papersbyprog/CH_archive.html,1372
4,Development of the American Economy,http://www.nber.org/papersbyprog/DAE.html,http://www.nber.org/papersbyprog/DAE_archive.html,1457
5,Development Economics,http://www.nber.org/papersbyprog/DEV.html,http://www.nber.org/papersbyprog/DEV_archive.html,811
6,Economics of Education,http://www.nber.org/papersbyprog/ED.html,http://www.nber.org/papersbyprog/ED_archive.html,1285
7,Environment and Energy Economics,http://www.nber.org/papersbyprog/EEE.html,http://www.nber.org/papersbyprog/EEE_archive.html,981
8,Economic Fluctuations and Growth,http://www.nber.org/papersbyprog/EFG.html,http://www.nber.org/papersbyprog/EFG_archive.html,4975
9,Health Care,http://www.nber.org/papersbyprog/HC.html,http://www.nber.org/papersbyprog/HC_archive.html,1350


In [192]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import Legend
from bokeh.palettes import Magma256
from bokeh.models import ColumnDataSource
from bokeh.transform import factor_cmap

# Render Bokeh figures inline in the notebook
output_notebook()

# Each program's share of the summed paper counts, in percent
paper_counts = programs_data['number_of_papers']
programs_data['paper_frequency'] = paper_counts / paper_counts.sum() * 100

TOOLS = "crosshair,pan,wheel_zoom,reset,tap,save"

# Built after `paper_frequency` is added so the new column is part of the source
source = ColumnDataSource(data=programs_data)

# One colour per program, taken from every 12th entry of the Magma256 palette
program_names = programs_data['program']
bar_fill = factor_cmap('program', palette=Magma256[0::12], factors=program_names)

p = figure(
    tools=TOOLS,
    x_range=program_names,
    plot_width=900,
    plot_height=600,
    toolbar_location="above",
)
p.vbar(
    x='program',
    top='paper_frequency',
    width=0.9,
    source=source,
    line_color='white',
    fill_color=bar_fill,
)

# Axis and frame styling
p.y_range.start = 0
p.x_range.range_padding = 0.05
p.xgrid.grid_line_color = None
p.outline_line_color = None
p.xaxis.axis_label = "Program Name"
p.xaxis.major_label_orientation = 1.2
p.yaxis.axis_label = "% of Total Papers"

show(p)

Note: papers appearing in different categories are counted multiple times.

In [3]:
# Sum of the per-program paper counts (a paper listed under several programs is counted once per program)
programs_data['number_of_papers'].sum()

45013

In [4]:
papers_data = pd.DataFrame(columns=['id', 'title', 'authors', 'program', 'url', 'year', 'abstract'])

# Get ID and URL for papers in each program.
# Links are accumulated in plain lists and attached to the frame once at the end;
# enlarging the frame one row at a time copies it on every insertion.
paper_ids = []
paper_urls = []
for papers_url in programs_data[['main_url', 'archive_url']].values.ravel():
    # The context manager closes the HTTP response as soon as the body has been read.
    with urlopen(papers_url) as response:
        soup = BeautifulSoup(response.read(), 'html.parser')
    table = soup.findAll('table', {'border': 0, 'cellspacing': 1, 'cellpadding': 4})[0]

    # Every link in the table contributes one row: its text is the ID, its href the URL
    for row in table.findAll('a'):
        paper_ids.append(row.text)
        paper_urls.append(row['href'])

# Assigning to the empty frame creates one row per link; the remaining columns
# stay missing until the abstract-scraping step fills them in.
papers_data['id'] = paper_ids
papers_data['url'] = paper_urls

In [6]:
# Remove exact duplicate rows, then rows whose URL is an empty string
deduplicated = papers_data.drop_duplicates()
with_url = deduplicated[deduplicated['url'] != '']

# Keep only IDs that contain a space followed by 'w' or 't' and then a digit.
# Other IDs do not appear to contain abstracts.
has_paper_id = with_url['id'].str.contains(' [tw][0-9]')
papers_data = with_url[has_paper_id].reset_index(drop=True)

# Total number of unique papers
len(papers_data)

25119

In [40]:
import re

# Get abstract and other info for each paper.
# Errors can sometimes occur -- this cell can be run multiple times until all the data has been scraped.
# A row counts as "done" once its abstract is filled in, so every field is parsed first and
# the abstract is written last: a failure part-way through a page leaves the row untouched
# and it is retried on the next run.
for i, url in enumerate(papers_data.url):
    # pd.isna is a value check; `is np.nan` only succeeds when the stored object
    # happens to be the np.nan singleton.
    if not pd.isna(papers_data.loc[i, 'abstract']):
        continue

    # The context manager closes the HTTP response as soon as the body has been read.
    with urlopen(url) as response:
        soup = BeautifulSoup(response.read(), 'html.parser')

    table = soup.findAll('table', {'id': 'mainTable'})[0]
    paragraphs = soup.findAll('p', {'style': 'margin-left: 40px; margin-right: 40px; text-align: justify'}, text=True)

    title = table.findAll('h1', {'class': 'title'})[0].text
    authors = table.findAll('h2', {'class': 'bibtop'})[0].text
    # Bibliographic paragraph: program is the text after the first ':' on its line,
    # year the text after 'Issued in '.
    bib_text = table.findAll('p', {'class': 'bibtop'})[0].text
    program = re.search(':(.*)', bib_text).group(1)
    year = re.search('Issued in (.*)', bib_text).group(1)
    text = ''.join(paragraph.text for paragraph in paragraphs)

    papers_data.loc[i, 'title'] = title
    papers_data.loc[i, 'authors'] = authors
    papers_data.loc[i, 'program'] = program
    papers_data.loc[i, 'year'] = year
    # Written last because it doubles as the "already scraped" marker checked above
    papers_data.loc[i, 'abstract'] = text

In [None]:
# Turn the scraped issue-date text into an integer year: keep the part before the first
# comma and take its first run of digits.
# NOTE(review): assumes that part looks like 'Month YYYY' -- confirm against the scraped values.
# `int` replaces the removed `np.int` alias, the pattern is a raw string so `\d` is not an
# invalid string escape, and expand=False makes extract return a Series.
papers_data['year'] = (
    papers_data['year']
    .str.split(',').str[0]
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

## Analysis

I considered two approaches here. The first was to use an approximate string matching technique such as [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) to realize the matches using the original keywords in the article's chart. The problem is that this technique is not sufficiently powerful for this application. This is particularly evident for the string 'dynamic stochastic general equilibrium', which is closer to 'equilibrium' than to 'dsge'. As such, using this technique would give meaningless results for this string. Additionally, even though strings such as 'neural nets' and 'machine learning' are closely related from a semantic point of view, they are far away from each other based on Levenshtein distance. As such, I resorted to the second approach: using a dictionary containing different variants of the keywords of interest, and counting how often they appear in abstracts.

In [213]:
# Load the paper table from disk, using the first CSV column as the index.
# NOTE(review): no visible cell writes 'new_abstract_data.csv' -- presumably it is a saved
# copy of the scraped `papers_data`; confirm how and when it was produced.
papers_data = pd.read_csv('new_abstract_data.csv', index_col=0)

In [214]:
# Lower-case the searchable text columns (the keyword variants searched for are all lower-case)
for text_column in ('abstract', 'title'):
    papers_data[text_column] = papers_data[text_column].str.lower()

In [215]:
# Maps each technique label (used as a column name downstream) to the lower-case
# keyword variants that count as a mention of that technique.
techniques = {
    'diff-in-diff': [
        'difference in difference',
        'differences in difference',
        'difference-in-difference',
        'differences-in-difference',
        'diff-in-diff',
        'diff in diff',
    ],
    'regression discontinuity': [
        'regression discontinuity',
    ],
    'dynamic stochastic general equilibrium': [
        'dynamic stochastic general equilibrium',
        'dsge',
        'sdge',
        'stochastic dynamic general equilibrium',
    ],
    'randomized controlled trial': [
        'randomized controlled trial',
        'rct',
    ],
    'laboratory experiments': [
        'laboratory',
        'lab experiment',
    ],
    'machine learning': [
        'machine learning',
        'big data',
        'deep learning',
        'neural network',
        'supervised learning',
        'unsupervised learning',
        'random forest',
        'reinforcement learning',
        'neural net',
        'lasso',
        'decision tree',
        'semi-supervised learning',
    ],
    'artificial intelligence': [
        'artificial intelligence',
    ],
    'vector autoregression': [
        'vector autoregression',
        'var ',  # the trailing space is part of the keyword
        'vector auto regression',
    ],
    'structural estimation': [
        'structural estimation',
    ],
    'rational expectation': [
        'rational expectation',
    ],
    'bounded rationality': [
        'bounded rationality',
        'boundedly rational',
        'behavioral economics',
    ],
}


In [216]:
# For every technique, add a boolean column that is True when any of its keyword
# variants occurs in the paper's abstract or title.
for technique, keywords in techniques.items():
    mentioned = pd.Series(False, index=papers_data.index)
    for keyword in keywords:
        for text_column in ('abstract', 'title'):
            # na=False: a missing abstract (or title) counts as "no match" instead of
            # producing NaN, which would otherwise wipe out a match found in the other column.
            # regex=False: keywords are literal substrings, not patterns.
            mentioned |= papers_data[text_column].str.contains(keyword, regex=False, na=False)
    papers_data[technique] = mentioned

In [217]:
# Share of each year's papers (in %) that mention each technique, then smoothed over time.
grouped = papers_data.groupby('year')
# Aggregate only the boolean technique columns. Summing every column relies on the text
# columns (title, abstract, ...) being silently dropped, which newer pandas no longer does.
summary_stats = grouped[list(techniques)].sum()
# Mentions per year divided by papers per year
summary_stats = summary_stats.divide(grouped.size(), axis='rows') * 100
# Trailing 3-row window; the first two years are NaN because the window is incomplete
moving_average = summary_stats.rolling(window=3, center=False).mean()

In [219]:
# `all_palettes` is used below but not imported by any visible cell; import it here
# so the cell runs on a fresh kernel.
from bokeh.palettes import all_palettes

lines = []  # not used in this cell; kept in case other cells reference it
legend_it = []

# Category20 palette sized to the number of plotted series
colors = all_palettes['Category20'][len(moving_average.columns)]

p = figure(tools=TOOLS, plot_width=900, plot_height=600, toolbar_location="above",
           title='Mentions in NBER abstracts, % of 3-year moving average')

# One line per technique; each renderer is kept so it can get its own legend entry
for col, color in zip(moving_average.columns, colors):
    temp_line = p.line(x=moving_average.index.values,
                       y=moving_average[col],
                       color=color)
    legend_it.append((col, [temp_line]))

# Legend is built by hand so it can sit outside the plot area; clicking an entry hides its line
legend = Legend(items=legend_it, location=(10, 200))
legend.click_policy = "hide"

p.add_layout(legend, 'right')

show(p)

- Word embedding