# Paper Abstracts Analysis

By Rafael Ballestiero

In [143]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Folder that holds the input CSV; cached artifacts are written here as well.
# Override it without editing the notebook by setting the ABSTRACTS_PATH
# environment variable; otherwise the original author's folder is used.
PATH = os.environ.get('ABSTRACTS_PATH', '/Users/rafa/dv/INSEAD/abstracts')
FILENAME = 'papers.csv'

# Fail early with an actionable message when the notebook is run on a machine
# where the folder does not exist (os.chdir would raise the same exception
# type, but without any hint on how to fix it).
if not os.path.isdir(PATH):
    raise FileNotFoundError(
        f'Data folder {PATH!r} does not exist. '
        'Set the ABSTRACTS_PATH environment variable or edit PATH in this cell.'
    )

# Every later cell uses file names relative to this folder.
os.chdir(PATH)

## 1. Get Abstracts for Paper

The first step is to follow the DOI links provided in the spreadsheet and retrieve the abstract of each paper. This is done with the help of some HTML parsing libraries, after a quick inspection of the page structure to find the HTML element in which the abstract appears.

In [None]:
import requests
import urllib.request
from bs4 import BeautifulSoup

# The spreadsheet has no header row; its first column holds the links to scrape.
paper_urls = (
    pd.read_csv(FILENAME, header=None)
    .iloc[:, 0]
    .tolist()
)

In [150]:
def retrieve_abstract(url, timeout=30):
    """Download a paper page and return the text of its abstract.

    The abstract is read from the first ``<p>`` inside the first
    ``<div class="abstractSection">`` element of the page.

    Parameters
    ----------
    url : str
        Address of the paper page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a single
        unresponsive host cannot hang the whole scraping loop.

    Returns
    -------
    str
        Text of the abstract paragraph.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no abstract section, or the section has no paragraph.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing parse failure below.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    abstract_section = soup.find("div", {"class": "abstractSection"})
    # `find` returns None when nothing matches, and `.p` is None when the
    # section has no <p>; report which URL was affected instead of failing
    # with an AttributeError on None.
    if abstract_section is None or abstract_section.p is None:
        raise ValueError(f'No abstract found at {url}')

    return abstract_section.p.get_text()

Once the abstracts have been retrieved, we store them in a CSV file so that later runs can read them from disk instead of scraping the pages again.

In [151]:
# Load the abstracts from the on-disk cache when it exists; otherwise scrape
# them once and write the cache. Either way `df` ends up with exactly two
# columns: 'URL' and 'Abstract'.
if os.path.isfile('abstracts.csv'):
    print('Reading abstracts from CSV')
    df = pd.read_csv('abstracts.csv', index_col=0)

    # A cache written with the URL as the index has a single data column;
    # promote the index to a column so the two-name assignment below fits.
    if df.shape[1] == 1:
        df = df.reset_index()
else:
    abstracts = {}

    # Count from 1 so the progress message ends at (N/N) rather than (N-1/N).
    for i, url in enumerate(paper_urls, start=1):
        print(f'({i}/{len(paper_urls)})Retrieving abstract for {url}...')
        abstracts[url] = retrieve_abstract(url)

    # Keep the URL as a regular column (not the index) so the frame already
    # has the two columns named below, and so the cache round-trips through
    # read_csv(index_col=0) with the same layout.
    df = pd.DataFrame({
        'URL': list(abstracts.keys()),
        'Abstract': list(abstracts.values()),
    })

    # Ensure integrity of abstracts (none of them are null)
    assert df['Abstract'].notna().all(), 'Some abstracts are missing'

    df.to_csv('abstracts.csv')

df.columns = ['URL', 'Abstract']

Reading abstracts from CSV


## 2. Clean Abstracts

The text needs to be cleaned before our model is trained: each abstract is first split into word tokens (A), and then stopwords are removed (B).

In [152]:
# NOTE(review): `lemmatize` is not used in any cell visible here, and newer
# gensim releases (4.x) appear to no longer provide it in gensim.utils —
# confirm the installed gensim version before upgrading.
from gensim.utils import simple_preprocess, lemmatize

import nltk
# Fetch NLTK's stopword lists; the captured output shows this is a no-op when
# the package is already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/rafa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### A. Preprocess Abstracts

In [153]:
# Run gensim's simple_preprocess on every abstract: one token list per paper.
corpus = df['Abstract'].map(simple_preprocess)

### B. Filter Out Stopwords

In [154]:
# English stopwords from NLTK, held in a set so that the membership test in
# filter_stopwords is a hash lookup rather than a list scan.
stopwords = set(nltk.corpus.stopwords.words('english'))

def filter_stopwords(sentence):
    """Return the words of ``sentence`` that are not stopwords.

    ``sentence`` is an iterable of word tokens. The result is a new list that
    keeps the original order (and any repeated words); tokens found in the
    module-level ``stopwords`` set are dropped.
    """
    return [token for token in sentence if token not in stopwords]

In [155]:
# Drop stopwords from every tokenised abstract.
corpus = corpus.map(filter_stopwords)

## 3. Train Model

In [156]:
from gensim.models import Word2Vec, Doc2Vec

In [161]:
# Train word embeddings on the cleaned, tokenised abstracts.
# NOTE(review): `size=200` looks like the gensim 3.x spelling of the vector
# dimensionality (gensim 4 calls it `vector_size`) — confirm the installed
# version before changing it.
model = Word2Vec(corpus, size=200)

# Persist the trained model as 'abstracts_model' in the current working directory.
model.save('abstracts_model')

In [174]:
# `words`: the model's vocabulary; `wvs`: the vectors looked up for those
# words (presumably one row per word, in the same order — TODO confirm).
# NOTE(review): `index2word` looks like the gensim 3.x attribute name
# (gensim 4 uses `index_to_key`) — confirm the installed version.
words = model.wv.index2word
wvs = model.wv[words]

## 4. Cluster Words

In [166]:
from sklearn.cluster import KMeans, AffinityPropagation