This notebook does topic modeling on EGU abstracts from the 2018 EGU meeting, the main premise is that the PDF files are organized in the same way, first the EGU copyright notice, then the abstract title, then the authors and lastly the abstract content.

We parse the PDFs using PDFMiner's utility pdf2txt

```sh
ls *.pdf | xargs -n1 -P8 bash -c 'pdf2txt.py -o output/$0.txt -t text $0'
```

The current notebook uses abstracts from the atmospheric science interest group (AS) thus the modeling is not going to be illustrative of the EGU conference as a whole but rather a sub topic modelling exercise.


In [1]:
# We need to download nltk's wordnet first
import nltk
nltk.download('wordnet')
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import json
import glob

[nltk_data] Downloading package wordnet to /home/beto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# We create a list of objects containing the title, the entities and the abstract.
# For now we only use the abstract content but later we can do NER training with the corpus using the entities field
files_path = './abstracts/output/*.txt'
files = glob.glob(files_path)
documents = []
for file in files:
    try:
        with open(file) as f:
            data = f.read()
            # We split the paragraphs
            item = data.split('\n\n')
            title = item[1]
            # Cleaning the entities, i.e. "John Dow, (1) U of Colorado -> 'John dow','U of colorado'"
            entities = item[2].replace('(', '').replace(')', '').replace('\n', ',').split(',')
            entities = [e for e in entities if len(e)>2]
            entities = [''.join([i for i in s if not i.isdigit()]) for s in entities]
            abstract = item[3].replace('\n', ' ')
            doc = {
                'file': f.name,
                'title' : title,
                'entities': entities,
                'abstract': abstract
            }
            documents.append(doc)
    except IOError as exc: #Not sure what error this is
        if exc.errno != errno.EISDIR:
            raise

In [16]:
## we need a tokenizer
tokenizer = RegexpTokenizer(r'\w+')
## we need stemer
stemmer = WordNetLemmatizer()
## our custom stop words
my_stop_words = {
                    'http', 'www', 'area', 'time', 'measurement', 'data', 'event', 'service',
                    'group', 'research', 'study', 'use', 'work', 'member', 'case',
                    'meeting', 'news', 'model', 'project', 'standard',
                    'statement', 'school', 'university', 'output', 'brokering',
                    'repository', 'user', 'citation', 'chair', 'framework', 'information',
                    'metadata', 'content', 'sharing', 'pid'
                }
stop_words = my_stop_words.union(ENGLISH_STOP_WORDS)

# document list will contain our corpus after cleaning it.
document_list = []
# pairs is a list of the urls and the size of their content
pairs = []
# just the documents urls
urls = []

def clean_document(doc):
    tokens = tokenizer.tokenize((doc).lower())
    # We lematize (stemming)
    stemmed_tokens = [stemmer.lemmatize(i) for i in tokens]
    # If the token is not in our stop words and the length is >2 and <20 we add it to the cleaned document
    document = [i for i in stemmed_tokens if i not in stop_words and (len(i) > 2 and len(i) < 25)]
    return document

for doc in documents:
    document = clean_document(doc['abstract'])
    document_list.append(document)

In [17]:
num_passes = 5
num_topics = 8
words_per_topic = 6

dictionary = corpora.Dictionary(document_list)
corpus = [dictionary.doc2bow(text) for text in document_list]
lda_model = models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, passes=num_passes)
topics = lda_model.print_topics(num_topics=num_topics, num_words=words_per_topic)
# Now let's print the topics found
for topic in topics:
    print(topic)

(0, '0.010*"solar" + 0.007*"mission" + 0.007*"hail" + 0.006*"particle" + 0.006*"sep" + 0.004*"surface"')
(1, '0.005*"surface" + 0.005*"huygens" + 0.005*"2017" + 0.004*"lightning" + 0.004*"reconnection" + 0.004*"region"')
(2, '0.012*"climate" + 0.008*"change" + 0.008*"surface" + 0.007*"temperature" + 0.006*"result" + 0.005*"emission"')
(3, '0.007*"climate" + 0.006*"ocean" + 0.005*"earth" + 0.005*"high" + 0.005*"impact" + 0.005*"global"')
(4, '0.009*"emission" + 0.006*"wave" + 0.006*"high" + 0.005*"scale" + 0.005*"region" + 0.004*"atmospheric"')
(5, '0.019*"ozone" + 0.008*"ice" + 0.007*"surface" + 0.004*"space" + 0.004*"solar" + 0.004*"nucleation"')
(6, '0.008*"urban" + 0.007*"surface" + 0.006*"change" + 0.005*"deposition" + 0.005*"land" + 0.005*"temperature"')
(7, '0.008*"cre" + 0.006*"forecast" + 0.006*"ﬁeld" + 0.005*"monsoon" + 0.005*"region" + 0.004*"debris"')
