In [1]:
import nltk
import numpy as np
import os
import pandas as pd
import pickle
import re
import spacy
import unicodedata

In [2]:
# This is a pared-down normalization since I want to use spaCy to parse the text, and it requires
# certain standard preprocessing steps NOT be performed (i.e., lowercasing words).

def normalize(text):
    '''Convert to ascii, remove special characters associated with LaTeX when given a df column'''
    normalized_text = []
    
    for t in text:
        t = unicodedata.normalize('NFKD', t).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        t = re.sub('\\\\', ' ', t)
        t = re.sub('[${}]', '', t)
        normalized_text.append(t)
    
    return normalized_text

# This function is to remove punctuation after spaCy parsing.

def remove(token):
    '''Provide feedback on whether a token is punctuation, whitespace, or stopword'''
    return token.is_punct or token.is_space or token.is_stop

In [3]:
# Set up spaCy's English abilities
nlp = spacy.load('en')

In [4]:
file = os.path.join('..', 'data', 'astro_raw_137k')

file

'..\\data\\astro_raw_137k'

First, import the journal abstracts.  Second, drop the 'journal_ref' column since it means nothing (all values = 'No journal reference found'), then drop the 'id' and 'pdf_url' columns since 'url' contains the same information.  

In [5]:
df = pd.read_csv(file, index_col=0, nrows=50000).reset_index(drop=True)
df.drop(['journal_ref', 'id', 'pdf_url'], axis=1, inplace=True)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
abstract        50000 non-null object
authors         50000 non-null object
comment         50000 non-null object
main_author     50000 non-null object
publish_date    50000 non-null object
term            50000 non-null object
terms           50000 non-null object
title           50000 non-null object
update_date     50000 non-null object
url             50000 non-null object
dtypes: object(10)
memory usage: 3.8+ MB


Now let's look at the records and what we have.  

First, unique abstracts.  Duplicates would be unexpected.  You wouldn't think that two abstracts would randomly be exactly the same.  You could, however, expect it if they were a reference to conference proceedings.  I'll look for that.  Hm...  It looks like there are cases might be explained by the same abstract being used for multiple conferences.  I'll drop the earlier version.

In [7]:
df.abstract.nunique()

49998

In [8]:
len(df.abstract) - df.abstract.nunique()

2

In [9]:
df.abstract.loc[df.abstract.str.contains('conference proceeding')]

4094     The IMAGINE Consortium aims to bring modeling ...
16388    Neutron stars are the endpoint of the life of ...
24620    In this conference proceeding, I discuss in de...
27709    Index of H.E.S.S. conference proceedings to th...
34075    As the density of matter increases, atomic nuc...
36813    It is understood that strong magnetic fields a...
39994    The LUX collaboration new results advance the ...
Name: abstract, dtype: object

In [10]:
dups = df.loc[df.duplicated('abstract')==True]
dups

Unnamed: 0,abstract,authors,comment,main_author,publish_date,term,terms,title,update_date,url
37026,The Telescope Array (TA) shows a 20$^{\circ}$ ...,"Noemie Globus, Denis Allard, Etienne Parizot, ...","28 pages, 19 figures, accepted in ApJ",Tsvi Piran,2016-10-17 20:00:16+00:00,astro-ph.HE,astro-ph.HE,Can we reconcile the TA excess and hotspot wit...,2017-01-26 23:48:38+00:00,http://arxiv.org/abs/1610.05319v2
37507,ImageJ is a graphical user interface (GUI) dri...,"Karen A. Collins, John F. Kielkopf, Keivan G. ...",Accepted by AJ,Frederic V. Hessman,2016-01-11 21:00:04+00:00,astro-ph.IM,astro-ph.IM|astro-ph.EP|astro-ph.SR,AstroImageJ: Image Processing and Photometric ...,2017-01-17 09:51:05+00:00,http://arxiv.org/abs/1601.02622v2


In [11]:
df.drop(dups.index,inplace=True)

One more check on the abstracts: how many of these papers have been withdrawn?  They need to be removed.

In [12]:
withdrawn = df.loc[df.abstract.str.contains('withdrawn')]
withdrawn

Unnamed: 0,abstract,authors,comment,main_author,publish_date,term,terms,title,update_date,url


In [13]:
len(withdrawn)

0

In [14]:
df.drop(withdrawn.index,inplace=True)

Now, on to other columns.  I'll skip over columns like 'author' where uniqueness can't help us.

Title duplicates are understandable, but they might need to be cleaned later.  I'll store them for now.

In [15]:
title_dups = df.loc[df.duplicated('title')==True]
len(title_dups)

31

Check to make sure that only the 6 subcategories are included as the primary category since astro-ph was deprecated in 2009.  Although arXiv has gone back and reclassified some of the articles in the 'terms' column, it's missed some and put gibberish in for others.  Drop the articles with the primary category 'astro-ph' that appear in the df now.  This classification won't help us map back to the six categories later.

In [16]:
df.term.loc[df.term.str.startswith('ast')].nunique()

7

In [17]:
len(df.loc[df.term=='astro-ph'])

25

In [18]:
deprecated_prim = df.term.loc[df.term=='astro-ph'].index
df.drop(deprecated_prim,inplace=True)

Finally, the info shows what we expect after dropping the excess columns, abstract duplicates, and withdrawn papers.  It also shows that there are no null values in the df.

In [19]:
df.reset_index(drop=True,inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49973 entries, 0 to 49972
Data columns (total 10 columns):
abstract        49973 non-null object
authors         49973 non-null object
comment         49973 non-null object
main_author     49973 non-null object
publish_date    49973 non-null object
term            49973 non-null object
terms           49973 non-null object
title           49973 non-null object
update_date     49973 non-null object
url             49973 non-null object
dtypes: object(10)
memory usage: 3.8+ MB


### Time to clean everything up

Steps:
*  I won't try to strip html or keep LaTeX formatting.  These aren't real LaTeX documents that could be converted; they're snippets of LaTeX that represent equations and Greek letters.  The symbols could be useful in other circumstances but don't contribute value to this project.  
*  I will use unicodedata.normalize to convert special characters and a simple regex to remove the LaTeX leftovers.
*  I'll let spaCy lowercase, strip punctuation, and lemmatize the strings.

In [20]:
df['text'] = normalize(df.abstract)

In [21]:
sample_abs = df.text[32]
sample_abs

"We simulate the evolution of a dust universe from z=1089 to z=0 by numerically integrating the Einstein's equation for a spatially flat Friedmann-Lemaire-Robertson-Walker (FLRW) background spacetime with scalar perturbations which are derived from the matter power spectrum produced with the Code for Anisotropies in the Microwave Background (CAMB). To investigate the effects of primordial gravitational waves (GWs) on the inhomogeneity of the universe, we add an additional decaying, divergenceless and traceless primordial tensor perturbation with its initial amplitude being 3 times 10^-4 to the above metric. We find that this primordial tensor perturbation suppresses the matter power spectrum by about 0.01 % at z=0 for modes with wave number similar to its. This suppression may be a possible probe of a GWs background in the future."

In [22]:
parsed_abs = nlp(sample_abs)

In [23]:
token_text = [token.text for token in parsed_abs]
token_lem = [token.lemma_ for token in parsed_abs]
token_speech = [token.pos_ for token in parsed_abs]
token_ent = [token.ent_type_ for token in parsed_abs]
token_stop = [token.is_stop for token in parsed_abs]
pd.DataFrame(list(zip(token_text, token_lem, token_speech, token_ent, token_stop)), 
             columns=['token','lemma','part of speech','entity type','stopword'])

Unnamed: 0,token,lemma,part of speech,entity type,stopword
0,We,-PRON-,PRON,,False
1,simulate,simulate,VERB,,False
2,the,the,DET,,True
3,evolution,evolution,NOUN,,False
4,of,of,ADP,,True
5,a,a,DET,,True
6,dust,dust,NOUN,,False
7,universe,universe,NOUN,,False
8,from,from,ADP,,True
9,z=1089,z=1089,NOUN,,False


In [24]:
len(list(parsed_abs.sents))

4

In [25]:
for sent in parsed_abs.sents:
    print('')
    print(' '.join([token.lemma_ if token.lemma_ != '-PRON-' else token.text.lower()
                    for token in sent if not remove(token)]))


we simulate evolution dust universe z=1089 z=0 numerically integrate einstein 's equation spatially flat friedmann lemaire robertson walker flrw background spacetime scalar perturbation derive matter power spectrum produce code anisotropies microwave background camb

to investigate effect primordial gravitational wave gws inhomogeneity universe add additional decaying divergenceless traceless primordial tensor perturbation initial amplitude 3 time 10 ^ -4 metric

we find primordial tensor perturbation suppress matter power spectrum 0.01 z=0 mode wave number similar

this suppression possible probe gws background future


In [26]:
#def pipeline(text)

#test_text = df.text[:10]

sentences = []
new_text = []

for doc in nlp.pipe(df.text, batch_size=50):
    assert doc.is_parsed
    sentences.append(len(list(doc.sents)))
    stmt = ' '.join([token.lemma_ if token.lemma_ != '-PRON-' else token.text.lower()
                     for token in doc if not remove(token)])
    new_text.append(stmt)

In [27]:
len(sentences)

49973

In [28]:
len(new_text)

49973

In [29]:
df['sentences'] = sentences

In [30]:
df['text'] = new_text

In [31]:
file = os.path.join('..','data','astro_intermediate.pickle')

with open(file, 'wb') as f:
    pickle.dump(df, f) 

In [32]:
print(nlp.meta['lang'])
print(nlp.meta['pipeline'])

en
['tagger', 'parser', 'ner']


In [33]:
file = os.path.join('..','data','astro_intermediate.csv')

df.to_csv(file)