# Data Preparation

In [1]:
import time
import json
import pandas as pd
from rake_nltk import Rake
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import punkt

In [2]:
# a decorator for calculating function run time
import time
def print_run_time(func):
    """Decorator that prints how long each call to ``func`` takes.

    The wrapped function is called with the arguments it was given and its
    return value is passed through unchanged. After the call returns, one
    line of the form ``Function [<name>] run time is <seconds>s`` is printed.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature and metadata as ``func``.
    """
    # Imported locally so this cell stays self-contained.
    import functools

    # functools.wraps keeps __name__/__doc__ of the decorated function, so
    # introspection and stacked decorators see the real function.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic; time.time() can jump if the system
        # clock is adjusted during a long run.
        start_time = time.perf_counter()
        back = func(*args, **kwargs)
        print('Function [%s] run time is %.2fs' % (func.__name__, time.perf_counter() - start_time))
        return back
    return wrapper

In [3]:
def get_metadata(file_path):
    """Lazily yield the lines of a text file, one at a time.

    This is a generator: the file is opened only when iteration starts and
    is closed when the generator is exhausted or garbage-collected, so the
    whole file is never held in memory.

    Args:
        file_path: Path of the file to read (one record per line).

    Yields:
        Each line of the file as a ``str``, including its trailing newline.
    """
    # Decode explicitly as UTF-8 rather than the platform's locale default,
    # which would mis-decode non-ASCII text on some systems.
    with open(file_path, encoding='utf-8') as f:
        yield from f

In [4]:
def print_variable_names(metadata):
    """Print the top-level keys of the first record in ``metadata``.

    Only the first item is read and parsed. If ``metadata`` is a one-shot
    iterator (such as a generator), that first item is consumed and will not
    be seen by later iterations.

    Args:
        metadata: Iterable of JSON strings, one record per item.

    Returns:
        None. Output goes to stdout.
    """
    first_line = next(iter(metadata), None)
    if first_line is None:
        # Nothing to inspect; without this guard the key loop below would
        # fail on an undefined variable.
        print("No records found: metadata is empty.")
        return
    first_paper = json.loads(first_line)

    print("We have the following variables in each record(one paper):")
    for key in first_paper:
        print(key)

In [5]:
def extract_keywords(r, text, max_keywords=10, min_score=1):
    """Return the top-ranked key phrases of ``text``.

    Args:
        r: An extractor exposing ``extract_keywords_from_text(text)`` and
            ``get_ranked_phrases_with_scores()`` (e.g. a ``rake_nltk.Rake``).
        text: The text to extract phrases from.
        max_keywords: Maximum number of phrases to return.
        min_score: Exclusive lower bound; collection stops at the first
            phrase whose score is not strictly greater than this.

    Returns:
        A list of phrase strings in the order the extractor ranked them.
    """
    r.extract_keywords_from_text(text)
    keywords = []
    # Stopping at the first low score (rather than skipping it) relies on the
    # extractor returning phrases sorted by descending score -- presumably
    # true for rake_nltk; verify if a different extractor is passed in.
    for i, (score, keyword) in enumerate(r.get_ranked_phrases_with_scores()):
        if i >= max_keywords or score <= min_score:
            break
        keywords.append(keyword)
    return keywords

In [20]:
@print_run_time
def generate_data_json(metadata):
    """Build column-wise title/abstract/keywords lists from paper records.

    Each item of ``metadata`` is parsed as one JSON object that must contain
    ``'title'`` and ``'abstract'`` string fields. Newlines inside both fields
    are replaced with spaces, and keywords are extracted from the cleaned
    abstract with ``extract_keywords``.

    Args:
        metadata: Iterable of JSON strings, one paper per item. Blank items
            are skipped.

    Returns:
        A dict with keys ``'title'``, ``'abstract'`` and ``'keywords'``, each
        mapping to a list with one entry per paper, in input order.

    Raises:
        json.JSONDecodeError: If a non-blank item is not valid JSON.
        KeyError: If a record lacks ``'title'`` or ``'abstract'``.
    """
    titles = []
    abstracts = []
    keywords = []
    # One extractor is reused for every abstract; max_length=2 is passed to
    # Rake to cap phrase length (see the rake_nltk documentation).
    r = Rake(max_length=2)
    for line in metadata:
        # Tolerate blank lines (e.g. a trailing newline at end of file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue
        paper = json.loads(line)
        # replace \n with space
        titles.append(paper['title'].replace('\n', ' '))
        # get abstract and its corresponding keywords
        abstract = paper['abstract'].replace('\n', ' ')
        abstracts.append(abstract)
        keywords.append(extract_keywords(r, abstract))
    total_items = len(titles)
    print(f'Total number of items is: {total_items}')

    return {
        'title': titles,
        'abstract': abstracts,
        'keywords': keywords,
    }

In [21]:
# get_metadata is a generator function, so nothing is read from disk here;
# the file is streamed line by line while generate_data_json iterates over it.
metadata = get_metadata('../data/arxiv-metadata-oai-snapshot.json')
print('metadata loaded ready')
# Parse every record and collect titles, abstracts and extracted keywords.
dataset = generate_data_json(metadata)

metadata loaded ready
Total number of items is: 1796911
Function [generate_data_json] run time is 728.74s


In [22]:
# One row per paper; columns come from the dict keys returned by
# generate_data_json: 'title', 'abstract' and 'keywords'.
df = pd.DataFrame(dataset)

In [27]:
# Spot-check: show the abstract of the third row (positional index 2).
df.iloc[2]['abstract']

"  The evolution of Earth-Moon system is described by the dark matter field fluid model proposed in the Meeting of Division of Particle and Field 2004, American Physical Society. The current behavior of the Earth-Moon system agrees with this model very well and the general pattern of the evolution of the Moon-Earth system described by this model agrees with geological and fossil evidence. The closest distance of the Moon to Earth was about 259000 km at 4.5 billion years ago, which is far beyond the Roche's limit. The result suggests that the tidal friction may not be the primary cause for the evolution of the Earth-Moon system. The average dark matter field fluid constant derived from Earth-Moon system data is 4.39 x 10^(-22) s^(-1)m^(-1). This model predicts that the Mars's rotation is also slowing with the angular acceleration rate about -4.38 x 10^(-22) rad s^(-2). "

In [31]:
# Experiment: run RAKE with max_length=1 on a single abstract and display the
# ranked phrases together with their scores.
text = """The evolution of Earth-Moon system is described by the dark matter field fluid model proposed in the Meeting of Division of Particle and Field 2004, American Physical Society. The current behavior of the Earth-Moon system agrees with this model very well and the general pattern of the evolution of the Moon-Earth system described by this model agrees with geological and fossil evidence. The closest distance of the Moon to Earth was about 259000 km at 4.5 billion years ago, which is far beyond the Roche's limit. The result suggests that the tidal friction may not be the primary cause for the evolution of the Earth-Moon system. The average dark matter field fluid constant derived from Earth-Moon system data is 4.39 x 10^(-22) s^(-1)m^(-1). This model predicts that the Mars's rotation is also slowing with the angular acceleration rate about -4.38 x 10^(-22) rad s^(-2)."""
r = Rake(max_length=1)
r.extract_keywords_from_text(text)
r.get_ranked_phrases_with_scores()

[(1.0, 'well'),
 (1.0, 'rotation'),
 (1.0, 'roche'),
 (1.0, 'rad'),
 (1.0, 'particle'),
 (1.0, 'moon'),
 (1.0, 'model'),
 (1.0, 'meeting'),
 (1.0, 'mars'),
 (1.0, 'limit'),
 (1.0, 'geological'),
 (1.0, 'evolution'),
 (1.0, 'earth'),
 (1.0, 'division'),
 (1.0, 'described'),
 (1.0, '4')]

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. A fixed random_state makes the
# shuffle deterministic, so re-running the notebook reproduces the same
# train/test files.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [33]:

# Write each column of the training split to its own file.
# The DataFrame built from generate_data_json has a 'keywords' column
# (plural), so looking up 'keyword' would raise KeyError. The column name is
# mapped to the original output file stem so file names stay unchanged.
for column, stem in [('title', 'title'), ('abstract', 'abstract'), ('keywords', 'keyword')]:
    train[column].to_csv('../data/train/%s.txt' % stem, index=False)

In [34]:
# Write each column of the test split to its own file.
# The DataFrame built from generate_data_json has a 'keywords' column
# (plural), so looking up 'keyword' would raise KeyError. The column name is
# mapped to the original output file stem so file names stay unchanged.
for column, stem in [('title', 'title'), ('abstract', 'abstract'), ('keywords', 'keyword')]:
    test[column].to_csv('../data/test/%s.txt' % stem, index=False)