In [1]:
import json
import logging
import multiprocessing
import os
import re
from time import time

import gensim
import pandas as pd
import spacy
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [6]:
# Output corpus: the preprocessing cell writes the cleaned sentences here.
SENTENCES_FILE = './sentences.txt'
# Input: arXiv metadata snapshot, read one JSON record per line.
DATA_FILE = './arxiv-metadata-oai-snapshot.json'

# Load the small English spaCy pipeline without the dependency parser, then
# switch on the "senter" component so that `doc.sents` can still be used.
# NOTE(review): assumes "senter" ships disabled in en_core_web_sm — confirm
# against the installed spaCy model version.
nlp = spacy.load("en_core_web_sm", exclude=["parser"])
nlp.enable_pipe("senter")

In [3]:
MIN_SENT_LEN = 5  # sentences with fewer kept tokens than this are dropped


def cleaning(sent, l):
    """Normalise one sentence and append it to ``l`` if it is long enough.

    Stop words and non-alphabetic tokens are removed and the remaining
    tokens are replaced by their lower-cased lemmas. Multi-word keywords
    returned by ``find_ngram`` are then joined with underscores, e.g.
    'data cleaning' -> 'data_cleaning'.

    ``find_ngram`` is defined in a later cell of this notebook, so that cell
    must be executed before this function is called.

    Args:
        sent: iterable of tokens exposing ``lemma_``, ``is_stop`` and
            ``is_alpha`` (the caller passes sentences from ``doc.sents``).
        l: list that receives the cleaned sentence as a single
            newline-terminated string. Nothing is appended when fewer than
            ``MIN_SENT_LEN`` tokens survive the filtering.

    Returns:
        None. The result is delivered by mutating ``l``.
    """
    txt = [token.lemma_.lower()
           for token in sent if not token.is_stop and token.is_alpha]
    if len(txt) < MIN_SENT_LEN:
        return

    out = ' '.join(txt) + '\n'
    # find_ngram returns longer terms first, so a long phrase is tagged before
    # any shorter phrase it contains.
    for term in find_ngram(txt):
        tagged = term.replace(' ', '_')
        # Match the phrase only when it is delimited by whitespace (or the
        # string edges). A plain str.replace would also rewrite partial words,
        # e.g. the keyword 'data cleaning' inside 'metadata cleaning'.
        pattern = r'(?<!\S)' + re.escape(term) + r'(?!\S)'
        # A callable replacement keeps `tagged` literal (no escape processing).
        out = re.sub(pattern, lambda _match: tagged, out)
    l.append(out)

In [7]:
# Stream the metadata file, split every abstract into sentences, clean them
# with `cleaning`, and write the surviving sentences to SENTENCES_FILE.
count = 0        # input records consumed so far (skipped + processed)
start = 1230001  # number of leading records to skip before processing begins
# NOTE(review): the output file is opened in 'w+' mode, which truncates it.
# If `start` is meant to resume an earlier run, confirm that overwriting the
# previously written sentences is intended.

t = time()
# `with` closes both files even when a record fails to parse. Iterating the
# file object reads one line at a time instead of materialising the whole
# snapshot in memory the way readlines() does. The encoding is explicit so the
# result does not depend on the platform's locale default
# (assumes the snapshot is UTF-8 — TODO confirm).
with open(DATA_FILE, 'r', encoding='utf-8') as in_file, \
        open(SENTENCES_FILE, 'w+', encoding='utf-8', buffering=10000000) as out_file:
    for line in in_file:
        if count < start:
            count += 1
            if count == start:
                print('start to process')
            continue

        paper = json.loads(line)
        # Drop hyphen + newline pairs first (re-joining words split across
        # lines), then turn the remaining newlines into spaces.
        abstract = paper['abstract'].strip().replace('-\n', '').replace('\n', ' ')

        doc = nlp(abstract)
        txt = []
        for sent in doc.sents:
            cleaning(sent, txt)
        if txt:
            out_file.writelines(txt)

        count += 1
        if count % 50000 == 0:
            print('finished %d in %f min' % (count, round((time() - t) / 60, 3)))

print('finished in {} min'.format(round((time() - t) / 60, 2)))

start to process
finished 1250000 in 6.076000 min
finished 1300000 in 20.210000 min
finished 1350000 in 34.547000 min
finished 1400000 in 48.820000 min
finished 1450000 in 63.108000 min
finished 1500000 in 77.275000 min
finished 1550000 in 92.255000 min
finished 1600000 in 103.979000 min
finished 1650000 in 115.073000 min
finished 1700000 in 125.242000 min
finished 1750000 in 135.275000 min
finished 1800000 in 145.037000 min
finished 1850000 in 154.648000 min
finished in 164.54 min


In [2]:
# Hand LineSentence the corpus path rather than an open file object: it then
# opens and closes the file itself on each pass over the corpus, so no file
# handle is left dangling in the kernel.
sentences = LineSentence(SENTENCES_FILE)

In [3]:
cores = multiprocessing.cpu_count()
# Leave two cores free for the rest of the system, but never go below one
# worker: `cores - 2` is 0 or negative on machines with two cores or fewer.
w2v_model = Word2Vec(min_count=1, vector_size=128, negative=5,
                     workers=max(1, cores - 2))

In [None]:
def print_time(t):
    """Print how many minutes have passed since timestamp ``t``.

    Args:
        t: start time in seconds, as returned by ``time()``.

    The elapsed time is rounded to three decimal places.
    """
    elapsed_minutes = (time() - t) / 60
    message = 'finished in {} mins'.format(round(elapsed_minutes, 3))
    print(message)

In [4]:
t = time()
# Build the model vocabulary from the sentence corpus before training.
# NOTE(review): progress_per is presumably the number of sentences between
# progress log messages — confirm in the gensim documentation.
w2v_model.build_vocab(sentences, progress_per=1500000)

print_time(t)

finished in 0.924 mins


In [5]:
t = time()

# Train the model for 60 epochs over the sentence corpus.
# total_examples is read from w2v_model.corpus_count (presumably populated by
# the build_vocab call, so that cell has to run first — confirm).
# NOTE(review): report_delay looks like the number of seconds between progress
# reports — confirm in the gensim documentation.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, 
                epochs=60, report_delay=60)

print_time(t)

finished training in 108.43 mins


In [28]:
# Persist the trained model. The target directory is created first so the
# save cannot fail on a missing folder after a long training run.
MODEL_PATH = 'word2vec_model/word2vec.model'
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
w2v_model.save(MODEL_PATH)

In [27]:
# Sanity check on the trained vectors: show the words closest to 'happy'.
# Alternative pairwise check, kept for reference:
#   w2v_model.wv.similarity('optimal', 'optimization')
w2v_model.wv.most_similar('happy')

[('disgusted', 0.47324278950691223),
 ('disgust', 0.47088298201560974),
 ('unhappy', 0.46743878722190857),
 ('sadness', 0.4506657123565674),
 ('event_source_code', 0.4493028521537781),
 ('bryn', 0.4440755546092987),
 ('dissatisfied', 0.4411503076553345),
 ('brainy', 0.4173509478569031),
 ('angry', 0.41401079297065735),
 ('emotion', 0.4129962623119354)]

In [4]:
KEYWORD_FILE = 'computer_science_keywords.csv'

# Read every column as text so the count column always holds strings (or NaN
# for missing cells). Without dtype=str pandas may infer a numeric dtype, and
# the per-value `.isdigit()` check would raise AttributeError.
keyword_table = (
    pd.read_csv(KEYWORD_FILE, header=None, index_col=False, dtype=str)
    .drop(columns=[2, 3, 4, 5])               # only the first two columns are used
    .rename(columns={0: 'terms', 1: 'num'})
)

# Map each keyword to its count; counts that are missing or not purely
# numeric are stored as 0. str(term) turns a missing term into 'nan'.
keyword_freq = {
    str(term): int(num) if isinstance(num, str) and num.isdigit() else 0
    for term, num in zip(keyword_table['terms'], keyword_table['num'])
}

# The table is only needed to build the lookup; release it.
del keyword_table

In [5]:
def ngram(input_list, ngram_num):
    '''Return every run of ``ngram_num`` consecutive words as one string.

    Args:
        input_list: list of word strings.
        ngram_num: number of words per n-gram.

    Returns:
        A list of space-joined n-grams in order of appearance; empty when
        ``input_list`` holds fewer than ``ngram_num`` words.
    '''
    if len(input_list) < ngram_num:
        return []
    # Zipping the list against progressively shifted copies of itself yields
    # one tuple per sliding window of ngram_num words.
    shifted = [input_list[offset:] for offset in range(ngram_num)]
    return [' '.join(window) for window in zip(*shifted)]

def find_ngram(nouns):
    """Find the known multi-word keywords that occur in a list of words.

    Every 2-, 3- and 4-word n-gram of ``nouns`` is looked up in the
    module-level ``keyword_freq`` mapping.

    Args:
        nouns: list of word strings.

    Returns:
        The distinct matching terms, longest (by character count) first.
        Terms of equal length are ordered alphabetically so the result is
        deterministic; ordering them by set iteration would vary between
        interpreter runs because of string hash randomisation.
    """
    found = {
        term
        for size in (4, 3, 2)
        for term in ngram(nouns, size)
        if term in keyword_freq
    }
    return sorted(found, key=lambda term: (-len(term), term))