In [1]:
import json
import re
from collections import defaultdict

In [2]:
DATA_FILE = 'arxiv-metadata-oai-snapshot.json'
# Upper bound on the number of abstracts loaded from the snapshot.
# (The previous `count > 100000` check let one extra record through.)
MAX_ABSTRACTS = 100000

# Matches LaTeX math so it can be stripped from the abstract text:
#   $$...$$   display math
#   $...$     inline math, plus a directly attached hyphen ("$n$-dimensional")
# The inline branch uses [^$]* so a match can never run past its closing "$"
# into a later formula and swallow the prose in between. The previous pattern
# left the display-math dollars unescaped, so they acted as end-of-string
# anchors and "$$...$$" was never removed.
LATEX_MATH_RE = re.compile(r"\$\$.*?\$\$|\$[^$]*\$-?")

abstracts = []
# Iterate the file lazily (one JSON record per line) instead of readlines(),
# which would hold the whole snapshot in memory; `with` closes the handle even
# if a record fails to parse.
with open(DATA_FILE, 'r', encoding='utf-8') as file:
    for line in file:
        if len(abstracts) >= MAX_ABSTRACTS:
            break
        paper = json.loads(line)
        # Flatten the hard-wrapped abstract onto a single line.
        # NOTE(review): '\n-' removes a newline that is followed by a hyphen;
        # if the intent was to re-join words hyphenated at a line end ('-\n'),
        # this needs a second look.
        abstract = paper['abstract'].strip().replace('\n-', '').replace('\n', ' ')
        # Case is preserved on purpose: the analysis loop lower-cases its own copy.
        abstracts.append(LATEX_MATH_RE.sub("", abstract))

In [12]:
# Hand-picked abstracts per search term, recorded as "index, keywords"
# (the number is presumably the position in `abstracts` -- confirm).
# partial sum: these abstracts are hard to follow even for humans once the math formulas are stripped
# 2010, hadron physics (quark-mass dependence)
# 3415, Catalan numbers, minimal codimensional case, combinatorics, path
# 10395, chaotic behavior ...
# 22936, Taylor series
# tree
# 1, decomposition, graph, algorithm
# 397, bit-comparison strategy, depth, asymptotics, renewals, distributional fluctuations ...
# 917, Bayesian network, conditional independence model ...
# 944, binary ...
# nn
# 91565, PDF, interactive, SOMs
# 94095, seismic data filtering
abstracts[944]

"We study fragmentation trees of Gibbs type. In the binary case, we identify the most general Gibbs-type fragmentation tree with Aldous' beta-splitting model, which has an extended parameter range  with respect to the  probability distributions on which it is based. In the multifurcating case, we show that Gibbs fragmentation trees are associated with the two-parameter Poisson--Dirichlet models for exchangeable random partitions of , with an extended parameter range ,  and , , ."

In [4]:
import spacy
from spacy.matcher import DependencyMatcher

# spaCy pipeline used throughout: its vocab backs the DependencyMatcher and it parses each abstract.
nlp = spacy.load("en_core_web_sm")

In [33]:
# Toy sentence for inspecting the dependency parse by eye (rendered in the next cell).
doc = nlp('this is a good PRODUCT of scott that looks pretty, and my INTEREST toward light is gorgeous.')

In [34]:
# Draw the dependency tree of `doc`; useful for checking which DEP/POS labels the patterns below should target.
spacy.displacy.render(doc, style='dep')

In [5]:
def _get_right_context_pattern(word, attr):
    right_attrs = {"LOWER": word}
    if attr:
        if type(attr) is str:
            attr = eval(attr)
        if type(attr) is dict:
            right_attrs = attr
    pattern1 = [
        {
            "RIGHT_ID": "target",
            "RIGHT_ATTRS": right_attrs
        },
        {
            "LEFT_ID": "target",
            "REL_OP": ">",
            "RIGHT_ID": "is",
            "RIGHT_ATTRS": {"DEP": {"IN": ["nsubj", "relcl"]}},
        },
        {
            "LEFT_ID": "is",
            "REL_OP": ">",
            "RIGHT_ID": "adj",
            "RIGHT_ATTRS": {"DEP": "acomp"},
        }
    ]
    pattern2 = [
        {
            "RIGHT_ID": "target",
            "RIGHT_ATTRS": right_attrs
        },
        {
            "LEFT_ID": "target",
            "REL_OP": ">",
            "RIGHT_ID": "prep",
            "RIGHT_ATTRS": {"DEP": {"IN": ["nsubj", "prep"]}},
        },
        {
            "LEFT_ID": "prep",
            "REL_OP": ">",
            "RIGHT_ID": "n",
            "RIGHT_ATTRS": {"POS": "NOUN"},
        }
    ]
    return [pattern1, pattern2]

In [6]:
def _get_left_context_pattern(word, attr):
    right_attrs = {"LOWER": word}
    if attr:
        if type(attr) is str:
            attr = eval(attr)
        if type(attr) is dict:
            right_attrs = attr
    pattern1 = [
        {
            "RIGHT_ID": "anchor_founded",
            "RIGHT_ATTRS": right_attrs
        },
        {
            "LEFT_ID": "anchor_founded",
            "REL_OP": "<",
            "RIGHT_ID": "founded_clue_right",
            "RIGHT_ATTRS": {"POS": "VERB"},
        },
        {
            "LEFT_ID": "founded_clue_right",
            "REL_OP": ">",
            "RIGHT_ID": "founded_verb",
            "RIGHT_ATTRS": {"POS": {"IN": ["NOUN", "ADJ", "ADV"]}},
        }
    ]
    return [pattern1]

In [7]:
def _get_compound_pattern(word, attr):
    right_attrs = {"LOWER": word}
    if attr:
        if type(attr) is str:
            attr = eval(attr)
        if type(attr) is dict:
            right_attrs = attr
    pattern1 = [
        {
            "RIGHT_ID": "anchor_founded",
            "RIGHT_ATTRS": right_attrs
        },
        {
            "LEFT_ID": "anchor_founded",
            "REL_OP": ">",
            "RIGHT_ID": "founded_compound",
            "RIGHT_ATTRS": {"DEP": {"IN": ["amod", "compound"]}},
        }
    ]
    return [pattern1]

In [8]:
def get_matcher(word, attr=''):
    """Create a DependencyMatcher with the three context pattern groups for `word`.

    Args:
        word: Target word forwarded to each pattern builder.
        attr: Optional anchor-attribute override forwarded to each pattern builder.

    Returns:
        A ``DependencyMatcher`` over ``nlp.vocab`` whose match ids are
        1 (left context), 2 (right context) and 3 (amod/compound modifiers).
    """
    pattern_builders = (
        (1, _get_left_context_pattern),
        (2, _get_right_context_pattern),
        (3, _get_compound_pattern),
    )
    dep_matcher = DependencyMatcher(nlp.vocab)
    for match_key, build_patterns in pattern_builders:
        dep_matcher.add(match_key, build_patterns(word, attr))
    return dep_matcher

In [9]:
def summarize_result(result, coef1=1, coef2=.5, coef3=.5, pos_weights=None):
    """Collapse per-pattern, per-POS match counts into one weighted score per word.

    Args:
        result: Mapping ``match_id -> pos -> word -> frequency``. The match ids
            read are 3, 1 and 2 (the ids registered by ``get_matcher``). A
            missing id is treated as "no matches" and the input is never modified.
        coef1: Weight applied to counts under match id 3.
        coef2: Weight applied to counts under match id 1.
        coef3: Weight applied to counts under match id 2.
        pos_weights: Optional mapping ``pos -> multiplier``; when a POS tag is
            present, its multiplier scales the group coefficient.

    Returns:
        ``defaultdict(int)`` mapping each word to the sum of
        ``frequency * coefficient * pos multiplier`` over all groups.
    """
    scores = defaultdict(int)

    # Accumulation order (3, 1, 2) matches the original code so float sums come out identical.
    for match_id, coef in ((3, coef1), (1, coef2), (2, coef3)):
        # .get() avoids a KeyError on plain dicts and avoids inserting empty
        # entries into a defaultdict input.
        for pos, word_freqs in result.get(match_id, {}).items():
            pos_weight = coef
            if pos_weights and pos in pos_weights:
                pos_weight *= pos_weights[pos]
            for word, freq in word_freqs.items():
                scores[word] += freq * pos_weight

    return scores

def normalize(result):
    """Scale scores so that they sum to 1.

    Args:
        result: Mapping ``word -> score``. It is copied, not modified.

    Returns:
        A plain ``dict`` mapping each word to ``score / total``. When the total
        is zero (empty input or all-zero scores) there is nothing to scale, so
        the copied scores are returned unchanged and no ZeroDivisionError is raised.
    """
    scores = dict(result)
    total_score = sum(scores.values())
    if not total_score:
        return scores
    return {word: score / total_score for word, score in scores.items()}

In [32]:
def compare_result(global_context, local_context, threshold=1.5):
    """Find words that score disproportionately high in a local context.

    Args:
        global_context: Mapping ``word -> score`` used as the baseline. It must
            contain every word of ``local_context`` with a non-zero score.
        local_context: Mapping ``word -> score`` for the subset being examined.
        threshold: Minimum ``local / global`` ratio for a word to be kept.

    Returns:
        ``dict`` mapping each kept word to its ratio, in ``local_context`` order.
    """
    ratios = (
        (word, local_score / global_context[word])
        for word, local_score in local_context.items()
    )
    return {word: ratio for word, ratio in ratios if ratio >= threshold}

In [77]:
def collect_context(word, topic=None, corpus=None, progress_every=10000):
    """Score the words that stand in a matched dependency relation to `word`.

    Args:
        word: Target word. An abstract is parsed only if its lower-cased text
            contains `word` as a substring.
        topic: Optional second substring that the lower-cased abstract must
            also contain (e.g. ``'medic'``). ``None`` disables this filter.
        corpus: Iterable of abstract strings. Defaults to the notebook-level
            ``abstracts`` list.
        progress_every: Print the running abstract count every this many
            abstracts; a falsy value silences the output.

    Returns:
        ``dict`` mapping each context lemma to its normalized, weighted score
        (the output of ``normalize(summarize_result(...))``).
    """
    if corpus is None:
        corpus = abstracts
    matcher = get_matcher(word)
    # match id -> POS tag -> lower-cased lemma -> number of matches
    counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    for seen, abstract in enumerate(corpus, start=1):
        if progress_every and seen % progress_every == 0:
            print(seen)

        # Cheap substring pre-filter so the parser only runs on relevant abstracts.
        abstract_lower = abstract.lower()
        if word not in abstract_lower:
            continue
        if topic is not None and topic not in abstract_lower:
            continue

        doc = nlp(abstract)
        for match_id, token_ids in matcher(doc):
            # The last matched token is taken as the context word (every pattern
            # builder lists its context node last).
            context_word = doc[token_ids[-1]]
            if not context_word.is_alpha:
                continue
            context_lemma_word = context_word.lemma_.lower()
            if context_lemma_word == word:
                continue
            counts[match_id][context_word.pos_][context_lemma_word] += 1

    return normalize(summarize_result(counts))


# Context of 'data' restricted to abstracts that mention 'medic...'.
# The same call with another `word` / `topic` yields the other *_context
# variables compared in the cells below.
# NOTE(review): global_context is presumably this procedure without the topic
# filter -- confirm before relying on it.
word = 'data'
med_context = collect_context(word, topic='medic')
result = med_context

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000


In [79]:
# Target word: 'data' (the `word` used when med_context was built).
# Lists context words whose score in med_context is at least 10x their score in global_context.
# NOTE(review): global_context is not assigned in any cell visible here; it must already exist in the kernel.
print('medical')
print(compare_result(global_context, med_context, threshold=10))

medical
{'patient': 48.800707547169814, 'network': 23.6609491137793, 'outcome': 97.60141509433963, 'incidence': 65.06761006289307, 'disability': 195.20283018867926, 'prostate': 195.20283018867926, 'cancer': 97.60141509433963, 'snp': 97.60141509433961, 'term': 39.04056603773585, 'eeg': 130.13522012578613, 'cost': 111.54447439353099, 'volume': 97.60141509433963, 'symptom': 195.20283018867923, 'medical': 195.20283018867926, 'cellular': 195.20283018867926, 'incomplete': 26.92452830188679, 'dimensional': 22.698003510311544, 'biased': 97.60141509433963, 'pixel': 97.60141509433963, 'longitudinal': 35.49142367066895, 'genetic': 27.886118598382748, 'primary': 65.06761006289307, 'epidemiologic': 65.06761006289307, 'actual': 10.844601677148846, 'correlated': 65.06761006289307, 'interictal': 195.20283018867926, 'ictal': 195.20283018867926, 'biomedical': 195.20283018867926, 'fraudulent': 195.20283018867926, 'biological': 13.943059299191374, 'verbal': 195.20283018867926, 'autopsy': 195.2028301886792

In [66]:
# Target word: 'transport' (presumably -- confirm); 'public' is the printed label for this comparison.
# Lists context words whose score in public_context is at least 2x their score in global_context.
# NOTE(review): neither global_context nor public_context is assigned in any cell visible here.
print('public')
print(compare_result(global_context, public_context, threshold=2))

public
{'public': 260.3, 'efficient': 32.5375, 'network': 28.922222222222224, 'receiver': 130.15}


In [60]:
# Target word: 'index' (presumably -- confirm); 'data' and 'optimize' are the printed labels.
# Each print lists context words scoring at least 2x higher in the named context than in global_context.
# NOTE(review): global_context, data_context and optimiz_context are not assigned in any cell visible here.
print('data')
print(compare_result(global_context, data_context, threshold=2))
print('optimize')
print(compare_result(global_context, optimiz_context, threshold=2))

data
{'activity': 2.2942602616147947, 'distribution': 2.4581359945872805, 'structure': 2.6220117275597654, 'variability': 3.933017591339649, 'aa': 3.933017591339649, 'view': 3.933017591339649, 'rate': 2.949763193504736, 'g': 3.933017591339648, 'submm': 3.933017591339649, 'opacity': 3.933017591339649, 'development': 3.933017591339649, 'knowledge': 3.933017591339649, 'consistency': 3.933017591339649, 'slope': 3.933017591339649, 'growth': 2.145282322548899, 'frustration': 3.933017591339649, 'line': 2.6220117275597654, 'balnicity': 2.949763193504736, 'sensitivity': 2.359810554803789, 'site': 3.933017591339649, 'modulation': 2.6220117275597654, 'r': 3.933017591339649, 'asymmetry': 3.933017591339649, 'powerlaw': 3.933017591339649, 'value': 2.528368451575488, 'generality': 3.933017591339649, 'turbulence': 3.933017591339648, 'size': 2.6220117275597654, 'mass': 2.2942602616147947, 'emission': 2.4581359945872805, 'age': 3.933017591339649, 'iron': 3.933017591339649, 'happiness': 3.933017591339649

In [46]:
# Target word: 'robot' (presumably -- confirm); 'motion' is the printed label for this comparison.
# Lists context words whose score in motion_context is at least 2x their score in global_context.
# NOTE(review): neither global_context nor motion_context is assigned in any cell visible here.
print('motion')
print(compare_result(global_context, motion_context, threshold=2))

motion
{'parallel': 2.879032258064516, 'legged': 3.838709677419355, 'spherical': 3.838709677419355, 'cubic': 3.838709677419355, 'close': 3.838709677419355, 'develop': 3.838709677419355, 'universal': 3.838709677419355, 'continuously': 3.838709677419355, 'production': 3.8387096774193545, 'access': 3.838709677419355, 'joint': 3.838709677419355, 'motion': 3.838709677419355}


In [39]:
# Target word: 'algorithm' (presumably -- confirm); 'tree' and 'optimize' are the printed labels.
# Each print lists context words scoring at least 2x higher in the named context than in global_context.
# NOTE(review): global_context, tree_context and optimiz_context are not assigned in any cell visible here;
# optimiz_context is also printed by another cell, so its content depends on which cell last built it.
print('tree')
print(compare_result(global_context, tree_context, threshold=2))
print('optimize')
print(compare_result(global_context, optimiz_context, threshold=2))

tree
{'simple': 2.0250290925046257, 'second': 2.594434735851593, 'fundamental': 6.1978163134232505, 'modified': 4.648362235067437, 'hybrid': 5.070940620073568, 'optimal': 2.383775505162788, 'empirical': 18.59344894026975, 'powerful': 6.1978163134232505, 'exact': 2.0659387711410835, 'gedanken': 18.59344894026975, 'markovian': 9.296724470134874, 'additive': 18.59344894026975, 'competitive': 4.8504649409399345, 'constant': 7.4373795761079, 'polynomial': 2.4791265253693, 'generic': 3.0989081567116252, 'constructive': 4.648362235067437, 'treelet': 18.59344894026975, 'classic': 4.648362235067437, 'primary': 18.59344894026975, 'zvtop': 18.59344894026975, 'different': 9.296724470134874, 'chain': 6.562393743624617, 'ms': 18.59344894026975, 'knuth': 18.59344894026975, 'mst': 18.59344894026975, 'ctw': 18.59344894026975, 'mcmc': 5.312413982934214, 'joining': 18.59344894026975, 'fock': 18.59344894026975, 'bp': 7.4373795761079, 'fpt': 18.59344894026975, 'rhmc': 4.648362235067437, 'gem': 18.593448940

In [31]:
# Target word: 'model' (presumably -- confirm); 'structure' and 'ml' are the printed labels.
# Each print lists context words scoring at least 2x higher in the named context than in global_context.
# NOTE(review): global_context, structure_context and ml_context are not assigned in any cell visible here.
print('structure')
print(compare_result(global_context, structure_context, threshold=2))
print('ml')
print(compare_result(global_context, ml_context, threshold=2))

structure
{'spatial': 2.705366847826087, 'qualitative': 2.2544723731884058, 'traditional': 2.898607336956522, 'rigorous': 2.2544723731884058, 'hernquist': 4.5089447463768115, 'monolithic': 5.072562839673913, 'icosahedral': 6.763417119565218, 'quasicrystal': 6.763417119565218, 'energetic': 2.705366847826087, 'secondary': 6.763417119565218, 'tight': 3.9784806585677748, 'nonadiabatic': 6.763417119565218, 'similar': 2.363121644185438, 'closed': 2.0810514214046822, 'strong': 2.705366847826087, 'variational': 3.381708559782609, 'thin': 2.2544723731884058, 'photochemical': 2.2544723731884058, 'layer': 2.618096949509116, 'cold': 3.9453266530797104, 'ionic': 2.705366847826087, 'special': 2.898607336956522, 'unstable': 4.0580502717391305, 'newtonian': 3.381708559782609, 'logical': 4.0580502717391305, 'structured': 6.763417119565218, 'radiative': 2.7849364609974425, 'itinerant': 4.83101222826087, 'full': 2.3999222037166903, 'evolutionary': 2.1642934782608694, 'fuzzy': 2.5362814198369565, 'studied