In [1]:
import glob
import json
import requests
from config import Config
import spotlight
import nltk

# Load data

In [2]:
def input_data(text_file_directory=Config.TEXT_DATA_PATH):
    """
    Reads the text documents contained in text_file_directory
    :param text_file_directory: Path to directory (String)
    :return: List where one element in a word in the text. The text order is kept. (List of String)
    """
    list_of_words = []
    for file_path in glob.glob(text_file_directory):
        with open(file_path, 'r', errors="ignore") as f:
            content = ''.join(f.readlines())
            list_of_words.append(content)
    return list_of_words

In [3]:
def pprint(data):
    print(json.dumps(data, sort_keys=True, indent=4))

In [4]:
data = input_data()

# Entity Mapping (DBpedia Spotlight) 

## Demo of the API

In [5]:
annotations = spotlight.annotate('https://api.dbpedia-spotlight.org/en/annotate?', 'President Obama on Monday will call for a new minimum tax rate for individuals making more than $1 million a year to ensure that they pay at least the same percentage of their earnings as other taxpayers, according to administration officials.', confidence=0.4, support=20)

In [6]:
a = 'President Obama on Monday will call for a new minimum tax rate for individuals making more than $1 million a year to ensure that they pay at least the same percentage of their earnings as other taxpayers, according to administration officials.'
a[54:54+len("tax")]

'tax'

In [7]:
pprint(annotations)

[
    {
        "URI": "http://dbpedia.org/resource/Barack_Obama",
        "offset": 0,
        "percentageOfSecondRank": 0.014585380366391662,
        "similarityScore": 0.985624294762507,
        "support": 25941,
        "surfaceForm": "President Obama",
        "types": "Http://xmlns.com/foaf/0.1/Person,Wikidata:Q82955,Wikidata:Q5,Wikidata:Q30461,Wikidata:Q24229398,Wikidata:Q215627,DUL:NaturalPerson,DUL:Agent,Schema:Person,DBpedia:President,DBpedia:Politician,DBpedia:Person,DBpedia:Agent"
    },
    {
        "URI": "http://dbpedia.org/resource/Tax",
        "offset": 54,
        "percentageOfSecondRank": 0.0010311806078409575,
        "similarityScore": 0.9989115305036729,
        "support": 14066,
        "surfaceForm": "tax",
        "types": ""
    }
]


In [8]:
annotations = spotlight.annotate('https://api.dbpedia-spotlight.org/en/annotate?', data[0], confidence=0.5, support=20)

In [9]:
# pprint(annotations)

## Retrive Unique Resource Identifier (URI)

In [10]:
def retrive_uri(data, url="https://api.dbpedia-spotlight.org/en/annotate?", confidence=0.5, support=20):
    tags = []
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    annotations = spotlight.annotate(url, data, confidence=confidence, support=support)
    for annotation in annotations:
        uri = annotation.get("URI")
        start_idx = annotation.get("offset")
        word = annotation.get("surfaceForm")
        end_idx = start_idx + len(word)
        if annotation.get("types"):
            list_of_types = [elt for elt in annotation["types"].split(',') if "DBpedia" in elt]
            type_ = list_of_types[0]
        else:
            type_ = "ex:" + uri.split("/")[-1]
        tags.append((word, uri, type_, start_idx, end_idx))
    return tags

In [11]:
l = retrive_uri(data[0])

In [63]:
print("\n".join([str(elt) for elt in l]))

('China', 'http://dbpedia.org/resource/China', 'DBpedia:PopulatedPlace', 9, 14)
('VAT', 'http://dbpedia.org/resource/Value-added_tax', 'ex:Value-added_tax', 39, 42)
('awash', 'http://dbpedia.org/resource/Awash_River', 'DBpedia:WorldHeritageSite', 81, 86)
('VAT', 'http://dbpedia.org/resource/Value-added_tax', 'ex:Value-added_tax', 138, 141)
('fertilizer', 'http://dbpedia.org/resource/Fertilizer', 'ex:Fertilizer', 145, 155)
('China', 'http://dbpedia.org/resource/China', 'DBpedia:PopulatedPlace', 165, 170)
('Chinese', 'http://dbpedia.org/resource/China', 'DBpedia:PopulatedPlace', 319, 326)
('FOB', 'http://dbpedia.org/resource/Forward_operating_base', 'ex:Forward_operating_base', 502, 505)
('CFR', 'http://dbpedia.org/resource/Code_of_Federal_Regulations', 'DBpedia:WrittenWork', 873, 876)
('India', 'http://dbpedia.org/resource/India', 'DBpedia:PopulatedPlace', 936, 941)
('Pakistan', 'http://dbpedia.org/resource/Pakistan', 'DBpedia:PopulatedPlace', 945, 953)
('Chinese', 'http://dbpedia.org/r

# Coreference Resolution

`java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000`

In [15]:
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')

In [16]:
def find_coreferences_text(data):
    output = nlp.annotate(data, properties= {'annotators':'tokenize, ssplit, pos, lemma, ner, parse, dcoref','outputFormat':'json','ner.useSUTime':'false', 'timeout': '500000'})
    return output

In [17]:
output = find_coreferences_text(data[0])

In [80]:
def extract_coreferences(ouput):
    entities = []
    for i in output['corefs'].keys():
        if len(output['corefs'][i]) > 2:
            entity = []
            set_entity = set()
            for ref in output['corefs'][i]:
                if ref['text'] not in set_entity:
                    entity.append((ref['text'], ref['startIndex']))
                    set_entity.add(ref['text'])
            entities.append(entity)
    return entities

In [None]:
output

In [81]:
results = extract_coreferences(output)

In [82]:
results[:20]

[[('ANALYSIS China', 1), ('China', 27), ('FOB China', 16)],
 [('lower prices', 8), ('these lower prices', 3), ('The lower prices', 1)],
 [('The market', 10), ('it', 5)],
 [('fertilizer sales in China', 24), ('fertilizer sales', 15)],
 [('non-Indian', 31)],
 [('first', 40)],
 [('August shipment', 11)],
 [('some weaker prices', 16),
  ('prices', 11),
  ('the prices', 29),
  ('our prices', 6)],
 [('non-Indian markets in the $ 450s/mt FOB', 31), ('non-Indian markets', 12)],
 [('the $ 450s/mt FOB', 34)],
 [('August', 43), ('early August', 56)],
 [('last year', 44), ('the year', 18), ('this year', 19)],
 [('18-46-0', 14), ('18-46-0 , i.e. product', 28), ('i.e. product', 30)],
 [('Suppliers looking to place lighter coloured product or product that is not guaranteed 18-46-0',
   1),
  ('suppliers', 8)],
 [('India', 12), ('it', 16)],
 [('Pakistan', 14), ('S. INDIAN SUBCONTINENT Pakistan', 25)],
 [('US', 14), ('United Suppliers', 52), ('US Gulf', 26)],
 [('the US', 13),
  ('the US which with a $

In [30]:
def resolve(corenlp_output):
    """ Transfer the word form of the antecedent to its associated pronominal anaphor(s) """
    for coref in corenlp_output['corefs']:
        mentions = corenlp_output['corefs'][coref]
        antecedent = mentions[0]  # the antecedent is the first mention in the coreference chain
        print(antecedent)
        for j in range(1, len(mentions)):
            mention = mentions[j]
            if mention['type'] == 'PRONOMINAL':
                # get the attributes of the target mention in the corresponding sentence
                target_sentence = mention['sentNum']
                target_token = mention['startIndex'] - 1
                # transfer the antecedent's word form to the appropriate token in the sentence
                corenlp_output['sentences'][target_sentence - 1]['tokens'][target_token]['word'] = antecedent['text']


def print_resolved(corenlp_output):
    """ Print the "resolved" output """
    possessives = ['hers', 'his', 'their', 'theirs']
    for sentence in corenlp_output['sentences']:
        for token in sentence['tokens']:
            output_word = token['word']
            # check lemmas as well as tags for possessive pronouns in case of tagging errors
            if token['lemma'] in possessives or token['pos'] == 'PRP$':
                output_word += "'s"  # add the possessive morpheme
            output_word += token['after']
            print(output_word, end='')

In [16]:
text = "Obama was president of the USA. He was born in Hawaii."

output = nlp.annotate(text, properties= {'annotators':'dcoref','outputFormat':'json','ner.useSUTime':'false'})

# Relation Extractor

In [4]:
import os
from argparse import ArgumentParser
from subprocess import Popen
from sys import argv
from sys import stderr

In [30]:
def stanford_ie(input_filename, path_to_stanford_jar, verbose=True):

    command = f'cd {path_to_stanford_jar}; java -mx2g -cp "*" edu.stanford.nlp.naturalli.OpenIE {input_filename} -format ollie > ../out.txt'

    if verbose:
        print('Executing command = {}'.format(command), verbose)
        # java_process = Popen(command, stdout=stderr, shell=True)
        java_process = Popen(command, shell=True)
    else:
        java_process = Popen(command, stdout=stderr, stderr=open(os.devnull, 'w'), shell=True)
    java_process.wait()
    assert not java_process.returncode, 'ERROR: Call to stanford_ie exited with a non-zero code status.'

    with open("out.txt", 'r') as output_file:
        results_str = "\n".join(output_file.readlines())

    # results = process_entity_relations(results_str, verbose)
    return results_str

In [31]:
STANFORD_IE_FOLDER = "stanford-corenlp-full-2018-02-27/"
JAVA_BIN_PATH = "java"

In [32]:
a = stanford_ie("/Users/adriengalamez/Documents/Research_Assistant/text.txt", "/Users/adriengalamez/Documents/Research_Assistant/stanford-corenlp-full-2018-02-27/")
print(a)

Executing command = cd /Users/adriengalamez/Documents/Research_Assistant/stanford-corenlp-full-2018-02-27/; java -mx2g -cp "*" edu.stanford.nlp.naturalli.OpenIE /Users/adriengalamez/Documents/Research_Assistant/text.txt -format ollie > ../out.txt True
1.000: (Obama; was; president)

1.000: (Obama; was president of; USA)

1.000: (He; was born in; Hawaii)

1.000: (He; was; born)



In [18]:
text = "Obama was president of the USA. He was born in Hawaii."

output = nlp.annotate(text, properties= {'annotators':'tokenize,ssplit,pos,depparse, natlog, openie', 'outputFormat':'json','openie.format': 'ollie'})

In [19]:
output

{'sentences': [{'basicDependencies': [{'dep': 'ROOT',
     'dependent': 3,
     'dependentGloss': 'president',
     'governor': 0,
     'governorGloss': 'ROOT'},
    {'dep': 'nsubj',
     'dependent': 1,
     'dependentGloss': 'Obama',
     'governor': 3,
     'governorGloss': 'president'},
    {'dep': 'cop',
     'dependent': 2,
     'dependentGloss': 'was',
     'governor': 3,
     'governorGloss': 'president'},
    {'dep': 'case',
     'dependent': 4,
     'dependentGloss': 'of',
     'governor': 6,
     'governorGloss': 'USA'},
    {'dep': 'det',
     'dependent': 5,
     'dependentGloss': 'the',
     'governor': 6,
     'governorGloss': 'USA'},
    {'dep': 'nmod',
     'dependent': 6,
     'dependentGloss': 'USA',
     'governor': 3,
     'governorGloss': 'president'},
    {'dep': 'punct',
     'dependent': 7,
     'dependentGloss': '.',
     'governor': 3,
     'governorGloss': 'president'}],
   'enhancedDependencies': [{'dep': 'ROOT',
     'dependent': 3,
     'dependentGloss': 