# StanfordCoreNLP parser
##### Currently using the parser only, add NER later
git - https://github.com/smilli/py-corenlp
<br>
how to run local web server - https://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
<br>
on output formats - https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
<br><br>
Run in cmd to start server
<br>
cd C:\stanford-corenlp-full-2017-06-09
<br>
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000


API server setup

In [1]:
from pycorenlp import StanfordCoreNLP
import re

# Create the CoreNLP client, pointing it at the local server on port 9000
nlp = StanfordCoreNLP('http://localhost:9000')

# POS-tag a sample sentence; outputFormat also accepts json, xml
output = nlp.annotate(
    "Bears bear with other bears.",
    properties=dict(annotators='pos', outputFormat='text'),
)
print(output)

Sentence #1 (6 tokens):
Bears bear with other bears.
[Text=Bears CharacterOffsetBegin=0 CharacterOffsetEnd=5 PartOfSpeech=NNS]
[Text=bear CharacterOffsetBegin=6 CharacterOffsetEnd=10 PartOfSpeech=VBP]
[Text=with CharacterOffsetBegin=11 CharacterOffsetEnd=15 PartOfSpeech=IN]
[Text=other CharacterOffsetBegin=16 CharacterOffsetEnd=21 PartOfSpeech=JJ]
[Text=bears CharacterOffsetBegin=22 CharacterOffsetEnd=27 PartOfSpeech=NNS]
[Text=. CharacterOffsetBegin=27 CharacterOffsetEnd=28 PartOfSpeech=.]



# Pre-process function

Raw text preprocessor

In [2]:
from nltk.stem import WordNetLemmatizer
import re

def preprocess(file_dir):
    """
    Reads a UTF-8 text file and returns its cleaned contents

    Removes the title (text wrapped in %&% ... %&% on a single line)
    Removes the special chars ! @ # $ % ^ & * ( ) , : " and the <p> tags
    Normalizes runs of 2 or more whitespace chars to one space

    Params
    file_dir: path of the text file to read

    Returns the cleaned text as one string
    """

    title = re.compile(r"%&%.*%&%")
    special_chars = re.compile(r"[!@#$%^&*(),:\"]")
    parag_tag = re.compile("<p>")
    multi_space = re.compile(r"\s{2,}")

    # Context manager closes the file handle even if reading fails
    with open(file_dir, 'r', encoding = 'utf-8') as f:
        text = f.read()

    # Title goes first: its %&% markers are themselves special chars
    text = title.sub("", text)
    text = special_chars.sub("", text)
    text = parag_tag.sub("", text)
    text = multi_space.sub(" ", text) # Normalize 2 or more whitespace chars to 1 space

    return text

In [3]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """
    Converts a treebank tag to the WordNet tag expected by the WordNet lemmatizer
    J* -> wordnet.ADJ, V* -> wordnet.VERB, N* -> wordnet.NOUN
    Returns -1 for every other tag (there is no ADV mapping, so R* tags give -1 too)
    """
    # (tag prefix, name of the wordnet constant), checked in this order
    prefix_to_constant = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'))

    for prefix, constant in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            # Looked up only on a match, so unmapped tags never touch wordnet
            return getattr(wordnet, constant)

    return -1

In [5]:
from nltk.tokenize import sent_tokenize


def parse_text(text, show_pos = True):
    """
    Lower-cases text, tags it via the Stanford CoreNLP server and returns the tokens as one flat list

    Params
    text: text to tag; it is lower-cased before being sent to the server
    show_pos: if truthy, each token is returned as "word_POS", otherwise as the bare word

    Returns the tokens of all sentences in a single list, in the order the server returned them
    No lemmatizing, POS filtering or stop-word removal is done here

    API parameters for Stanford parser
    annotators: tokenize, ssplit, pos, lemma, ner, parse, dcoref
    outputFormats: text, json, xml, Serialized
    """

    text = text.lower()

    output = nlp.annotate(text, properties={
            'annotators': 'ssplit, pos',
            'outputFormat': 'json'
            })

    word_tags_list = []
    for sentence in output['sentences']:
        for item in sentence['tokens']:

            word = item['word']
            pos = item['pos']

            # Plain truthiness test: comparing with == True / == False dropped
            # every token when show_pos was neither (e.g. None)
            if show_pos:
                word_tags_list.append(word + "_" + pos)
            else:
                word_tags_list.append(word)

    return word_tags_list

In [6]:
# Run parse_text on a short string that mixes words with punctuation and symbols
parse_text("I like you a/@!#?#$RE,.';'")

['i_LS',
 'like_IN',
 'you_PRP',
 'a_DT',
 '/_:',
 '@_SYM',
 '!_.',
 '#_#',
 '?_.',
 '#_#',
 '$_$',
 're_JJ',
 ',_,',
 '._.',
 "'_''",
 ';_:',
 "'_''"]

In [None]:
def lemmatizer(listOfWords):
    """
    Lemmatizes POS-tagged tokens with the WordNet lemmatizer, feeding the POS into it

    Params
    listOfWords: tokens as "word_POS" strings
                 (assumed to be the output of parse_text with show_pos = True - TODO confirm)

    Returns a new list of "lemma_POS" strings

    Do not add to list if:
    - the token has no "_POS" suffix
    - get_wordnet_pos returns -1 for the POS (no WordNet tag for it)
    """
    wordNet = WordNetLemmatizer()

    lemmatized_list = []
    for item in listOfWords:
        # Split on the last "_" so a word that itself contains "_" stays whole
        word, sep, pos = item.rpartition("_")
        if not sep:
            continue

        wordnet_pos = get_wordnet_pos(pos)
        if wordnet_pos == -1:
            continue

        # Passing the POS avoids noun-only lemmatizing (e.g. "was" -> "wa")
        lemmatized_list.append(wordNet.lemmatize(word, wordnet_pos) + "_" + pos)

    return lemmatized_list


def removeStopWords(listOfWords):
    """
    Removes stop words from a list of tokens

    Stop words are the NLTK english list plus the lines of C:\\nlp\\extra_stopwords.txt

    Params
    listOfWords: tokens, either bare words or "word_POS" strings
                 (for "word_POS" only the part before the last "_" is compared)

    Returns a new list holding the tokens that are not stop words, unchanged
    """
    # Imported here so the function does not rely on another cell for it
    from nltk.corpus import stopwords

    # A set gives a fast lookup per token; the file id is lower-case 'english'
    stopWords = set(stopwords.words('english'))
    with open(r"C:\nlp\extra_stopwords.txt", 'r', encoding = 'UTF-8') as f:
        # Add each line as its own stop word, skipping blank lines
        stopWords.update(line.strip() for line in f if line.strip())

    kept_words = []
    for item in listOfWords:
        word, sep, _ = item.rpartition("_")
        if not sep:
            word = item # bare word without a POS suffix
        if word in stopWords:
            continue
        kept_words.append(item)

    return kept_words
            

### Show the difference with and without lemmatization
- Notice that plural words and verbs are lemmatized to their base form

In [8]:
import os
directory = r"C:\nlp\Science-related texts"
# Keep only the .txt files found directly in the directory
file_list = [file for file in os.listdir(directory) if file.endswith('.txt')]

# Use 1st file as example
# os.path.join inserts the path separator instead of a hard-coded "\\"
temp_dir = os.path.join(directory, file_list[0])

In [9]:
from pprint import pprint
temp = preprocess(temp_dir)
# NOTE(review): the parse_text defined in this notebook only takes (text, show_pos);
# the lemmatize keyword needs a parse_text version that supports it
# Names now match the flag: lemmatize=False gives the not-lemmatized tokens
not_lemmatized = parse_text(temp,lemmatize=False, show_pos=True)
lemmatized = parse_text(temp,lemmatize=True, show_pos=True)

# Each pair is (not lemmatized, lemmatized)
pprint(list(zip(not_lemmatized, lemmatized)))

[('section_NN', 'section_NN'),
 ('museum_NN', 'museum_NN'),
 ('centenary_JJ', 'centenary_JJ'),
 ('birth_NN', 'birth_NN'),
 ('margaret_NN', 'margaret_NN'),
 ('mead_NN', 'mead_NN'),
 ('is_VBZ', 'is_VBZ'),
 ('remembered_VBN', 'remembered_VBN'),
 ('world_NN', 'world_NN'),
 ('influential_JJ', 'influential_JJ'),
 ('anthropologist_NN', 'anthropologist_NN'),
 ('margaret_NN', 'margaret_NN'),
 ('mead_NN', 'mead_NN'),
 ('arrived_VBD', 'arrived_VBD'),
 ('american_JJ', 'american_JJ'),
 ('museum_NN', 'museum_NN'),
 ('natural_JJ', 'natural_JJ'),
 ('history_NN', 'history_NN'),
 ('having_VBG', 'having_VBG'),
 ('completed_VBN', 'completed_VBN'),
 ('first_JJ', 'first_JJ'),
 ('significant_JJ', 'significant_JJ'),
 ('ethnographic_JJ', 'ethnographic_JJ'),
 ('research_NN', 'research_NN'),
 ('samoa_NN', 'samoa_NN'),
 ('was_VBD', 'wa_VBD'),
 ('appointed_VBN', 'appointed_VBN'),
 ('assistant_JJ', 'assistant_JJ'),
 ('curator_NN', 'curator_NN'),
 ('department_NN', 'department_NN'),
 ('anthropology_NN', 'anthropolog