# StanfordCoreNLP parser
##### Currently using the parser only, add NER later
git - https://github.com/smilli/py-corenlp
<br>
how to run local web server - https://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
<br>
on output formats - https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
<br><br>
Run in cmd to start server
<br>
cd C:\stanford-corenlp-full-2017-06-09
<br>
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000


API server setup

In [15]:
from pycorenlp import StanfordCoreNLP
import re

# Client for the CoreNLP server listening on local port 9000.
nlp = StanfordCoreNLP('http://localhost:9000')

# Ask for part-of-speech tags only, rendered as plain text
# ('json' and 'xml' are the other output formats noted for this call).
output = nlp.annotate(
    "Bears bear with other bears.",
    properties={'annotators': 'pos', 'outputFormat': 'text'},
)
print(output)

Sentence #1 (6 tokens):
Bears bear with other bears.
[Text=Bears CharacterOffsetBegin=0 CharacterOffsetEnd=5 PartOfSpeech=NNS]
[Text=bear CharacterOffsetBegin=6 CharacterOffsetEnd=10 PartOfSpeech=VBP]
[Text=with CharacterOffsetBegin=11 CharacterOffsetEnd=15 PartOfSpeech=IN]
[Text=other CharacterOffsetBegin=16 CharacterOffsetEnd=21 PartOfSpeech=JJ]
[Text=bears CharacterOffsetBegin=22 CharacterOffsetEnd=27 PartOfSpeech=NNS]
[Text=. CharacterOffsetBegin=27 CharacterOffsetEnd=28 PartOfSpeech=.]



# Pre-process function

Raw text preprocessor

In [6]:
from nltk.stem import WordNetLemmatizer
import re

def preprocess(file_dir):
    """
    Read a UTF-8 text file and return its cleaned-up contents.

    Cleaning steps, applied in this order:
      1. Drop title spans delimited by %&% markers (%&%...%&%).
      2. Drop the special characters ! @ # $ % ^ & * ( ) , : and the
         double quote.
      3. Drop <p> paragraph tags.
      4. Collapse every run of two or more whitespace characters
         (spaces, tabs, newlines) into a single space.

    Params
    file_dir = path of the text file to read

    Returns the cleaned text as a single string.
    """
    # NOTE(review): the title match is greedy, so two titles on the same
    # line would also swallow the text between them -- confirm against data.
    title = re.compile(r"%&%.*%&%")
    special_chars = re.compile(r"[!@##$$%^&*(),:\"]")
    parag_tag = re.compile("<p>")
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    multi_space = re.compile(r"\s{2,}")

    # Context manager so the handle is closed even if reading fails.
    with open(file_dir, 'r', encoding = 'utf-8') as f:
        text = f.read()

    text = title.sub("", text)
    text = special_chars.sub("", text)
    text = parag_tag.sub("", text)
    text = multi_space.sub(" ", text)  # 2+ whitespace chars -> 1 space

    return text

In [18]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag, acceptPeriods = False): 
    """
    Map a treebank POS tag to the WordNet POS constant expected by the
    WordNet lemmatizer.

    Tags starting with 'J', 'V' or 'N' map to wordnet.ADJ, wordnet.VERB
    and wordnet.NOUN respectively.  Adverb tags ('R...') are not mapped
    (that branch is disabled), so they get -1 like every other tag.

    When acceptPeriods is True, the "." tag yields 1 instead of -1.
    """
    # Guard clauses: first matching prefix wins.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('N'):
        return wordnet.NOUN

    # Periods are only reported (as 1) when the caller opted in.
    is_period = treebank_tag == "."
    if acceptPeriods == True and is_period:
        return 1

    return -1

In [10]:
def parse_text(text, show_pos = True):
    """
    Tokenize and POS-tag text through the Stanford CoreNLP server.

    Sends text to the module-level `nlp` client and flattens the tokens
    of every returned sentence into one list.  No POS filtering,
    lemmatizing or stop-word removal happens here.

    Params
    text = raw text to annotate
    show_pos = if truthy, each item is "word_POS" (e.g. 'like_VBP');
               otherwise each item is the bare word

    Returns a list of strings, one per token, in the order the server
    returned them.

    API parameters for Stanford parser
    annotators: tokenize, ssplit, pos, lemma, ner, parse, dcoref
    outputFormats: text, json, xml, Serialized
    """

    output = nlp.annotate(text, properties={
            'annotators': 'ssplit, pos',
            'outputFormat': 'json'
            })

    word_tags_list = []
    for sentence in output['sentences']:
        for item in sentence['tokens']:

            word = item['word']
            pos = item['pos']

            # Plain truthiness check: comparing against exactly True/False
            # made any other show_pos value (e.g. None) silently yield an
            # empty list.
            if show_pos:
                word_tags_list.append(word + "_" + pos)
            else:
                word_tags_list.append(word)

    return word_tags_list

In [42]:
# Quick check: with the default show_pos=True every token comes back as "word_POS".
parse_text("I like your friend")

['I_PRP', 'like_VBP', 'your_PRP$', 'friend_NN']

In [11]:
from nltk.stem import WordNetLemmatizer

def lemmatizer(listOfWords, pos = None, acceptPeriods = False):
    """
    Lemmatize a sequence of words with the WordNet lemmatizer.

    Params
    listOfWords = words to lemmatize
    pos = optional treebank POS tags, one per word and in the same order.
          When omitted or empty, every word is lemmatized without a POS
          hint and nothing is dropped.
    acceptPeriods = when True (and pos is given), tokens tagged "." are
          kept in the output unchanged instead of being dropped.

    Returns a list of lemmas.  When pos is given, words whose tag
    get_wordnet_pos() does not map are left out, so the result can be
    shorter than listOfWords.

    Raises IndexError if pos is non-empty but shorter than listOfWords.
    """
    wordNet = WordNetLemmatizer()

    # No tags supplied: lemmatize everything as-is.
    if pos is None or len(pos) == 0:
        return [wordNet.lemmatize(word) for word in listOfWords]

    lemmatized_list = []
    for i, word in enumerate(listOfWords):
        # Look the tag up once per word and forward acceptPeriods;
        # without it the period marker (1) could never be returned.
        wn_pos = get_wordnet_pos(pos[i], acceptPeriods)

        if wn_pos == -1:
            # Tag has no WordNet counterpart -> drop the word.
            continue
        if wn_pos == 1:
            # Period accepted by the caller: keep it verbatim.
            lemmatized_list.append(word)
        else:
            lemmatized_list.append(wordNet.lemmatize(word, wn_pos))

    return lemmatized_list

In [12]:
from nltk.corpus import stopwords

def removeStopWords(listOfWords, extra_stopwords_path = r"C:\nlp\extra_stopwords.txt"):
    """
    Return listOfWords with stop words filtered out.

    The stop-word set is NLTK's English list plus the entries of an
    extra file that holds one stop word per line.

    Params
    listOfWords = words to filter; order and duplicates of the remaining
                  words are preserved
    extra_stopwords_path = UTF-8 text file with the extra stop words

    Matching is exact, i.e. case-sensitive.
    """
    # NLTK names the corpus file 'english'; the capitalised 'English'
    # only resolves on case-insensitive file systems.
    # A set gives O(1) membership tests in the filter below.
    stopWords = set(stopwords.words('english'))

    with open(extra_stopwords_path, 'r', encoding = 'UTF-8') as f:
        # Add each line as its own entry.  The earlier append() stored the
        # whole list as a single element, so no extra word ever matched.
        # Blank lines are skipped so '' never becomes a stop word.
        stopWords.update(line.strip() for line in f if line.strip())

    return [word for word in listOfWords if word not in stopWords]

### Show the difference with and without lemmatization
- Notice that plural nouns and inflected verbs are lemmatized to their base form

In [13]:
import os
# Folder holding the sample corpus; only its .txt files are of interest.
directory = r"C:\nlp\Science-related texts"
file_list = [name for name in os.listdir(directory) if name.endswith('.txt')]

# Full path of the first listed file, used as the running example
temp_dir = "\\".join((directory, file_list[0]))

In [16]:
from pprint import pprint
# Clean the example file, then tokenize + POS-tag it ("word_POS" strings).
temp = preprocess(temp_dir)
parsed = parse_text(temp)

# Split on the LAST underscore only: the tag is whatever follows the final
# "_", so a token that itself contains "_" keeps its full text instead of
# being cut short and paired with the wrong tag.
temp = [item.rsplit("_", 1) for item in parsed]
words = [ item[0] for item in temp]
pos = [ item[1] for item in temp]

lemmatized = lemmatizer(words, pos)
print(lemmatized)

KeyError: '.'

In [72]:
# Lazy generator over the indices of every '.' token in `words`.
periods = (i for i, word in enumerate(words) if word == '.')

In [73]:
# Pull the next index out of the generator (this advances it by one item).
next(periods)

21

In [74]:
# Inspect the token at index 21, the value printed by next(periods).
words[21]

'.'

In [79]:
# A single "." word with "." as its tag: with the default acceptPeriods=False
# the period is skipped, so the result is an empty list.
lemmatizer('.','.')

[]