### StanfordCoreNLP for processing
##### Currently using the tagger only
git - https://github.com/smilli/py-corenlp
<br>
how to run local web server - https://stanfordnlp.github.io/CoreNLP/corenlp-server.html#getting-started
<br>
on output formats - https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
<br><br>
Run in cmd to start server
<br>
cd C:\stanford-corenlp-full-2017-06-09
<br>
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000


Example usage

In [69]:
from pycorenlp import StanfordCoreNLP
import re

# Create the CoreNLP client; it talks to a CoreNLP server expected at localhost:9000
nlp = StanfordCoreNLP('http://localhost:9000')

# Smoke test: POS-tag a single sentence and print the raw server response
output = nlp.annotate("Bears bear with other bears.", properties={
                'annotators': 'pos',
                'outputFormat': 'text' # other formats: json, xml
         })
print(output)

Sentence #1 (6 tokens):
Bears bear with other bears.
[Text=Bears CharacterOffsetBegin=0 CharacterOffsetEnd=5 PartOfSpeech=NNS]
[Text=bear CharacterOffsetBegin=6 CharacterOffsetEnd=10 PartOfSpeech=VBP]
[Text=with CharacterOffsetBegin=11 CharacterOffsetEnd=15 PartOfSpeech=IN]
[Text=other CharacterOffsetBegin=16 CharacterOffsetEnd=21 PartOfSpeech=JJ]
[Text=bears CharacterOffsetBegin=22 CharacterOffsetEnd=27 PartOfSpeech=NNS]
[Text=. CharacterOffsetBegin=27 CharacterOffsetEnd=28 PartOfSpeech=.]



# Pre-process function

In [278]:
import re

from gensim.utils import simple_preprocess
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load the project-specific stop words, one per line.
# Each line is stripped and blank lines are skipped, so the resulting list never
# contains empty or whitespace-only entries (a plain split("\n") leaves them in).
with open(r"C:\nlp\extra_stopwords.txt", 'r', encoding = 'UTF-8') as f:
    extra_stopWords = [line.strip() for line in f if line.strip()]

def preprocess(file_dir):
    """Read one text file and return its content words fused with their POS tags.

    Steps: remove the ``%&%...%&%`` title marker, a fixed set of special
    characters and ``<p>`` tags; split the text into sentences; lower-case each
    sentence and POS-tag it through the CoreNLP client; drop stop words; and
    finally concatenate every remaining word with its tag (``word + tag``,
    no separator).

    Relies on two notebook-level names: ``nlp`` (the pycorenlp client) and
    ``extra_stopWords`` (the additional stop words loaded from disk).

    Parameters
    ----------
    file_dir : str
        Path of the UTF-8 encoded text file to process.

    Returns
    -------
    list of str
        One ``word + tag`` string per kept token, in document order.
    """
    ################### Settings ###################
    # Stop words, kept as a set so every membership test is O(1).
    stopWords = set(stopwords.words('english'))
    stopWords.update(extra_stopWords)
    # The stop-word file may contribute an empty entry. Remove exactly that entry:
    # slicing a list built from a set would drop an arbitrary word instead.
    stopWords.discard("")

    # TODO: lemmatization is not applied yet (WordNetLemmatizer can take POS tags too).

    # Title pattern
    title = re.compile(r"%&%.*%&%")

    # Special chars and tags removal
    special_chars = re.compile(r"[!@##$$%^&*(),:'\"]")
    paragraph_tag = re.compile(r"<p>")

    # To parse Stanford NLP output
    word_pattern = re.compile(r'\[Text=[A-Za-z]+')
    # NOTE(review): only tags made of up to four capital letters match; a token
    # whose tag contains any other character is skipped — confirm this is intended.
    pos_pattern = re.compile(r'=[A-Z]{,4}]')

    # Request settings are the same for every sentence, so build them once.
    properties = {
        'annotators': 'pos',    # Options: tokenize, ssplit, pos, lemma, ner, parse, dcoref
        'outputFormat': 'text'  # returned as one long string (Options: json, xml, Serialized)
    }

    # For NLP output
    word_tags_list = []

    ################### NLP #######################
    # Read the whole file; the context manager guarantees the handle is closed.
    with open(file_dir, 'r', encoding = 'utf-8') as f:
        text = f.read()

    # Erase title
    text = title.sub("", text)
    # Special chars
    text = special_chars.sub("", text)
    # Tags
    text = paragraph_tag.sub("", text)

    # Feed each sentence to the tagger
    for sent in sent_tokenize(text):
        output = nlp.annotate(sent.lower(), properties=properties)

        # One line per token; splitlines() handles both "\r\n" and "\n" endings,
        # so parsing does not depend on the platform the server runs on.
        parsed_sent = output.splitlines()

        # The first two lines are skipped; the remaining lines are scanned for tokens.
        for token in parsed_sent[2:]:
            word = word_pattern.search(token)
            pos = pos_pattern.search(token)
            # Keep only tokens with an alphabetical word and a recognised tag.
            if word is None or pos is None:
                continue
            word_tags_list.append((
                word.group().replace("[Text=", ""),
                pos.group().replace("=", "").replace("]", ""),
            ))

    # Filter stop words, then join word & tag
    return [word + pos for word, pos in word_tags_list if word not in stopWords]

### Get file list from dir

In [270]:
import os

# Folder that holds the corpus text files.
directory = r"C:\nlp\Science-related texts"

# os.listdir returns names in arbitrary order; sorting makes the list (and any
# index-based access to it) reproducible from run to run.
file_list = sorted(file for file in os.listdir(directory) if file.endswith('.txt'))

print( len(file_list), "text files detected!")

10258 text files detected!


In [276]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


def draw_cloud(text_string):
    """Draw a word cloud for ``text_string`` in a new matplotlib figure.

    Parameters
    ----------
    text_string : str
        Text passed to ``WordCloud.generate``.
    """
    # Configure the generator with an explicit maximum font size, then feed it the text.
    cloud_maker = WordCloud(max_font_size=400)
    cloud_image = cloud_maker.generate(text = text_string)

    # New figure, axes hidden, displayed right away.
    plt.figure()
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.axis("off")
    plt.show()
    
# For showing in a new window  
#     wordcloud = WordCloud().generate(text = text_string)
#     image = wordcloud.to_image()
#     image.show()


#### Word cloud for each article (control by index num)

#### Word Cloud for each genre  

In [282]:
# Split the corpus by the genre prefix of each file name.
acad_files = [file for file in file_list if file.startswith("acad")]
mag_files = [file for file in file_list if file.startswith("mag")]
news_files = [file for file in file_list if file.startswith("news")]

# One print per genre: mixing embedded "\n" with print's default space separator
# left a stray leading space on the second and third output lines.
print(len(acad_files), "academic texts")
print(len(mag_files), "magazine texts")
print(len(news_files), "newspaper texts")

4155 academic texts
 5781 magazine texts
 322 newspaper texts


Academic genre

In [283]:
import matplotlib.pyplot as plt
%matplotlib inline

# Gather the processed tokens of every academic file.
acad_tokens = []

for file in acad_files:
    text = preprocess(os.path.join(directory, file))
    acad_tokens.extend(text)

# Join once, with a space between every token, so the last token of one file is
# not glued to the first token of the next (and no string is rebuilt per file).
all_text = ' '.join(acad_tokens)

# Report the token count; len() of the joined string would count characters.
print("Academic texts -", len(acad_tokens), "words")

draw_cloud(all_text)

KeyboardInterrupt: 

Magazine genre

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Gather the processed tokens of every magazine file.
mag_tokens = []

for file in mag_files:
    text = preprocess(os.path.join(directory, file))
    mag_tokens.extend(text)

# Join once, with a space between every token, so the last token of one file is
# not glued to the first token of the next (and no string is rebuilt per file).
all_text = ' '.join(mag_tokens)

# Report the token count; len() of the joined string would count characters.
print("Magazine texts -", len(mag_tokens), "words")

draw_cloud(all_text)

Newspaper genre

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Gather the processed tokens of every newspaper file.
news_tokens = []

for file in news_files:
    text = preprocess(os.path.join(directory, file))
    news_tokens.extend(text)

# Join once, with a space between every token, so the last token of one file is
# not glued to the first token of the next (and no string is rebuilt per file).
all_text = ' '.join(news_tokens)

# Report the token count; len() of the joined string would count characters.
print("Newspaper texts -", len(news_tokens), "words")

draw_cloud(all_text)

### * In case of converting list to counts

In [31]:
from collections import Counter
# Tally how often each item of `text` occurs and show the result as a plain dict.
# `text` is not defined in this cell, so it must already exist in the kernel —
# TODO confirm which document's token list it holds when this cell is run.
counts = dict(Counter(text))
print(counts)

{'sunlight': 4, 'central': 1, 'source': 7, 'united': 3, 'state': 4, 'energy': 24, 'st': 1, 'century': 2, 'sprawling': 1, 'low': 1, 'rise': 1, 'campus': 1, 'solar': 11, 'research': 7, 'institute': 2, 'seri': 6, 'answer': 2, 'resounding': 1, 'yes': 1, 'politically': 1, 'may': 1, 'coming': 2, 'time': 1, 'conventional': 4, 'power': 13, 'generation': 1, 'coal': 2, 'oil': 6, 'natural': 1, 'gas': 6, 'increasing': 1, 'fire': 1, 'environmentally': 2, 'conscious': 1, 'public': 2, 'concerned': 1, 'global': 2, 'warming': 2, 'acid': 1, 'rain': 1, 'air': 2, 'pollution': 2, 'also': 2, 'concern': 1, 'national': 1, 'security': 1, 'implication': 1, 'dependence': 1, 'nearly': 3, 'percent': 10, 'imported': 1, 'nuclear': 7, 'touted': 1, 'clean': 1, 'alternative': 5, 'suffering': 1, 'continuing': 2, 'revelation': 1, 'following': 2, 'explosion': 1, 'soviet': 1, 'reactor': 1, 'chernobyl': 1, 'station': 2, 'political': 1, 'difficulty': 1, 'surrounding': 1, 'start': 1, 'seabrook': 1, 'plant': 7, 'new': 2, 'hamp