### Stage 0:
Load the text document to be statistically parsed and tagged from your current directory

Parse, scrub/cleanse, and tag the loaded text document

https://github.com/DerwenAI/pytextrank
https://github.com/DerwenAI/pytextrank/blob/master/example.ipynb
https://www.thinkinfi.com/2018/09/automatic-keyword-extraction-using_30.html
https://medium.com/@aneesha/beyond-bag-of-words-using-pytextrank-to-find-phrases-and-summarize-text-f736fa3773c5
https://xang1234.github.io/textrank/
https://gist.github.com/BrambleXu/3d47bbdbd1ee4e6fc695b0ddb88cbf99

Attribution
PyTextRank has an MIT license, which is succinct and simplifies use in commercial applications.

Please use the following BibTeX entry for citing PyTextRank in publications:

@Misc{PyTextRank,
author = {Nathan, Paco},
title = {PyTextRank, a Python implementation of TextRank for phrase extraction and summarization of text documents},
    howpublished = {\url{https://github.com/DerwenAI/pytextrank/}},
    year = {2016}
    }

INPUTS: Text doc for the text input  
OUTPUT: JSON representation of the original text document, scrubbed and stored as id and text values

In [19]:
import json
import logging
import os
import unicodedata
from collections import OrderedDict

import numpy as np
import pytextrank
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Grab the root logger (getLogger() called without a name) and set its
# threshold to DEBUG.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Debug switch; no code visible in this notebook export reads it.
DEBUG = True # False # True


def cleanup_text(text):
    """
    Scrub raw text into a single-line, ASCII-only string.

    Steps, in order:
      1. bytes input is decoded as UTF-8 (undecodable bytes are dropped);
      2. every line is stripped and the lines are joined with single spaces;
      3. curly quotes, backticks, ellipsis and en dash are mapped to their
         plain ASCII equivalents;
      4. the text is NFKD-normalized and every remaining non-ASCII character
         is discarded (so accented letters keep their base letter).

    :param text: the text to clean, as ``str`` or ``bytes``
    :return: the cleaned text as an ASCII-only ``str``
    """
    # some content returns text in bytes rather than as a str; decode it here
    # instead of failing later on `text.split("\n")`
    if isinstance(text, (bytes, bytearray)):
        text = text.decode('utf-8', errors='ignore')

    x = " ".join(line.strip() for line in text.split("\n")).strip()

    # these have no NFKD decomposition to ASCII (or would simply be dropped by
    # the ASCII encode below), so map them explicitly first
    x = x.replace('“', '"').replace('”', '"')
    x = x.replace("‘", "'").replace("’", "'").replace("`", "'")
    x = x.replace('…', '...').replace('–', '-')

    return unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('ascii')

def pretty_print (obj, indent=False):
    """
    Serialize a JSON-compatible object to a string with sorted keys.

    With `indent` truthy the output is spread over multiple lines using a
    2-space indent; otherwise the compact single-line default is produced.
    """
    options = {"sort_keys": True}

    if indent:
        options["indent"] = 2
        options["separators"] = (',', ': ')

    return json.dumps(obj, **options)

# https://gist.github.com/BrambleXu/3d47bbdbd1ee4e6fc695b0ddb88cbf99
# https://spacy.io/usage/linguistic-features
# https://spacy.io/api/doc
class TextRank4Keyword():
    """
    Extract keywords from text.

    A word co-occurrence graph is built from the tokens that pass a
    part-of-speech / stop-word filter, and an iterative PageRank-style update
    assigns each word a weight. The parsed spaCy document is kept on
    `self.doc` so phrases and sentences can be read back from it afterwards.
    """

    def __init__(self, nlp):
        """
        :param nlp: a loaded spaCy language pipeline; it is called on the text
            in `analyze` and its vocabulary is updated by `set_stopwords`
        """
        self.nlp = nlp
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight
        self.doc = None

    def set_stopwords(self, stopwords):
        """
        Flag spaCy's built-in stop words plus `stopwords` as stop words.

        The flag is set on the lexemes of `self.nlp.vocab`, so it affects
        every document parsed with this pipeline afterwards.
        """
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = self.nlp.vocab[word]
            lexeme.is_stop = True

    def sentence_segment(self, candidate_pos, lower):
        """
        Keep only the words whose POS tag is in `candidate_pos`.

        :param candidate_pos: collection of POS tags to keep
        :param lower: when truthy, the kept words are lower-cased
        :return: one list of kept words per sentence of `self.doc`
            (a sentence with no kept word yields an empty list)
        """
        sentences = []
        for sent in self.doc.sents:
            # Store words only with candidate POS tag that are not stop words
            selected_words = [
                token.text.lower() if lower else token.text
                for token in sent
                if token.pos_ in candidate_pos and not token.is_stop
            ]
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """
        Map every distinct word to an integer index.

        Indices are assigned in order of first appearance, starting at 0.
        """
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """
        Build the distinct (word, following word) pairs inside each window.

        Every word is paired with up to `window_size - 1` words that follow it
        in the same sentence. Pairs are returned in order of first occurrence.
        """
        token_pairs = []
        seen = set() # O(1) duplicate check; the list keeps the order
        for sentence in sentences:
            for i, word in enumerate(sentence):
                last = min(i + window_size, len(sentence))
                for j in range(i + 1, last):
                    pair = (word, sentence[j])
                    if pair not in seen:
                        seen.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Return `a + a.T` with the diagonal of `a` counted only once."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """
        Get the column-normalized adjacency matrix of the word graph.

        :param vocab: mapping of word -> matrix index
        :param token_pairs: iterable of (word1, word2) edges
        :return: square float array; every column with at least one edge sums
            to 1, columns without edges are all zero
        """
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1], vocab[word2]] = 1

        # Get symmetric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column. `out` must be pre-filled: where the
        # column sum is 0 the ufunc writes nothing, and without an explicit
        # `out` those cells would hold uninitialised memory.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=10):
        """
        Return the top `number` keywords.

        :return: dict of word -> weight, ordered from highest to lowest
            weight, holding at most `number` entries
        """
        ranked = sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True)
        return dict(ranked[:max(number, 0)])

    def get_phrases(self, number=10):
        """Return at most `number` leading entries of `self.doc._.phrases`."""
        # zip() stops as soon as the range is exhausted
        return [p for _, p in zip(range(number), self.doc._.phrases)]

    def get_sentences(self, number=5):
        """
        Return at most `number` leading sentences of `self.doc.sents`.

        The sentences are taken in iteration order; no ranking is applied.
        """
        return [s for _, s in zip(range(number), self.doc.sents)]

    def analyze(self, text,
                candidate_pos=('NOUN', 'PROPN'),
                window_size=4, lower=False, stopwords=()):
        """
        Main function to analyze text.

        Parses `text`, builds the word graph and stores the resulting word
        weights in `self.node_weight` (word -> weight). Returns nothing.

        :param text: the text to analyze
        :param candidate_pos: POS tags of the words that enter the graph
        :param window_size: co-occurrence window, see `get_token_pairs`
        :param lower: lower-case the words before building the graph
        :param stopwords: extra stop words on top of spaCy's defaults
        """

        # Set stop words
        self.set_stopwords(stopwords)

        # Parse text by spaCy
        self.doc = self.nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(candidate_pos, lower) # list of list of words

        # Build vocabulary
        vocab = self.get_vocab(sentences)

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Initialization for weight (pagerank value)
        pr = np.ones(len(vocab))

        # Iteration: stop early once the total weight stops changing
        previous_pr = 0
        for _ in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            total = pr.sum()
            if abs(previous_pr - total) < self.min_diff:
                break
            previous_pr = total

        # Get weight for each node
        self.node_weight = {word: pr[index] for word, index in vocab.items()}

In [20]:
# Resolve the input document relative to the current working directory:
# <cwd>/../data/textrank.txt
dirname = os.getcwd()
input_path_stage1 = os.path.abspath(os.path.join(dirname, "..", "data", "textrank.txt"))

# Read with an explicit encoding: the document contains non-ASCII punctuation
# (e.g. curly apostrophes), and the platform default encoding is not UTF-8
# everywhere.
with open(input_path_stage1, 'r', encoding='utf-8') as f1:
    content = f1.read()


### Load the text file's contents for Natural Language Processing:

https://spacy.io/usage/linguistic-features

In [21]:
# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")

# add PyTextRank to the spaCy pipeline as its last component, registered
# under the name "textrank"
# NOTE(review): passing the component object itself to add_pipe looks like the
# spaCy 2.x / pytextrank 2.x API - confirm against the installed versions.
tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank", last=True)

# *************************************************************
# THIS STEP CAN TAKE A MINUTE OR TWO!
# analyze() runs the pipeline over `content` and stores the per-word weights
# on tr4w.node_weight; the parsed document is kept on tr4w.doc.
tr4w = TextRank4Keyword(nlp)
tr4w.analyze(content, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
# *************************************************************

### Get the strongest phrases in the text file's contents:


In [22]:
output_textrank_phrases = os.path.abspath(os.path.join(dirname, "..", "textrank_phrases.tsv"))
# examine the top-ranked phrases in the document; every row is echoed to
# stdout and written to the TSV file
# explicit encoding: phrase text may contain non-ASCII characters, which the
# platform default encoding cannot always represent
with open(output_textrank_phrases, 'w', encoding='utf-8') as f:
    header = "Rank\tCount\tPhrase\n"
    print(header)
    f.write(header)
    phrases = tr4w.get_phrases(10)
    for phrase in phrases:
        # one format string for both outputs so they cannot drift apart
        row = "{:.4f}\t{:5d}\t{}".format(phrase.rank, phrase.count, phrase.text)
        print(row)
        f.write(row + "\n")

Rank	Count	Phrase

0.0651	    5	illegal fishing vessels
0.0640	    1	illegal chinese fishing vessels
0.0631	    5	illegal fishing boats
0.0628	    3	illegal foreign fishing vessels
0.0622	    1	illegal chinese fishing boats
0.0614	    2	suspected illegal fishing vessels
0.0612	   26	fishing vessels
0.0609	    1	china’s illegal fishing
0.0608	    1	illegal fishing vessel viking
0.0605	   10	chinese fishing vessels


### Get the strongest keywords in the text file's contents:

In [23]:
output_textrank_keywords = os.path.abspath(os.path.join(dirname, "..", "textrank_keywords.tsv"))
# explicit encoding: keywords come straight from the document text and may
# contain non-ASCII characters
with open(output_textrank_keywords, 'w', encoding='utf-8') as f:
    keywords = tr4w.get_keywords(100)
    print("Rank\t\t\tKeyword")
    f.write("Rank\tKeyword\n")
    for i, (k, v) in enumerate(keywords.items()):
        row = f"{v}\t{k}"
        # every keyword goes to the file, only the first ten are echoed
        if i < 10:
            print(row)
        f.write(row + "\n")


Rank			Keyword
50.85208112995197	fishing
26.690703891139275	fishermen
25.933127678317764	waters
24.640941831954592	vessels
20.62912721198954	vessel
19.78853589003089	sea
19.75132970907416	police
19.272495625592633	crew
18.890463809815717	boats
18.717244583256363	boat


### Get the strongest sentences in the text file's contents to form summary:

In [24]:
output_textrank_sentences = os.path.abspath(os.path.join(dirname, "..", "textrank_sentences.tsv"))
# NOTE(review): opening in 'w' mode creates/truncates the TSV file, but nothing
# is written to it while the lines below stay commented out.
with open(output_textrank_sentences, 'w') as f:
    # get_sentences(5) returns up to five leading sentences of tr4w.doc.sents
    # in iteration order; it does not rank them.
    sentences = tr4w.get_sentences(5)
    print(len(sentences))
    # Disabled output code: the recorded run failed because a spaCy Span has no
    # `chunks` attribute; `sentence.rank` presumably does not exist either
    # (TODO confirm), so a ranked summary needs a different data source.
    # print("Rank\t\t\tSentence")
    # f.write("Rank\tSentence\n")
    # for sentence in sentences:
    #     print(sentence.chunks)
        # print("{:.4f}\t{}".format(sentence.rank,  sentence.text))
        # f.write("{:.4f}\t{}\n".format(sentence.rank, sentence.text))


Rank			Sentence


AttributeError: 'spacy.tokens.span.Span' object has no attribute 'chunks'