In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
from nltk.corpus import gutenberg
import text_normalizer as tn
import nltk
from operator import itemgetter

In [11]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
# load corpus
alice = gutenberg.sents(fileids='carroll-alice.txt')
alice = [' '.join(ts) for ts in alice]
norm_alice = list(filter(None, tn.normalize_corpus(alice, text_lemmatization=False)))

In [15]:
alice

["[ Alice ' s Adventures in Wonderland by Lewis Carroll 1865 ]",
 'CHAPTER I .',
 'Down the Rabbit - Hole',
 "Alice was beginning to get very tired of sitting by her sister on the bank , and of having nothing to do : once or twice she had peeped into the book her sister was reading , but it had no pictures or conversations in it , ' and what is the use of a book ,' thought Alice ' without pictures or conversation ?'",
 'So she was considering in her own mind ( as well as she could , for the hot day made her feel very sleepy and stupid ), whether the pleasure of making a daisy - chain would be worth the trouble of getting up and picking the daisies , when suddenly a White Rabbit with pink eyes ran close by her .',
 "There was nothing so VERY remarkable in that ; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself , ' Oh dear !",
 'Oh dear !',
 "I shall be late !'",
 '( when she thought it over afterwards , it occurred to her that she ought to have wondere

In [16]:
print(alice[0], '\n', norm_alice[0])

[ Alice ' s Adventures in Wonderland by Lewis Carroll 1865 ] 
 alice adventures wonderland lewis carroll


In [None]:
## collocations


In [17]:
def compute_ngrams(sequence, n):
    return list(
            zip(*(sequence[index:]
                     for index in range(n)))
    )

In [18]:
compute_ngrams([1,2,3,4], 2)

[(1, 2), (2, 3), (3, 4)]

In [19]:
compute_ngrams([1,2,3,4], 3)

[(1, 2, 3), (2, 3, 4)]

In [20]:
def flatten_corpus(corpus):
    return ' '.join([document.strip()
                     for document in corpus])

In [21]:
##N- grams

In [22]:
def get_top_ngrams(corpus, ngram_val=1, limit=5):

    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)

    ngrams = compute_ngrams(tokens, ngram_val)
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(),
                              key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq)
                     for text, freq in sorted_ngrams]

    return sorted_ngrams

In [26]:
get_top_ngrams(corpus=norm_alice, ngram_val=2,
               limit=10)

[('said alice', 123),
 ('mock turtle', 56),
 ('march hare', 31),
 ('said king', 29),
 ('thought alice', 26),
 ('white rabbit', 22),
 ('said hatter', 22),
 ('said mock', 20),
 ('said caterpillar', 18),
 ('said gryphon', 18)]

In [27]:
get_top_ngrams(corpus=norm_alice, ngram_val=3,
               limit=10)

[('said mock turtle', 20),
 ('said march hare', 10),
 ('poor little thing', 6),
 ('little golden key', 5),
 ('certainly said alice', 5),
 ('white kid gloves', 5),
 ('march hare said', 5),
 ('mock turtle said', 5),
 ('know said alice', 4),
 ('might well say', 4)]

In [29]:
data = open('elephants.txt', 'r+').readlines()
sentences = nltk.sent_tokenize(data[0])
len(sentences)

29

In [30]:
data

['Elephants are large mammals of the family Elephantidae and the order Proboscidea. Three species are currently recognised: the African bush elephant (Loxodonta africana), the African forest elephant (L. cyclotis), and the Asian elephant (Elephas maximus). Elephants are scattered throughout sub-Saharan Africa, South Asia, and Southeast Asia. Elephantidae is the only surviving family of the order Proboscidea; other, now extinct, members of the order include deinotheres, gomphotheres, mammoths, and mastodons. All elephants have several distinctive features, the most notable of which is a long trunk (also called a proboscis), used for many purposes, particularly breathing, lifting water, and grasping objects. Their incisors grow into tusks, which can serve as weapons and as tools for moving objects and digging. Elephants\' large ear flaps help to control their body temperature. Their pillar-like legs can carry their great weight. African elephants have larger ears and concave backs while 

In [31]:
sentences[:3]

['Elephants are large mammals of the family Elephantidae and the order Proboscidea.',
 'Three species are currently recognised: the African bush elephant (Loxodonta africana), the African forest elephant (L. cyclotis), and the Asian elephant (Elephas maximus).',
 'Elephants are scattered throughout sub-Saharan Africa, South Asia, and Southeast Asia.']

In [32]:
norm_sentences = tn.normalize_corpus(sentences, text_lower_case=False,
                                     text_stemming=False, text_lemmatization=False, stopword_removal=False)
norm_sentences[:3]

['Elephants are large mammals of the family Elephantidae and the order Proboscidea',
 'Three species are currently recognised the African bush elephant Loxodonta africana the African forest elephant L cyclotis and the Asian elephant Elephas maximus',
 'Elephants are scattered throughout subSaharan Africa South Asia and Southeast Asia']

In [33]:
import itertools
stopwords = nltk.corpus.stopwords.words('english')

def get_chunks(sentences, grammar = r'NP: {<DT>? <JJ>* <NN.*>+}', stopword_list=stopwords):

    all_chunks = []
    chunker = nltk.chunk.regexp.RegexpParser(grammar)

    for sentence in sentences:

        tagged_sents = [nltk.pos_tag(nltk.word_tokenize(sentence))]

        chunks = [chunker.parse(tagged_sent)
                      for tagged_sent in tagged_sents]

        wtc_sents = [nltk.chunk.tree2conlltags(chunk)
                         for chunk in chunks]

        flattened_chunks = list(
                            itertools.chain.from_iterable(
                                wtc_sent for wtc_sent in wtc_sents)
                           )

        valid_chunks_tagged = [(status, [wtc for wtc in chunk])
                                   for status, chunk
                                       in itertools.groupby(flattened_chunks,
                                                lambda word_pos_chunk: word_pos_chunk[2] != 'O')]

        valid_chunks = [' '.join(word.lower()
                                for word, tag, chunk in wtc_group
                                    if word.lower() not in stopword_list)
                                        for status, wtc_group in valid_chunks_tagged
                                            if status]

        all_chunks.append(valid_chunks)

    return all_chunks

In [35]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [36]:

chunks = get_chunks(norm_sentences)
chunks

[['elephants', 'large mammals', 'family elephantidae', 'order proboscidea'],
 ['species',
  'african bush elephant loxodonta',
  'african forest elephant l cyclotis',
  'asian elephant elephas maximus'],
 ['elephants', 'subsaharan africa south asia', 'southeast asia'],
 ['elephantidae',
  'family',
  'order proboscidea',
  'extinct members',
  'order',
  'deinotheres gomphotheres mammoths',
  'mastodons'],
 ['elephants',
  'several distinctive features',
  'long trunk',
  'proboscis',
  'many purposes',
  'water',
  'grasping objects'],
 ['incisors', 'tusks', 'weapons', 'tools', 'objects'],
 ['elephants', 'flaps', 'body temperature'],
 ['pillarlike legs', 'great weight'],
 ['african elephants',
  'ears',
  'backs',
  'asian elephants',
  'ears',
  'convex',
  'level backs'],
 ['elephants', 'different habitats', 'savannahs forests deserts', 'marshes'],
 ['water'],
 ['keystone species', 'impact', 'environments'],
 ['animals',
  'distance',
  'elephants',
  'predators',
  'lions tigers hy

In [37]:
from gensim import corpora, models

def get_tfidf_weighted_keyphrases(sentences,
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):

    valid_chunks = get_chunks(sentences, grammar=grammar)

    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    weighted_phrases = {dictionary.get(idx): value
                           for doc in corpus_tfidf
                               for idx, value in doc}

    weighted_phrases = sorted(weighted_phrases.items(),
                              key=itemgetter(1), reverse=True)
    weighted_phrases = [(term, round(wt, 3)) for term, wt in weighted_phrases]

    return weighted_phrases[:top_n]

In [38]:
get_tfidf_weighted_keyphrases(sentences=norm_sentences, top_n=30)

[('water', 1.0),
 ('asia', 0.807),
 ('wild', 0.764),
 ('great weight', 0.707),
 ('pillarlike legs', 0.707),
 ('southeast asia', 0.693),
 ('subsaharan africa south asia', 0.693),
 ('body temperature', 0.693),
 ('flaps', 0.693),
 ('fissionfusion society', 0.693),
 ('multiple family groups', 0.693),
 ('art folklore religion literature', 0.693),
 ('popular culture', 0.693),
 ('ears', 0.681),
 ('males', 0.653),
 ('males bulls', 0.653),
 ('family elephantidae', 0.607),
 ('large mammals', 0.607),
 ('years', 0.607),
 ('environments', 0.577),
 ('impact', 0.577),
 ('keystone species', 0.577),
 ('cetaceans', 0.577),
 ('elephant intelligence', 0.577),
 ('primates', 0.577),
 ('dead individuals', 0.577),
 ('kind', 0.577),
 ('selfawareness', 0.577),
 ('different habitats', 0.57),
 ('marshes', 0.57)]

In [45]:
import gensim

In [43]:
! pip install gensim



In [49]:
from gensim.summarization import keywords

key_words = keywords(data[0], ratio=1.0, scores=True, lemmatize=True)
[(item, round(score, 3)) for item, score in key_words][:25]

ImportError: ignored

In [50]:

!pip install "gensim==3.8.3"

Collecting gensim==3.8.3
  Downloading gensim-3.8.3.tar.gz (23.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.4/23.4 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gensim
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for gensim (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for gensim[0m[31m
[0m[?25h  Running setup.py clean for gensim
Failed to build gensim
[31mERROR: Could not build wheels for gensim, which is required to install pyproject.toml-based projects[0m[31m
[0m