In [None]:
# Install the third-party dependencies into the environment of the running
# kernel. `%pip` targets the kernel's interpreter, whereas `!pip` runs whichever
# pip happens to be first on the shell PATH.
# NOTE(review): neither dependency is pinned; consider pinning a flashtext
# release and a pke tag/commit so the notebook stays reproducible.
%pip install --quiet flashtext
%pip install --quiet git+https://github.com/boudinfl/pke.git

In [12]:
import textwrap
import json
import requests
import string
import re
import nltk
import string
import itertools

import pke
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import traceback
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor
from pprint import pprint

from IPython.core.display import display, HTML
import xml.etree.ElementTree as et
import random
from xml.dom import minidom

# Fetch the NLTK data packages used by this notebook. The last call is left as
# the final expression of the cell, so its return value is what the cell displays.
nltk.download('stopwords')  # data behind nltk.corpus.stopwords
nltk.download('wordnet')  # data behind nltk.corpus.wordnet (imported above)
nltk.download('punkt')  # presumably what sent_tokenize needs; newer NLTK releases may want 'punkt_tab' instead -- TODO confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
text = """There is a lot of volcanic activity at divergent plate boundaries in the oceans. For example, many undersea volcanoes are found along the Mid-Atlantic Ridge. This is a divergent plate boundary that runs north-south through the middle of the Atlantic Ocean. As tectonic plates pull away from each other at a divergent plate boundary, they create deep fissures, or cracks, in the crust. Molten rock, called magma, erupts through these cracks onto Earth’s surface. At the surface, the molten rock is called lava. It cools and hardens, forming rock. Divergent plate boundaries also occur in the continental crust. Volcanoes form at these boundaries, but less often than in ocean crust. That’s because continental crust is thicker than oceanic crust. This makes it more difficult for molten rock to push up through the crust. Many volcanoes form along convergent plate boundaries where one tectonic plate is pulled down beneath another at a subduction zone. The leading edge of the plate melts as it is pulled into the mantle, forming magma that erupts as volcanoes. When a line of volcanoes forms along a subduction zone, they make up a volcanic arc. The edges of the Pacific plate are long subduction zones lined with volcanoes. This is why the Pacific rim is called the “Pacific Ring of Fire.”"""

# Echo the passage wrapped at 150 columns so it stays readable in the output.
wrapper = textwrap.TextWrapper(width=150)
word_list = wrapper.wrap(text=text)  # one entry per wrapped output line
print("\n".join(word_list))

There is a lot of volcanic activity at divergent plate boundaries in the oceans. For example, many undersea volcanoes are found along the Mid-Atlantic
Ridge. This is a divergent plate boundary that runs north-south through the middle of the Atlantic Ocean. As tectonic plates pull away from each other
at a divergent plate boundary, they create deep fissures, or cracks, in the crust. Molten rock, called magma, erupts through these cracks onto Earth’s
surface. At the surface, the molten rock is called lava. It cools and hardens, forming rock. Divergent plate boundaries also occur in the continental
crust. Volcanoes form at these boundaries, but less often than in ocean crust. That’s because continental crust is thicker than oceanic crust. This
makes it more difficult for molten rock to push up through the crust. Many volcanoes form along convergent plate boundaries where one tectonic plate
is pulled down beneath another at a subduction zone. The leading edge of the plate melts as it is pu

In [5]:
def tokenize_sentences(text, min_length=20):
    """Split ``text`` into sentences and drop the ones that are too short.

    Args:
        text: Passage to split; passed to NLTK's ``sent_tokenize``.
        min_length: A sentence is kept only when its length, after stripping
            surrounding whitespace, is strictly greater than this value.
            Defaults to 20, the threshold this notebook has always used.

    Returns:
        list[str]: The whitespace-stripped sentences that pass the length
        filter, in their original order.
    """
    # Strip first so the threshold applies to the text that is actually returned.
    stripped = (sentence.strip() for sentence in sent_tokenize(text))
    return [sentence for sentence in stripped if len(sentence) > min_length]

# Break the passage into sentences and show the resulting list.
sentences = tokenize_sentences(text)
print(sentences)

['There is a lot of volcanic activity at divergent plate boundaries in the oceans.', 'For example, many undersea volcanoes are found along the Mid-Atlantic Ridge.', 'This is a divergent plate boundary that runs north-south through the middle of the Atlantic Ocean.', 'As tectonic plates pull away from each other at a divergent plate boundary, they create deep fissures, or cracks, in the crust.', 'Molten rock, called magma, erupts through these cracks onto Earth’s surface.', 'At the surface, the molten rock is called lava.', 'It cools and hardens, forming rock.', 'Divergent plate boundaries also occur in the continental crust.', 'Volcanoes form at these boundaries, but less often than in ocean crust.', 'That’s because continental crust is thicker than oceanic crust.', 'This makes it more difficult for molten rock to push up through the crust.', 'Many volcanoes form along convergent plate boundaries where one tectonic plate is pulled down beneath another at a subduction zone.', 'The leadi

In [6]:
def get_noun_adj_verb(text, n=30):
    """Extract up to ``n`` keyphrases from ``text`` with pke's MultipartiteRank.

    Only candidates built from nouns, adjectives and verbs are considered.
    Extraction is best-effort: if any step of the pke pipeline raises, the
    traceback is printed and an empty list is returned instead.

    Args:
        text: Passage to extract keyphrases from.
        n: Maximum number of keyphrases to request (default 30).

    Returns:
        list[str]: The keyphrases, in the order ``get_n_best`` returns them,
        or ``[]`` when extraction failed.
    """
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=text, language='en')

        # Restrict candidates to sequences of nouns, adjectives and verbs.
        pos = {'VERB', 'ADJ', 'NOUN'}
        extractor.candidate_selection(pos=pos)

        # Build the Multipartite graph and rank candidates using random walk;
        # alpha controls the weight adjustment mechanism, see TopicRank for
        # the threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')

        # Each result is a (keyphrase, score) pair; only the phrase is kept.
        return [val[0] for val in extractor.get_n_best(n=n)]
    except Exception:
        # Deliberately best-effort, but do not swallow KeyboardInterrupt/SystemExit.
        traceback.print_exc()
        return []

# Run keyphrase extraction over the whole passage and show what came back.
noun_verbs_adj = get_noun_adj_verb(text)
print("keywords:", noun_verbs_adj)

keywords: ['divergent plate boundaries', 'crust', 'molten rock', 'tectonic plates pull', 'volcanoes form', 'oceans', 'called magma', 'forming rock', 'erupts', 'cracks', 'volcanic activity', 'subduction zone', 'surface', 'continental crust', 'ocean crust', 'makes', 'pulled', 'leading edge', 'hardens', 'example', 'runs north-south', 'cools', 'create deep fissures', 'volcanoes', 'occur', 'lot', 'called lava', 'middle', 'found', 'push']


In [8]:
def get_sentences_for_keyword(keywords, sentences):
    """Map every keyword to the sentences that mention it, longest first.

    Args:
        keywords: Iterable of keyword strings to look for.
        sentences: Iterable of sentence strings to search.

    Returns:
        dict[str, list[str]]: One entry per keyword (an empty list when no
        sentence mentions it). Each list holds the matching sentences sorted
        by descending length, and a sentence appears at most once per keyword
        even when it mentions that keyword several times.
    """
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}

    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)

    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        # The same keyword can be reported more than once for a single
        # sentence; de-duplicate (keeping first-seen order) so the sentence is
        # not listed twice under that keyword.
        for key in dict.fromkeys(keywords_found):
            keyword_sentences[key].append(sentence)

    # Longest sentence first: downstream code uses element 0 as the question.
    for matched in keyword_sentences.values():
        matched.sort(key=len, reverse=True)

    return keyword_sentences

# Pair each extracted keyword with the sentences that contain it.
keyword_sentence_mapping_noun_verbs_adj = get_sentences_for_keyword(noun_verbs_adj, sentences)
pprint(keyword_sentence_mapping_noun_verbs_adj)

{'called lava': ['At the surface, the molten rock is called lava.'],
 'called magma': ['Molten rock, called magma, erupts through these cracks onto '
                  'Earth’s surface.'],
 'continental crust': ['Divergent plate boundaries also occur in the '
                       'continental crust.',
                       'That’s because continental crust is thicker than '
                       'oceanic crust.'],
 'cools': ['It cools and hardens, forming rock.'],
 'cracks': ['As tectonic plates pull away from each other at a divergent plate '
            'boundary, they create deep fissures, or cracks, in the crust.',
            'Molten rock, called magma, erupts through these cracks onto '
            'Earth’s surface.'],
 'create deep fissures': ['As tectonic plates pull away from each other at a '
                          'divergent plate boundary, they create deep '
                          'fissures, or cracks, in the crust.'],
 'crust': ['As tectonic plates pull away from

In [9]:
def get_fill_in_the_blanks(sentence_mapping, max_items=10, blank=' _________ '):
    """Build a fill-in-the-blank exercise from a keyword -> sentences mapping.

    For every keyword, only its first sentence is considered. The keyword is
    blanked out (case-insensitively, as a whole word/phrase) and the result is
    kept when the keyword occurs exactly once in that sentence and the
    sentence has not already been used for an earlier keyword.

    Args:
        sentence_mapping: dict mapping each keyword to a list of sentences;
            keywords with an empty list are skipped.
        max_items: Maximum number of questions to return (default 10);
            ``None`` returns all of them.
        blank: Text substituted for the keyword. The default keeps the
            space-padded underscore run this notebook has always produced.

    Returns:
        dict with three keys: ``"title"`` (the exercise heading),
        ``"sentences"`` (the blanked sentences) and ``"keys"`` (the removed
        keywords, index-aligned with ``"sentences"``).
    """
    out = {"title": "Fill in the blanks for these sentences with matching words at the top"}
    blank_sentences = []
    keys = []
    used_sentences = set()  # source sentences already turned into a question

    for key, candidates in sentence_mapping.items():
        if not candidates:
            continue

        sent = candidates[0]
        if sent in used_sentences:
            continue

        # Lookarounds rather than a bare substring match, so that a key such
        # as "rock" does not also hit "rocky" and distort the occurrence count.
        pattern = re.compile(r'(?<!\w)' + re.escape(key) + r'(?!\w)', re.IGNORECASE)
        # A callable replacement inserts `blank` literally (no backslash escapes).
        line, no_of_replacements = pattern.subn(lambda _match: blank, sent)

        # Exactly one blank per question: zero would leave nothing to fill in,
        # and several would make the sentence ambiguous.
        if no_of_replacements == 1:
            blank_sentences.append(line)
            keys.append(key)
            used_sentences.add(sent)

    out["sentences"] = blank_sentences[:max_items]
    out["keys"] = keys[:max_items]

    return out


# Turn the keyword/sentence mapping into the exercise and inspect it.
fill_in_the_blanks = get_fill_in_the_blanks(keyword_sentence_mapping_noun_verbs_adj)
pprint(fill_in_the_blanks)

{'keys': ['divergent plate boundaries',
          'crust',
          'molten rock',
          'volcanoes form',
          'forming rock',
          'erupts',
          'continental crust',
          'ocean crust',
          'makes',
          'example'],
 'sentences': ['There is a lot of volcanic activity at  _________  in the '
               'oceans.',
               'As tectonic plates pull away from each other at a divergent '
               'plate boundary, they create deep fissures, or cracks, in the  '
               '_________ .',
               ' _________ , called magma, erupts through these cracks onto '
               'Earth’s surface.',
               'Many  _________  along convergent plate boundaries where one '
               'tectonic plate is pulled down beneath another at a subduction '
               'zone.',
               'It cools and hardens,  _________ .',
               'The leading edge of the plate melts as it is pulled into the '
               'mantle, for

In [11]:
# Render the exercise as an HTML fragment: a title, the shuffled answer bank,
# and the numbered sentences with their blanks.
root = et.Element("div")

heading = et.SubElement(root, "h2")
heading.text = fill_in_the_blanks['title']

# The <h4> wrapper holds both lists, as in the original markup.
heading_content = et.SubElement(root, "h4")

# Shuffle a copy: shuffling fill_in_the_blanks['keys'] in place would destroy
# its index alignment with fill_in_the_blanks['sentences'] (the answer key)
# and make this cell produce different data every time it is re-run.
all_keys = list(fill_in_the_blanks['keys'])
random.shuffle(all_keys)

# Element variables carry a tag suffix so they do not overwrite the
# notebook-level `sentences` list that earlier cells depend on.
keywords_ul = et.SubElement(heading_content, "ul")
keywords_ul.set('style', 'color:blue;')
for blank in all_keys:
    child = et.SubElement(keywords_ul, "li")
    child.text = blank

sentences_ol = et.SubElement(heading_content, "ol")
sentences_ol.set('style', 'color:brown;')
for sentence in fill_in_the_blanks['sentences']:
    child = et.SubElement(sentences_ol, "li")
    child.text = sentence
    et.SubElement(sentences_ol, "br")  # blank line between questions

xmlstr = et.tostring(root)
xmlstr = xmlstr.decode("utf-8")
display(HTML(xmlstr))

In [13]:
# Round-trip the serialized tree through minidom to print an indented copy of the markup.
root_bytes = et.tostring(root)
prettyxmlstr = minidom.parseString(root_bytes).toprettyxml(indent="   ")
print(prettyxmlstr)

<?xml version="1.0" ?>
<div>
   <h2>Fill in the blanks for these sentences with matching words at the top</h2>
   <h4>
      <ul style="color:blue;">
         <li>volcanoes form</li>
         <li>continental crust</li>
         <li>molten rock</li>
         <li>forming rock</li>
         <li>ocean crust</li>
         <li>makes</li>
         <li>crust</li>
         <li>erupts</li>
         <li>divergent plate boundaries</li>
         <li>example</li>
      </ul>
      <ol style="color:brown;">
         <li>There is a lot of volcanic activity at  _________  in the oceans.</li>
         <br/>
         <li>As tectonic plates pull away from each other at a divergent plate boundary, they create deep fissures, or cracks, in the  _________ .</li>
         <br/>
         <li> _________ , called magma, erupts through these cracks onto Earth’s surface.</li>
         <br/>
         <li>Many  _________  along convergent plate boundaries where one tectonic plate is pulled down beneath another at a s