In [None]:
!pip install git+https://github.com/boudinfl/pke.git
!pip install matplotlib
!python -m spacy download en_core_web_sm

#Recommended to run on colab 

# Explore and Prepare Text File

In [None]:
import re

In [None]:
# Read the whole source document into memory. The encoding is stated
# explicitly so the result does not depend on the platform's default locale.
with open('/content/ComputerScience.txt', encoding='utf-8') as f:
    txt = f.read()

In [None]:
# Redundant: `f` was opened in a `with` block, which closes the file on exit;
# calling close() on an already-closed file does nothing.
f.close()

In [None]:
txt

In [None]:
# Lower-case the whole document.
# NOTE(review): this happens before the extractors' part-of-speech tagging and
# may affect proper-noun detection — confirm this is intended.
txt = txt.lower()

In [None]:
txt

In [None]:
# Replace every line break with a space (not the empty string) so the last word
# of one line is not fused with the first word of the next line.
txt = re.sub(r'\n',' ',txt)

In [None]:
txt

In [None]:
# Delete every hyphen. Hyphenated compounds are thereby joined into a single
# token (for example "real-time" becomes "realtime").
txt = re.sub(r'-','',txt)

In [None]:
txt

In [None]:
# Delete every double-quote character.
txt = re.sub(r'"','',txt)

In [None]:
txt

In [None]:
# Delete every ASCII digit (0-9).
txt = re.sub(r"[0-9]","",txt)

In [None]:
txt

In [None]:
# Delete every apostrophe / single quote (for example "don't" becomes "dont").
txt = re.sub(r"'",'',txt)

In [None]:
txt

In [None]:
# Delete every semicolon.
txt = re.sub(r";",'',txt)

In [None]:
txt

# Models

In [None]:
# Shared pool that each model cell below extends with its extracted keyphrases.
All_top = []

# Unsupervised

In [None]:
import pke

## Graph Based Models

### TopicRank

**NOTES on TopicRank**:
* an unsupervised, graph-based ranking model for keyphrase extraction
* uses a random-walk algorithm to estimate the importance of each topic (node)

In [None]:
# Create a TopicRank keyphrase extractor; it is reused by the following cells.
extractor = pke.unsupervised.TopicRank()

In [None]:
# Load the cleaned text into the extractor. This pre-processes the text
# (sentence splitting, tokenization, part-of-speech tagging, stemming).
extractor.load_document(input=txt, language='en')

In [None]:
# Add candidates that match the NP chunk grammar: zero or more adjectives
# followed by one or more nouns / proper nouns.
extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")

In [None]:
# Identify the keyphrase candidates using the model's own selection step.
extractor.candidate_selection()

 In **TopicRank**, candidate weighting is a three-step process:
1. candidate clustering (grouping keyphrase candidates into topics)
2. graph construction (building a complete-weighted-graph of topics)
3. rank topics (nodes) using a random walk algorithm


In [None]:
# Score the candidates: clustering into topics, building the topic graph and
# ranking the topics with a random walk (the three steps listed just above).
extractor.candidate_weighting()

In [None]:
# Get the 20 best-scoring candidates as keyphrases (stemming disabled, so the
# phrases are returned in their unstemmed form).
keyphrases = extractor.get_n_best(n=20, stemming=False)

In [None]:
# Add the TopicRank results to the shared pool.
# Re-running this cell appends the same results again.
All_top.extend(keyphrases)

In [None]:
All_top

### MultipartiteRank Model

In [None]:
# 1. create a MultipartiteRank extractor.
extractor = pke.unsupervised.MultipartiteRank()

# 2. load the content of the document.
extractor.load_document(input=txt, language='en')

# 3. select keyphrase candidates: first the NP grammar matches (adjectives
#    followed by nouns / proper nouns), then the model's own selection step,
#    the same two calls every other model cell in this notebook makes.
#    (The original repeated grammar_selection() here instead of calling
#    candidate_selection().)
extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")

extractor.candidate_selection()

# 4. weight the candidates.
extractor.candidate_weighting()

# 5. get the 20-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=20)


In [None]:
# Add the MultipartiteRank results to the shared pool.
# Re-running this cell appends the same results again.
All_top.extend(keyphrases)

In [None]:
All_top

### TopicalPageRank Model

In [None]:
# 1. create a TopicalPageRank extractor.
extractor = pke.unsupervised.TopicalPageRank()

# 2. load the content of the document.
extractor.load_document(input=txt,
                        language='en')

# 3a. add candidates matching the NP grammar: zero or more adjectives followed
#     by one or more nouns / proper nouns.
extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")

# 3b. select the noun phrases as keyphrase candidates.
extractor.candidate_selection()

# 4. weight the keyphrase candidates using Single Topical PageRank.
#    Builds a word-graph in which edges connecting two words occurring
#    in a window are weighted by co-occurrence counts.
extractor.candidate_weighting()

# 5. get the 20-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=20)

In [None]:
# Add the TopicalPageRank results to the shared pool.
# Re-running this cell appends the same results again.
All_top.extend(keyphrases)

In [None]:
All_top

## Statistical models

### FirstPhrases

In [None]:
 # 1. create a FirstPhrases baseline extractor.
extractor = pke.unsupervised.FirstPhrases()

# 2. load the content of the document.
extractor.load_document(input=txt,language='en')

# 3a. add candidates matching the NP grammar: zero or more adjectives followed
#     by one or more nouns / proper nouns.
extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")

# 3b. select the longest sequences of nouns and adjectives as candidates.
extractor.candidate_selection()

# 4. weight the candidates using their position
extractor.candidate_weighting()

# 5. get the 20-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=20)

In [None]:
# Add the FirstPhrases results to the shared pool.
# Re-running this cell appends the same results again.
All_top.extend(keyphrases)

In [None]:
All_top

### TF-IDF

In [None]:
# 1. create a TF-IDF keyphrase extractor.
extractor = pke.unsupervised.TfIdf()

# 2. load the content of the document; the language is stated explicitly,
#    as in the other model cells of this notebook.
extractor.load_document(input=txt, language='en')

# 3. select keyphrase candidates: the NP grammar matches (adjectives followed
#    by nouns / proper nouns) plus the model's own candidate selection.
extractor.grammar_selection(grammar="NP: {<ADJ>*<NOUN|PROPN>+}")

extractor.candidate_selection()

# 4. weight the keyphrase candidates.
extractor.candidate_weighting()

# 5. get the 20-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=20)

In [None]:
# Add the TF-IDF results to the shared pool.
# Re-running this cell appends the same results again.
All_top.extend(keyphrases)

In [None]:
All_top

### KPMiner Model

In [None]:
# 1. create a KPMiner extractor.
extractor = pke.unsupervised.KPMiner()

# 2. load the content of the document.
extractor.load_document(input=txt,language='en')

# 3. select {1-5}-grams that do not contain punctuation marks or
#    stopwords as keyphrase candidates. Set the least allowable seen
#    frequency to 5 and the number of words after which candidates are
#    filtered out to 200.
lasf = 5
cutoff = 200
extractor.candidate_selection(lasf=lasf, cutoff=cutoff)

# 4. weight the candidates using KPMiner weighting function.
#    No document-frequency table or alpha/sigma values are passed here; the
#    disabled lines below show the arguments that could be supplied instead.
#df = pke.load_document_frequency_file(input_file="path/to/df.tsv.gz")

#alpha = 2.3
#sigma = 3.0
# df=df, alpha=alpha, sigma=sigma

extractor.candidate_weighting()

# 5. get the 20-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=20)


In [None]:
# Add the KPMiner results to the shared pool.
# Re-running this cell appends the same results again.
All_top.extend(keyphrases)

In [None]:
All_top

# Supervised



### Kea

In [None]:
# 1. create a Kea extractor.
extractor = pke.supervised.Kea()

# 2. load the content of the document.
# NOTE(review): `stoplist` is assigned here but not passed to any call in this
# cell — confirm whether it should be supplied to the extractor or dropped.
stoplist = pke.lang.stopwords.get('en')
extractor.load_document(input=txt, language='en')

# 3. select 1-3 grams that do not start or end with a stopword as
#    candidates. Candidates that contain punctuation marks as words
#    are discarded.
extractor.candidate_selection()

# 4. classify candidates as keyphrase or not keyphrase.
#    No model file or document-frequency table is passed; the disabled lines
#    below show the arguments that could be supplied instead.
#    NOTE(review): presumably pke then falls back to its bundled defaults — confirm.
#df = pke.load_document_frequency_file(input_file='path/to/df.tsv.gz')
#model_file = 'path/to/kea_model'
#model_file=model_file,df=df
extractor.candidate_weighting()

# 5. get the 20-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=20)

In [None]:
# Add the Kea results to the shared pool.
# Re-running this cell appends the same results again.
All_top.extend(keyphrases)

In [None]:
All_top

# Compare

In [None]:
import numpy as np

In [None]:
# Convert the pooled results to an array and take the sorted unique values over
# all of its elements.
# NOTE(review): assuming All_top holds (keyphrase, score) tuples, NumPy coerces
# both fields to strings, so `u` mixes keyphrases with stringified scores —
# confirm before relying on positions inside `u`.
x = np.array(All_top)
u = np.unique(x)

In [None]:
# Sorted unique keyphrases across all models. They are taken directly from the
# first field of every pooled entry instead of slicing `u` at the hard-coded
# offset 118, which is only right when exactly that many non-keyphrase values
# happen to sort ahead of the keyphrases.
z = np.unique([entry[0] for entry in All_top])
z

In [None]:
# Keep the entries of z that appear at least twice as the first field of a
# pooled All_top item; the order of z is preserved.
l = [
  phrase
  for phrase in z
  if sum(1 for entry in All_top if phrase == entry[0]) >= 2
]

In [None]:
l

In [None]:
import nltk
nltk.download('punkt')
  
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor

def tokenize_sentences(text):
    """Split ``text`` into sentences and drop the very short ones.

    Args:
        text: The document to split.

    Returns:
        A list of sentences in document order. Only sentences longer than
        20 characters are kept, and each kept sentence has its surrounding
        whitespace stripped.
    """
    kept = []
    for candidate in sent_tokenize(text):
        # The length check is applied to the raw sentence, before stripping.
        if len(candidate) > 20:
            kept.append(candidate.strip())
    return kept

def get_sentences_for_keyword(keywords, sentences):
    """Map every keyword to the sentences that mention it.

    Args:
        keywords: Iterable of keyword strings to look for.
        sentences: Iterable of sentence strings to search.

    Returns:
        A dict with one entry per keyword. Each value is the list of sentences
        in which the keyword processor found that keyword, longest sentence
        first; keywords that were never found map to an empty list. A sentence
        is listed once per keyword even if it mentions the keyword several
        times.
    """
    keyword_processor = KeywordProcessor()
    keyword_sentences = {}
    for word in keywords:
        keyword_sentences[word] = []
        keyword_processor.add_keyword(word)

    for sentence in sentences:
        keywords_found = keyword_processor.extract_keywords(sentence)
        # One hit is reported per occurrence; dict.fromkeys collapses repeats
        # (keeping first-seen order) so the sentence is not appended twice.
        for key in dict.fromkeys(keywords_found):
            keyword_sentences[key].append(sentence)

    # Longest sentences first; sorted() is stable, so ties keep document order.
    return {
        key: sorted(found, key=len, reverse=True)
        for key, found in keyword_sentences.items()
    }

# Split the cleaned text into sentences, then map each shortlisted keyphrase in
# `l` to the sentences in which it is found.
sentences = tokenize_sentences(txt)
keyword_sentence_mapping = get_sentences_for_keyword(l, sentences)
        

In [None]:
keyword_sentence_mapping