## Unsupervised Keyword Extraction: PKE, RAKE, YAKE

Author: Sumaia Parveen Shupti

Created on: 07/14/2021

Updated on: 08/02/2021

In [None]:
#pip install keybert
#!pip install flair
#!pip install wikipedia
#!pip install git+https://github.com/boudinfl/pke.git
#!pip install rake-nltk==1.0.4
#!pip install git+https://github.com/LIAAD/yake@f35bbc0e2f47e733233a182f69ea3aed4a82f9a5

In [None]:
import wikipedia
import pandas as pd
 
# Look up the Wikipedia page for "UFO" and keep its `content` attribute;
# `wikicontent` is the sample document passed to every extractor call below.
wikisearch = wikipedia.page("UFO")
wikicontent = wikisearch.content
# Bare last expression so the notebook displays the fetched text.
wikicontent



In [None]:
import nltk
# Download the NLTK 'stopwords' package; the TopicRank, MultipartiteRank and
# RAKE cells below import `stopwords` from `nltk.corpus`.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### PKE: TextRank

https://boudinfl.github.io/pke/build/html/unsupervised.html#graph-based-models

In [None]:
def textrank(raw_text, no_kw):
    """Extract keyphrases from ``raw_text`` with pke's ``TextRank`` extractor.

    Parameters
    ----------
    raw_text
        Document text handed to ``load_document`` (loaded as English with
        ``normalization=None``).
    no_kw
        Number of keyphrases requested; converted with ``int()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Kewrord'`` (spelling kept as-is for existing consumers) and
        ``'Weight'``, ordered by descending weight, limited to ``no_kw`` rows,
        with a fresh 0-based index.
    """
    import pke

    # Part-of-speech tags forwarded to the weighting step.
    allowed_pos = {'NOUN', 'PROPN', 'ADJ'}

    ranker = pke.unsupervised.TextRank()
    ranker.load_document(input=raw_text, language='en', normalization=None)

    # No separate candidate_selection call here: the weighting step receives
    # the co-occurrence window, the POS filter and top_percent directly.
    ranker.candidate_weighting(window=100, pos=allowed_pos, top_percent=0.33)

    top_n = int(no_kw)
    scored = ranker.get_n_best(n=top_n)

    # Each entry is indexed as (phrase, score); '=' in a phrase becomes a space.
    phrases = [entry[0].replace('=', ' ').strip() for entry in scored]
    scores = [entry[1] for entry in scored]

    table = pd.DataFrame()
    table['Kewrord'] = phrases
    table['Weight'] = scores
    ordered = table.sort_values(by='Weight', ascending=False)
    return ordered.head(top_n).reset_index(drop=True)

In [None]:
# Time TextRank on the fetched Wikipedia text and show its top 7 keyphrases.
%time textrank(wikicontent, 7)



CPU times: user 3.24 s, sys: 296 ms, total: 3.54 s
Wall time: 3.56 s


Unnamed: 0,Kewrord,Weight
0,secret usaf project blue book special report no .,0.018755
1,official government ufo - investigation agency .,0.018352
2,official air force ufo investigations .,0.017834
3,- public u.s. government ufo investigations,0.017402
4,project blue book project head edward j. ruppelt,0.016779
5,united states,0.015836
6,usaf project blue book cases .,0.014768


### PKE: SingleRank

https://boudinfl.github.io/pke/build/html/unsupervised.html#graph-based-models

In [None]:
def singlerank(raw_text, no_kw):
    """Extract keyphrases from ``raw_text`` with pke's ``SingleRank`` extractor.

    Parameters
    ----------
    raw_text
        Document text handed to ``load_document`` (loaded as English with
        ``normalization=None``).
    no_kw
        Number of keyphrases requested; converted with ``int()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Kewrord'`` (spelling kept as-is for existing consumers) and
        ``'Weight'``, ordered by descending weight, limited to ``no_kw`` rows,
        with a fresh 0-based index.
    """
    import pke

    # The same POS set drives both candidate selection and weighting.
    allowed_pos = {'NOUN', 'PROPN', 'ADJ'}

    ranker = pke.unsupervised.SingleRank()
    ranker.load_document(input=raw_text, language='en', normalization=None)

    # Candidates are selected by POS, then weighted with a 100-word window.
    ranker.candidate_selection(pos=allowed_pos)
    ranker.candidate_weighting(window=100, pos=allowed_pos)

    top_n = int(no_kw)
    scored = ranker.get_n_best(n=top_n)

    # Each entry is indexed as (phrase, score); '=' in a phrase becomes a space.
    phrases = [entry[0].replace('=', ' ').strip() for entry in scored]
    scores = [entry[1] for entry in scored]

    table = pd.DataFrame()
    table['Kewrord'] = phrases
    table['Weight'] = scores
    ordered = table.sort_values(by='Weight', ascending=False)
    return ordered.head(top_n).reset_index(drop=True)

In [None]:
# Time SingleRank on the fetched Wikipedia text and show its top 7 keyphrases.
%time singlerank(wikicontent, 7)

CPU times: user 2.67 s, sys: 83.7 ms, total: 2.75 s
Wall time: 2.76 s


Unnamed: 0,Kewrord,Weight
0,non - public u.s. government ufo investigations,0.036946
1,official air force ufo investigations,0.035122
2,secret usaf project blue book special report no .,0.033837
3,other ufo phenomena,0.030415
4,project blue book project head edward j. ruppelt,0.030004
5,pilot ufo sightings,0.028226
6,cabinet rs/33.alleged ufo sightings,0.027427


### PKE: TopicRank

https://boudinfl.github.io/pke/build/html/unsupervised.html#graph-based-models

In [None]:
def topicrank(raw_text, no_kw):
    """Extract keyphrases from ``raw_text`` with pke's ``TopicRank`` extractor.

    Parameters
    ----------
    raw_text
        Document text handed to ``load_document``; language and normalization
        are left to pke's defaults.
    no_kw
        Number of keyphrases requested; converted with ``int()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Kewrord'`` (spelling kept as-is for existing consumers) and
        ``'Weight'``, ordered by descending weight, limited to ``no_kw`` rows,
        with a fresh 0-based index.
    """
    import pke
    import string
    from nltk.corpus import stopwords

    ranker = pke.unsupervised.TopicRank()
    ranker.load_document(input=raw_text)

    # Candidate selection is restricted to these POS tags and is given a
    # stoplist made of punctuation marks, bracket placeholder tokens and the
    # NLTK English stopwords (in that order).
    allowed_pos = {'NOUN', 'PROPN', 'ADJ'}
    bracket_tokens = ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    excluded = [*string.punctuation, *bracket_tokens, *stopwords.words('english')]
    ranker.candidate_selection(pos=allowed_pos, stoplist=excluded)

    # Topic building / weighting is configured with a 0.74 threshold and
    # 'average' as the method.
    ranker.candidate_weighting(threshold=0.74, method='average')

    top_n = int(no_kw)
    scored = ranker.get_n_best(n=top_n)

    # Each entry is indexed as (phrase, score); '=' in a phrase becomes a space.
    phrases = [entry[0].replace('=', ' ').strip() for entry in scored]
    scores = [entry[1] for entry in scored]

    table = pd.DataFrame()
    table['Kewrord'] = phrases
    table['Weight'] = scores
    ordered = table.sort_values(by='Weight', ascending=False)
    return ordered.head(top_n).reset_index(drop=True)

In [None]:
# Time TopicRank on the fetched Wikipedia text and show its top 7 keyphrases.
%time topicrank(wikicontent, 7)

CPU times: user 18.8 s, sys: 267 ms, total: 19.1 s
Wall time: 19.1 s


Unnamed: 0,Kewrord,Weight
0,ufo,0.026728
1,ufo reports,0.016138
2,mass sightings,0.013505
3,conventional objects,0.011398
4,investigations,0.010903
5,project blue book,0.008147
6,royal air force,0.007616


### PKE: PositionRank

https://boudinfl.github.io/pke/build/html/unsupervised.html#graph-based-models

In [None]:
def positionrank(raw_text, no_kw):
    """Extract keyphrases from ``raw_text`` with pke's ``PositionRank`` extractor.

    Parameters
    ----------
    raw_text
        Document text handed to ``load_document`` (loaded as English with
        ``normalization=None``).
    no_kw
        Number of keyphrases requested; converted with ``int()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Kewrord'`` (spelling kept as-is for existing consumers) and
        ``'Weight'``, ordered by descending weight, limited to ``no_kw`` rows,
        with a fresh 0-based index.
    """
    import pke

    # POS tags passed to the weighting step.
    allowed_pos = {'NOUN', 'PROPN', 'ADJ'}
    # Chunk grammar for candidates: optional adjectives followed by nouns.
    np_grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

    ranker = pke.unsupervised.PositionRank()
    ranker.load_document(input=raw_text, language='en', normalization=None)

    # Candidates are noun phrases matching the grammar, capped at 3 words.
    ranker.candidate_selection(grammar=np_grammar, maximum_word_number=3)

    # Weighting uses a 100-word window over words with the allowed POS tags.
    ranker.candidate_weighting(window=100, pos=allowed_pos)

    top_n = int(no_kw)
    scored = ranker.get_n_best(n=top_n)

    # Each entry is indexed as (phrase, score); '=' in a phrase becomes a space.
    phrases = [entry[0].replace('=', ' ').strip() for entry in scored]
    scores = [entry[1] for entry in scored]

    table = pd.DataFrame()
    table['Kewrord'] = phrases
    table['Weight'] = scores
    ordered = table.sort_values(by='Weight', ascending=False)
    return ordered.head(top_n).reset_index(drop=True)

In [None]:
# Time PositionRank on the fetched Wikipedia text and show its top 7 keyphrases.
%time positionrank(wikicontent, 7)

CPU times: user 4.61 s, sys: 131 ms, total: 4.74 s
Wall time: 4.75 s


Unnamed: 0,Kewrord,Weight
0,unidentified flying object,0.055587
1,unidentified flying objects,0.049809
2,other ufo phenomena,0.042286
3,unidentified aerial objects,0.041767
4,unidentified aerial phenomena,0.040517
5,unidentified aerial phenomenon,0.039209
6,pilot ufo sightings,0.037372


### PKE: MultipartiteRank

https://boudinfl.github.io/pke/build/html/unsupervised.html#graph-based-models

In [None]:
def multipartiterank(raw_text, no_kw):
    """Extract keyphrases from ``raw_text`` with pke's ``MultipartiteRank`` extractor.

    Parameters
    ----------
    raw_text
        Document text handed to ``load_document``; language and normalization
        are left to pke's defaults.
    no_kw
        Number of keyphrases requested; converted with ``int()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Kewrord'`` (spelling kept as-is for existing consumers) and
        ``'Weight'``, ordered by descending weight, limited to ``no_kw`` rows,
        with a fresh 0-based index.
    """
    import pke
    import string
    from nltk.corpus import stopwords

    ranker = pke.unsupervised.MultipartiteRank()
    ranker.load_document(input=raw_text)

    # Candidate selection is restricted to these POS tags and is given a
    # stoplist made of punctuation marks, bracket placeholder tokens and the
    # NLTK English stopwords (in that order).
    allowed_pos = {'NOUN', 'PROPN', 'ADJ'}
    bracket_tokens = ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
    excluded = [*string.punctuation, *bracket_tokens, *stopwords.words('english')]
    ranker.candidate_selection(pos=allowed_pos, stoplist=excluded)

    # Weighting is configured with alpha=1.1 plus the same threshold/method
    # pair that the TopicRank cell uses.
    ranker.candidate_weighting(alpha=1.1, threshold=0.74, method='average')

    top_n = int(no_kw)
    scored = ranker.get_n_best(n=top_n)

    # Each entry is indexed as (phrase, score); '=' in a phrase becomes a space.
    phrases = [entry[0].replace('=', ' ').strip() for entry in scored]
    scores = [entry[1] for entry in scored]

    table = pd.DataFrame()
    table['Kewrord'] = phrases
    table['Weight'] = scores
    ordered = table.sort_values(by='Weight', ascending=False)
    return ordered.head(top_n).reset_index(drop=True)

In [None]:
# Time MultipartiteRank on the fetched Wikipedia text and show its top 7 keyphrases.
%time multipartiterank(wikicontent, 7)

CPU times: user 36.9 s, sys: 1.91 s, total: 38.8 s
Wall time: 38.8 s


Unnamed: 0,Kewrord,Weight
0,ufo,0.019035
1,ufo reports,0.011733
2,mass sightings,0.010112
3,conventional objects,0.010028
4,investigations,0.007846
5,phenomena,0.006725
6,project blue book,0.006489


### NLTK Rake

https://pypi.org/project/rake-nltk/

In [None]:
def rake(raw_text, no_kw, min_len, max_len):
    """Extract the top-scoring phrases from ``raw_text`` with rake-nltk.

    A single ``Rake`` instance is built with only the phrase-length bounds
    set, so stopwords, punctuation and ranking metric are whatever rake-nltk
    uses by default. (Earlier revisions built several differently configured
    ``Rake`` objects first and threw them away; only this last configuration
    ever took effect.)

    Parameters
    ----------
    raw_text
        Document text passed to ``extract_keywords_from_text``.
    no_kw
        Number of phrases to keep; converted with ``int()``.
    min_len, max_len
        Minimum / maximum number of words per phrase; converted with ``int()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Kewrord'`` (spelling kept as-is for existing consumers) and
        ``'Weight'``, ordered by descending weight, limited to ``no_kw`` rows,
        with a fresh 0-based index.
    """
    import re
    from rake_nltk import Rake

    top_n = int(no_kw)

    extractor = Rake(min_length=int(min_len), max_length=int(max_len))
    extractor.extract_keywords_from_text(raw_text)

    # rake-nltk yields (score, phrase) pairs -- note the order is the reverse
    # of the pke extractors in this notebook.
    ranked = extractor.get_ranked_phrases_with_scores()[:top_n]

    scores = [pair[0] for pair in ranked]
    # Collapse every run of non-word characters in a phrase into one space.
    phrases = [re.sub(r'\W+', ' ', pair[1]) for pair in ranked]

    table = pd.DataFrame()
    table['Kewrord'] = phrases
    table['Weight'] = scores
    # Tied scores are common with RAKE; a stable sort keeps tied phrases in
    # the order the library returned them instead of an arbitrary one.
    table = table.sort_values(by='Weight', ascending=False, kind='mergesort')
    return table.head(top_n).reset_index(drop=True)

In [None]:
# Top 7 RAKE phrases of the fetched Wikipedia text, 1 to 2 words per phrase.
rake(wikicontent, 7, 1, 2)

Unnamed: 0,Kewrord,Weight
0,especially,4.0
1,études spatiales,4.0
2,yves sillard,4.0
3,wore suits,4.0
4,without visible,4.0
5,white house,4.0
6,waterloo bridge,4.0


### Yake

https://github.com/LIAAD/yake

In [None]:
def yake(raw_text, no_kw, max_len):
    """Extract keywords from ``raw_text`` with YAKE.

    Parameters
    ----------
    raw_text
        Document text passed to ``extract_keywords``.
    no_kw
        Number of keywords requested; converted with ``int()``.
    max_len
        Maximum n-gram size of a keyword; converted with ``int()``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Kewrord'`` (spelling kept as-is for existing consumers) and
        ``'Weight'``, limited to ``no_kw`` rows with a fresh 0-based index.
        Rows are ordered by *ascending* weight: YAKE documents that a lower
        score means a more relevant keyword, so the best keyword comes first.
    """
    # Aliased so the library module is not confused with this function,
    # which shares its name.
    import yake as yake_lib

    top_n = int(no_kw)

    extractor = yake_lib.KeywordExtractor(n=int(max_len), top=top_n, features=None)
    # Each result is indexed as (keyword, score).
    scored = extractor.extract_keywords(raw_text)

    table = pd.DataFrame()
    table['Kewrord'] = [pair[0] for pair in scored]
    table['Weight'] = [pair[1] for pair in scored]

    # Lower YAKE score = more relevant, so sort ascending (a descending sort
    # would list the least relevant of the selected keywords first).
    table = table.sort_values(by='Weight', ascending=True).reset_index(drop=True)
    return table.head(top_n)

In [None]:
# Top 7 YAKE keywords of the fetched Wikipedia text, n-grams of up to 2 words.
yake(wikicontent, 7, 2)

Unnamed: 0,Kewrord,Weight
0,Blue Book,0.001086
1,Project Blue,0.001084
2,Air Force,0.001027
3,UFOs,0.00099
4,UFO sightings,0.000875
5,UFO reports,0.000837
6,UFO,0.000616
