In [None]:
! pip install sentence-transformers

SOURCES--
  1. https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
  2. https://maartengr.github.io/KeyBERT/
  3. https://github.com/ibatra/BERT-Keyword-Extractor


for sentence transformers
  1. https://www.sbert.net/docs/package_reference/SentenceTransformer.html

In [None]:
import nltk
# word_tokenize (used by Keyword_Extractor.get_ngrams) needs the Punkt tokenizer data.
# Recent NLTK releases look for the "punkt_tab" resource instead of the older
# "punkt" one, so request both to stay compatible with either version.
# NOTE(review): on an older NLTK the unknown "punkt_tab" id is expected to be
# reported as an error message rather than raised - confirm on the target version.
nltk.download('punkt')
nltk.download('punkt_tab')

In [119]:
import numpy as np
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
import re
class Keyword_Extractor:
    """Embedding-based keyword extractor.

    Candidate word n-grams are taken from the input text, embedded with a
    sentence-embedding model, and ranked by cosine similarity to the
    embedding of the whole text.
    """

    def __init__(self,model):
        """Store the embedding model.

        Args:
            model: either a model name/path (``str``), which is loaded with
                ``SentenceTransformer(model)``, or an already constructed
                model object, which is used as-is.
        """
        if isinstance(model,str):
            self.model = SentenceTransformer(model)
        else:
            self.model = model

    def get_ngrams(self,text, n_range=(1,1)):
        """Return every word n-gram of ``text`` as a space-joined string.

        Args:
            text: the text to tokenise with ``word_tokenize``.
            n_range: ``(min_n, max_n)``, both inclusive.

        Returns:
            A list with all n-grams for min_n first, then min_n + 1, and so on.
        """
        min_n, max_n = n_range
        # Tokenise once: the token list does not depend on n.
        tokens = word_tokenize(text)
        n_grams = []
        for n in range(min_n, max_n+1):
            n_grams.extend(' '.join(gram) for gram in ngrams(tokens, n))
        return n_grams

    def clean_string(self,string):
        """Lower-case ``string`` and strip URLs, HTML tags and special characters.

        "@" and "#" are not in the removal list, so mentions and hashtags
        keep their prefix.

        Args:
            string: raw text.

        Returns:
            The cleaned text. Whitespace is not collapsed.
        """
        string = string.lower()
        special_characters = ["!","$","%","^","&","*","+","=","?","'","{","}","[","]","<",">","~","`",":",";","|",'\n']
        # Raw string: "\S" in a normal literal is an invalid escape sequence.
        string = re.sub(r'http[s]?://\S+', '', string)
        # Replace tags with a space, whether they appear literally (<b>) or
        # HTML-escaped (&lt;b&gt;). This must run before the character
        # stripping below, which would otherwise leave "ltgt"-style debris.
        string = re.sub(r"(?:<|&lt;)/?.*?(?:>|&gt;)", " ", string)
        for a in special_characters:
            string = string.replace(a,'')
        return string

    def find_simillarity_score(self,sentence1,sentence2):
        """Return the cosine similarity between the embeddings of two inputs.

        Args:
            sentence1: first input passed to ``self.model.encode``.
            sentence2: second input passed to ``self.model.encode``.

        Returns:
            The result of ``util.pytorch_cos_sim`` on the two embeddings.
        """
        # encode sentences to get their embeddings
        embedding1 = self.model.encode(sentence1, convert_to_tensor=True)
        embedding2 = self.model.encode(sentence2, convert_to_tensor=True)
        # compute similarity scores of two embeddings
        cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
        return cosine_scores

    def extract_keywords(self,sentence,ngram_range=(1,2),top_k=5,
                         do_lower=True,do_cleaning=False):
        """Return the n-grams of ``sentence`` most similar to the sentence itself.

        Args:
            sentence: text to extract keywords from.
            ngram_range: ``(min_n, max_n)`` candidate sizes, inclusive.
            top_k: maximum number of keywords to return.
            do_lower: lower-case the text first.
            do_cleaning: run ``clean_string`` on the text first.

        Returns:
            A list of ``(ngram, score)`` tuples sorted by decreasing cosine
            similarity. It holds at most ``top_k`` entries and is empty when
            there is nothing to rank.
        """
        if do_lower:
            sentence = sentence.lower()
        if do_cleaning:
            sentence = self.clean_string(sentence)
        # Nothing to rank: avoid encoding an empty candidate list.
        if top_k <= 0 or not sentence.strip():
            return []
        n_grams = self.get_ngrams(sentence, n_range=ngram_range)
        # De-duplicate while keeping first-occurrence order so that the
        # candidate order (and therefore tie-breaking) is reproducible.
        n_grams = list(dict.fromkeys(gram.strip() for gram in n_grams))
        if not n_grams:
            return []
        # encode corpus to get corpus embeddings
        corpus_embeddings = self.model.encode(n_grams, convert_to_tensor=True)
        sentence_embedding = self.model.encode(sentence, convert_to_tensor=True)
        # compute similarity scores of the sentence with the corpus
        cos_scores = util.pytorch_cos_sim(sentence_embedding, corpus_embeddings)[0]
        # Tensor.topk returns the best scores already sorted in decreasing
        # order and does not require converting the tensor to NumPy.
        # Clamp k: short inputs can have fewer candidates than top_k.
        k = min(top_k, len(n_grams))
        top_scores, top_indices = cos_scores.topk(k)
        keywords = [(n_grams[idx], score)
                    for score, idx in zip(top_scores.tolist(), top_indices.tolist())]
        return keywords

BEST PERFORMING MODELS FOR THIS TASK ::
  1. base-nli-stsb-mean-tokens
  2. xlm-r-distilroberta-base-paraphrase-v1
  3. stsb-roberta-large

In [103]:
# Other candidates: base-nli-stsb-mean-tokens or xlm-r-distilroberta-base-paraphrase-v1
# Load the embedding model once; the same object is passed to the extractor below
# and reused by later cells.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# model =  SentenceTransformer('stsb-roberta-large')
# Keyword_Extractor accepts an already-constructed model and uses it as-is.
key_extractor = Keyword_Extractor(model)

In [107]:
# Alternative headline to try (previously assigned and immediately overwritten):
# text = "Is Rahul Gandhi's scriptwriter from Peking or Pindi, asks BJP MP Rajyavardhan Rathore"
text = "Video: Students Stopped At Second Karnataka College Over Hijab"
key_extractor.extract_keywords(text)

[('karnataka college', 0.473552405834198),
 ('over hijab', 0.4525412321090698),
 ('hijab', 0.44046491384506226),
 ('students stopped', 0.4034881293773651),
 ('karnataka', 0.400024950504303)]

ANOTHER APPROACH 

In [108]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def extract_keywords(doc,model,n_gram_range = (1, 1),stop_words = "english",top_n = 5):
    """Pick the candidate n-grams of ``doc`` closest to ``doc`` in embedding space.

    Candidates are the vocabulary that ``CountVectorizer`` builds from ``doc``
    with the given n-gram range and stop-word setting.

    Args:
        doc: the text to extract keywords from.
        model: object whose ``encode(list_of_str)`` returns one embedding per string.
        n_gram_range: ``(min_n, max_n)`` passed to ``CountVectorizer``.
        stop_words: stop-word setting passed to ``CountVectorizer``.
        top_n: maximum number of keywords to return.

    Returns:
        Up to ``top_n`` candidate strings, ordered from least to most similar
        (the best keyword is last).
    """
    if top_n <= 0:
        # A slice of [-0:] would select every candidate rather than none.
        return []
    count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and fall back only for old installations.
    if hasattr(count, "get_feature_names_out"):
        candidates = count.get_feature_names_out().tolist()
    else:
        candidates = count.get_feature_names()
    doc_embedding = model.encode([doc])
    candidate_embeddings = model.encode(candidates)
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    # argsort is ascending, so the last top_n indices are the most similar ones.
    keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
    return keywords
# Rank the uni- and bi-gram candidates of the current headline.
keywords = extract_keywords(
    text,
    model,
    n_gram_range=(1, 2),
    stop_words="english",
    top_n=5,
)
keywords



['students stopped',
 'video students',
 'hijab',
 'karnataka college',
 'college hijab']

FIND THE N MOST SIMILAR SENTENCES IN A CORPUS OF SENTENCES

In [105]:
corpus = ["I like Python because I can build AI applications",
          "I like Python because I can do data analytics",
          "The cat sits on the ground",
         "The cat walks on the sidewalk"]
# encode corpus to get corpus embeddings
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
sentence = "I like Javascript because I can build web applications"
# encode sentence to get sentence embeddings
sentence_embedding = model.encode(sentence, convert_to_tensor=True)
# top_k results to return
top_k=2
# compute similarity scores of the sentence with the corpus
cos_scores = util.pytorch_cos_sim(sentence_embedding, corpus_embeddings)[0]
# Tensor.topk returns the highest scores already sorted in decreasing order and
# avoids converting the tensor to NumPy. k is clamped so that a top_k larger
# than the corpus cannot raise.
top_scores, top_results = cos_scores.topk(min(top_k, len(corpus)))
print("Sentence:", sentence, "\n")
print("Top", top_k, "most similar sentences in corpus:")
for score, idx in zip(top_scores.tolist(), top_results.tolist()):
    print(corpus[idx], "(Score: %.4f)" % score)

Sentence: I like Javascript because I can build web applications 

Top 2 most similar sentences in corpus:
I like Python because I can build AI applications (Score: 0.8775)
I like Python because I can do data analytics (Score: 0.8344)


TESTING KEYWORD EXTRACTOR ----

Scraping some articles from Google News ::--

In [109]:
import requests
from bs4 import BeautifulSoup

In [112]:
topics = ['WORLD', 'NATION', 'BUSINESS', 'TECHNOLOGY', 'ENTERTAINMENT', 'SCIENCE', 'SPORTS', 'HEALTH']
def make_topic_url(lang,topic,hours=3):
    """Build the Google News RSS headline URL for a topic section.

    Args:
        lang: language code inserted into the ``hl`` and ``ceid`` query parameters.
        topic: section name, matched case-insensitively against ``topics``.
        hours: value inserted into the ``when%3A{hours}h`` query parameter.

    Returns:
        The feed URL, or the string ``"invalid topic"`` when ``topic`` is not
        one of ``topics``.
    """
    section = topic.upper()
    # Guard clause: unknown sections yield the sentinel string, not a URL.
    if section not in topics:
        return "invalid topic"
    query = f"when%3A{hours}h&hl={lang}&gl=IN&ceid=IN:{lang}"
    return f'https://news.google.com/rss/headlines/section/topic/{section}?' + query

def format_article(article):
    """Flatten one parsed feed item into a plain dictionary.

    Args:
        article: object whose ``title``, ``link``, ``pubDate`` and
            ``description`` attributes each expose a ``.text`` value.

    Returns:
        A dict with those four keys, in that order, mapped to the
        corresponding ``.text`` values.
    """
    fields = ('title', 'link', 'pubDate', 'description')
    return {field: getattr(article, field).text for field in fields}

In [115]:
lang,topic,hours='en','BUSINESS',1
topic_url = make_topic_url(lang,topic,hours)
# Without a timeout requests can wait indefinitely on an unresponsive server;
# raise_for_status surfaces HTTP errors here instead of parsing an error page.
http_response = requests.get(topic_url, timeout=30)
http_response.raise_for_status()
response = BeautifulSoup(http_response.content,'xml')
articles = [format_article(article) for article in response.find_all("item")]
print(len(articles))
articles[0]

69


{'description': '<ol><li><a href="https://news.google.com/__i/rss/rd/articles/CBMicGh0dHBzOi8vd3d3LmxpdmVtaW50LmNvbS9jb21wYW5pZXMvbmV3cy9iYW5rcy1hY3QtYWdhaW5zdC1mdXR1cmUtcmV0YWlsLWZvci1taXNzaW5nLXBheW1lbnRzLTExNjQzODc0MTQ0NDIxLmh0bWzSAQA?oc=5" target="_blank">Banks act against Future Retail for missing payments</a>&nbsp;&nbsp;<font color="#6f6f6f">Mint</font></li><li><a href="https://news.google.com/__i/rss/rd/articles/CBMiowFodHRwczovL3d3dy5tb25leWNvbnRyb2wuY29tL25ld3MvdHJlbmRzL2xlZ2FsLXRyZW5kcy9iYW5rcy1zdGFydC10YWdnaW5nLWZ1dHVyZS1yZXRhaWwtYXMtbnBhLXVyZ2UtYXBleC1jb3VydC10by1hbGxvdy1iaWRkaW5nLWJ5LWFtYXpvbi1hbmQtcmVsaWFuY2UtODAzMjA4MS5odG1s0gGnAWh0dHBzOi8vd3d3Lm1vbmV5Y29udHJvbC5jb20vbmV3cy90cmVuZHMvbGVnYWwtdHJlbmRzL2JhbmtzLXN0YXJ0LXRhZ2dpbmctZnV0dXJlLXJldGFpbC1hcy1ucGEtdXJnZS1hcGV4LWNvdXJ0LXRvLWFsbG93LWJpZGRpbmctYnktYW1hem9uLWFuZC1yZWxpYW5jZS04MDMyMDgxLmh0bWwvYW1w?oc=5" target="_blank">Banks start tagging Future Retail as NPA, urge apex court to allow bidding by Amazon and Reliance</a>&

As we can see, the keywords with a score greater than 0.5 look good! Yay! Happy extraction!

In [118]:
# Print every scraped headline followed by its extracted keywords and a separator.
for article in articles:
    text = article['title']
    keywords = key_extractor.extract_keywords(text, do_lower=True)
    print(
        text,
        keywords,
        "-----------------------------------------------------------------------",
        sep="\n",
    )

Banks act against Future Retail for missing payments - Mint
[('missing payments', 0.6088423728942871), ('banks act', 0.5037549734115601), ('for missing', 0.4212077856063843), ('act against', 0.402449369430542), ('banks', 0.39925798773765564)]
-----------------------------------------------------------------------
Facebook Owner Meta Set For $195 Billion Wipeout, Biggest In Market History - NDTV
[('billion wipeout', 0.509151816368103), ('facebook owner', 0.4544554650783539), ('195 billion', 0.4217804968357086), ('facebook', 0.28109070658683777), ('$ 195', 0.2676222324371338)]
-----------------------------------------------------------------------
India's Prime Minister Modi: Digital Rupee Will Strengthen Digital Economy, Revolutionize Fintech – Bitcoin News - Bitcoin News
[('revolutionize fintech', 0.6200138926506042), ('strengthen digital', 0.5329804420471191), ('digital economy', 0.5244150161743164), ('bitcoin news', 0.43852460384368896), ('fintech –', 0.3757169544696808)]
-----------

ANOTHER APPROACH ||| SIMPLE BASIC |||


RAKE, short for Rapid Automatic Keyword Extraction, is a domain-independent keyword extraction algorithm that tries to determine the key phrases in a body of text by analyzing how frequently each word appears and how often it co-occurs with other words in the text.

In [None]:
! pip install rake-nltk

In [125]:
import rake_nltk
# Fetch NLTK's stopword corpus (presumably what rake_nltk.Rake() uses by default - confirm).
# Relies on `nltk` having been imported in an earlier cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [123]:
def extract_keywords_using_rake(string):
    """Extract key phrases from ``string`` with a default-configured RAKE instance.

    Args:
        string: the text to analyse.

    Returns:
        The phrases returned by ``Rake.get_ranked_phrases()`` for this text.
    """
    extractor = rake_nltk.Rake()
    extractor.extract_keywords_from_text(string)
    return extractor.get_ranked_phrases()

In [126]:
# `text` is whatever an earlier cell last assigned to it (the headline loop rebinds
# it on every iteration), so this shows RAKE's phrases for that headline.
print(text)
extract_keywords_using_rake(text)

India's sovereign wealth fund invests in FirstCry; SoftBank, others offload stake in secondary deal - Economic Times


['sovereign wealth fund invests',
 'others offload stake',
 'secondary deal',
 'economic times',
 'softbank',
 'india',
 'firstcry']