In [None]:
! pip install sentence-transformers

SOURCES--
  1. https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
  2. https://maartengr.github.io/KeyBERT/
  3. https://github.com/ibatra/BERT-Keyword-Extractor


for sentence transformers
  1. https://www.sbert.net/docs/package_reference/SentenceTransformer.html

In [None]:
import nltk
# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load it from
# the 'punkt' package, NLTK >= 3.8.2 from 'punkt_tab', so fetch both to keep the
# notebook runnable on either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
import numpy as np
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util
class Keyword_Extractor:
    """Rank the word n-grams of a sentence by similarity to the whole sentence.

    Every n-gram of the input and the input itself are embedded with the wrapped
    model; n-grams are then scored by cosine similarity to the sentence embedding.
    """

    def __init__(self,model):
        """Store the embedding model.

        Args:
            model: either a str, which is passed to ``SentenceTransformer(model)``
                to load a model, or an already constructed model object, which is
                used as-is (it must provide ``encode``).
        """
        if isinstance(model,str):
            self.model = SentenceTransformer(model)
        else:
            self.model = model

    def get_ngrams(self,text, n_range=(1,1)):
        """Return the word n-grams of ``text`` for every size in ``n_range``.

        Args:
            text: string to split into tokens with ``word_tokenize``.
            n_range: inclusive ``(min_n, max_n)`` pair of n-gram sizes.

        Returns:
            list[str]: n-grams with their tokens joined by a single space,
            ordered by size (all ``min_n``-grams first); duplicates are kept.
        """
        min_n, max_n = n_range
        # Tokenize once: the token list is identical for every n-gram size.
        tokens = word_tokenize(text)
        n_grams = []
        for size in range(min_n, max_n + 1):
            n_grams.extend(' '.join(gram) for gram in ngrams(tokens, size))
        return n_grams

    def find_simillarity_score(self,sentence1,sentence2):
        """Return the cosine similarity between the embeddings of two inputs.

        Args:
            sentence1: first input handed to ``self.model.encode``.
            sentence2: second input handed to ``self.model.encode``.

        Returns:
            Whatever ``util.pytorch_cos_sim`` returns for the two embeddings
            (presumably a 2-D tensor of pairwise scores -- confirm against the
            sentence-transformers documentation).
        """
        # encode sentences to get their embeddings
        embedding1 = self.model.encode(sentence1, convert_to_tensor=True)
        embedding2 = self.model.encode(sentence2, convert_to_tensor=True)
        # compute similarity scores of two embeddings
        cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
        return cosine_scores

    def extract_keywords(self,sentence,ngram_range=(1,2),top_k=5,do_lower=True):
        """Return the n-grams of ``sentence`` most similar to the sentence itself.

        Args:
            sentence: text to extract keywords from.
            ngram_range: inclusive ``(min_n, max_n)`` sizes of candidate n-grams.
            top_k: maximum number of keywords to return.
            do_lower: lower-case ``sentence`` before tokenizing and embedding.

        Returns:
            list[tuple[str, float]]: ``(n_gram, cosine_score)`` pairs sorted from
            highest to lowest score. Holds fewer than ``top_k`` entries when the
            sentence has fewer distinct n-grams, and is empty when there are no
            candidates or ``top_k`` is not positive.
        """
        if do_lower:
            sentence = sentence.lower()
        n_grams = self.get_ngrams(sentence, n_range=ngram_range)
        # dict.fromkeys removes duplicates while keeping first-seen order, so the
        # candidate order (and therefore tie-breaking) is the same on every run;
        # a set gives no such guarantee.
        n_grams = list(dict.fromkeys(a.strip() for a in n_grams))
        if not n_grams or top_k <= 0:
            return []
        # encode corpus to get corpus embeddings
        corpus_embeddings = self.model.encode(n_grams, convert_to_tensor=True)
        sentence_embedding = self.model.encode(sentence, convert_to_tensor=True)
        # compute similarity scores of the sentence with the corpus
        cos_scores = util.pytorch_cos_sim(sentence_embedding, corpus_embeddings)[0]
        # Rank on plain Python floats: this never hands a (possibly GPU-resident)
        # tensor to NumPy, and slicing simply yields fewer results when there are
        # fewer candidates than top_k instead of raising.
        scores = cos_scores.tolist()
        ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)
        keywords = [(n_grams[idx], scores[idx]) for idx in ranked[:top_k]]
        return keywords

BEST PERFORMING MODELS FOR THIS TASK ::
  1. base-nli-stsb-mean-tokens
  2. xlm-r-distilroberta-base-paraphrase-v1
  3. stsb-roberta-large

In [None]:
# Load the embedding model once and wrap it in the extractor; `model` and
# `key_extractor` are reused by the cells below.
# Other checkpoints to try: base-nli-stsb-mean-tokens or xlm-r-distilroberta-base-paraphrase-v1
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Larger alternative (swap in by uncommenting):
# model =  SentenceTransformer('stsb-roberta-large')
key_extractor = Keyword_Extractor(model)

In [None]:
# Sample headlines. NOTE: the second assignment overwrites the first, so only the
# last headline is analysed -- comment one out to switch between them.
# `text` is also read by the CountVectorizer-based cell further down.
text = "Is Rahul Gandhi's scriptwriter from Peking or Pindi, asks BJP MP Rajyavardhan Rathore"
text = "Video: Students Stopped At Second Karnataka College Over Hijab"
# Default settings: uni- and bi-gram candidates, top 5, lower-cased.
key_extractor.extract_keywords(text)

ANOTHER APPROACH 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def extract_keywords(doc,model,n_gram_range = (1, 1),stop_words = "english",top_n = 5):
    """Pick the ``top_n`` n-grams of ``doc`` whose embeddings are closest to the document's.

    Candidate n-grams are the vocabulary a ``CountVectorizer`` learns from ``doc``.

    Args:
        doc: text to extract keywords from.
        model: object whose ``encode(list_of_str)`` returns one embedding per
            string (assumed to be a 2-D array accepted by ``cosine_similarity``).
        n_gram_range: forwarded to ``CountVectorizer(ngram_range=...)``.
        stop_words: forwarded to ``CountVectorizer(stop_words=...)``.
        top_n: maximum number of keywords to return.

    Returns:
        list[str]: the selected candidates in ascending order of cosine
        similarity (the most similar keyword is last); empty when ``top_n`` is
        not positive.
    """
    if top_n <= 0:
        # Guard: a slice of [-0:] would otherwise return every candidate.
        return []
    count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for old installations.
    if hasattr(count, "get_feature_names_out"):
        candidates = list(count.get_feature_names_out())
    else:
        candidates = count.get_feature_names()
    doc_embedding = model.encode([doc])
    candidate_embeddings = model.encode(candidates)
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    # argsort is ascending, so the last top_n indices are the most similar ones.
    keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
    return keywords
# Try the CountVectorizer-based extractor on the sample headline with uni- and
# bi-gram candidates; relies on `text` and `model` defined in earlier cells.
keywords = extract_keywords(text,model,n_gram_range = (1, 2),stop_words = "english",top_n = 5)
keywords

FIND THE N MOST SIMILAR SENTENCES IN A CORPUS OF SENTENCES

In [None]:
corpus = ["I like Python because I can build AI applications",
          "I like Python because I can do data analytics",
          "The cat sits on the ground",
          "The cat walks on the sidewalk"]
# encode corpus to get corpus embeddings
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
sentence = "I like Javascript because I can build web applications"
# encode sentence to get sentence embeddings
sentence_embedding = model.encode(sentence, convert_to_tensor=True)
# top_k results to return (never more than the corpus holds)
top_k = min(2, len(corpus))
# compute similarity scores of the sentence with the corpus
cos_scores = util.pytorch_cos_sim(sentence_embedding, corpus_embeddings)[0]
# Sort the results in decreasing order and get the first top_k.
# The scores are copied to plain Python floats first: tolist() works wherever the
# tensor lives, whereas NumPy cannot read a tensor that sits on a GPU.
scores = cos_scores.tolist()
top_results = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_k]
print("Sentence:", sentence, "\n")
print("Top", top_k, "most similar sentences in corpus:")
for idx in top_results:
    print(corpus[idx], "(Score: %.4f)" % scores[idx])

TESTING KEYWORD EXTRACTOR ----

Scraping some articles from Google News:

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
topics = ['WORLD', 'NATION', 'BUSINESS', 'TECHNOLOGY', 'ENTERTAINMENT', 'SCIENCE', 'SPORTS', 'HEALTH']
def make_topic_url(lang,topic,hours=3):
    """Build the Google News RSS headlines URL for one topic section.

    Args:
        lang: language code inserted into the ``hl`` and ``ceid`` query parts.
        topic: section name, matched case-insensitively against ``topics``.
        hours: number inserted into the ``when%3A{hours}h`` query part.

    Returns:
        str: the feed URL (region is fixed to ``IN``), or the literal string
        ``"invalid topic"`` when ``topic`` is not one of ``topics``.
    """
    query = f"when%3A{hours}h&hl={lang}&gl=IN&ceid=IN:{lang}"
    section = topic.upper()
    # Guard clause: unknown sections get the sentinel string, not a URL.
    if section not in topics:
        return "invalid topic"
    return f'https://news.google.com/rss/headlines/section/topic/{section}?' + query

def format_article(article):
    """Copy the text of an item's main child elements into a plain dict.

    Args:
        article: object exposing ``title``, ``link``, ``pubDate`` and
            ``description`` attributes, each with a ``.text`` value.

    Returns:
        dict: keys ``'title'``, ``'link'``, ``'pubDate'``, ``'description'``
        (in that order) mapped to the corresponding ``.text``.
    """
    fields = ('title', 'link', 'pubDate', 'description')
    return {field: getattr(article, field).text for field in fields}

In [None]:
# Fetch the latest headlines of one topic section and parse the RSS items.
lang, topic, hours = 'en', 'BUSINESS', 1
topic_url = make_topic_url(lang, topic, hours)
# A timeout keeps a stalled connection from hanging the cell forever, and
# raise_for_status() surfaces HTTP errors instead of parsing an error page as a feed.
http_response = requests.get(topic_url, timeout=30)
http_response.raise_for_status()
response = BeautifulSoup(http_response.content, 'xml')
articles = [format_article(article) for article in response.find_all("item")]
print(len(articles))
# The feed can be empty for a short time window; show the first article only if there is one.
articles[0] if articles else None

As we can see, the keywords with a score greater than 0.5 look good! Yay! Happy extraction!

In [None]:
# Run the extractor over the first ten scraped headlines and print each title
# followed by its (n-gram, score) keywords.
# Note: the loop rebinds the notebook-level names `text` and `keywords`.
for article in articles[:10]:
    text = article['title']
    print(text)
    keywords = key_extractor.extract_keywords(text,do_lower=True)
    print(keywords)
    print("-----------------------------------------------------------------------")