In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') 
import re
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to C:\Users\Sandesh
[nltk_data]     Rangreji\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#DOWNLOAD GLOVE WORD EMBEDDINGS
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove*.zip

In [3]:
# Extract word vectors

def wordEmbeddings():  
    word_embeddings = {}
    f = open('glove.6B.100d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs
    f.close()
    return word_embeddings

In [4]:
#TEXT PREPROCESSING
def preprocess(sample):
    # remove punctuations and special characters
    clean_sample = re.sub('\W+',' ', sample )
    # make alphabets lowercase
    clean_sample=clean_sample.lower()
    #remove stopwords
    clean_sample= remove_stopwords(clean_sample.split())
    return clean_sample


In [5]:
# function to remove stopwords
nltk.download('stopwords')
def remove_stopwords(sen):
    from nltk.corpus import stopwords
    stop_words = stopwords.words('english')
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

[nltk_data] Downloading package stopwords to C:\Users\Sandesh
[nltk_data]     Rangreji\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def word_list(clean_samples):
    words=[]
    for k in clean_samples.split():
        words.append(k)
    return words

In [7]:
def textRank(clean_samples):
    words=word_list(clean_samples)
    
    # initializes similarity matrix
    sim_mat = np.zeros([len(words), len(words)])
    
    from sklearn.metrics.pairwise import cosine_similarity
    
    #calculates similarity matrix for the sentence/word list
    i=0
    j=0
    for a in words:
        for b in words:
            if i != j:
                sim_mat[i][j] = cosine_similarity(word_embeddings[a].reshape(1,100), word_embeddings[b].reshape(1,100))[0,0]
                j=j+1
        i=i+1
    
    #PAGE RANK
    import networkx as nx
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    #calulates most important words
    ranked_words = sorted(((scores[i],s) for i,s in enumerate(words)), reverse=True)
    return ranked_words

In [8]:
word_embeddings=wordEmbeddings()    #excecution takes time

In [9]:
samples=["What was the scoreline of the manu match yesterday night?",
         "Read me my emails",
         "Turn on the lights in the hall.",
         "How many coronavirus cases are there in India?"]
#print(samples)
for sample in samples:
    clean_sample=preprocess(sample)
    ranked_words=textRank(clean_sample)
    print(sample)
    for i in range(len(ranked_words)):
      print(ranked_words[i][1])

What was the scoreline of the manu match yesterday night?
match
manu
yesterday
night
scoreline
Read me my emails
read
emails
Turn on the lights in the hall.
lights
turn
hall
How many coronavirus cases are there in India?
cases
coronavirus
india
many
