In [1]:
import numpy as np     # numerical arrays (used for the word and sentence vectors)
import pandas as pd    # tabular data handling (used to read the dataset)
import nltk
nltk.download('punkt') # tokenizer data for sentence splitting; only needs to be downloaded once
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv("dataset.csv") # load the task dataset, which was converted to a CSV file beforehand

In [3]:
df.head()    # display the first 5 rows to confirm the dataset was read successfully

Unnamed: 0.1,Unnamed: 0,Introduction
0,,Acnesol Gel is an antibiotic that fights bacte...
1,,Ambrodil Syrup is used for treating various re...
2,,Augmentin 625 Duo Tablet is a penicillin-type ...
3,,Azithral 500 Tablet is an antibiotic used to t...
4,,Alkasol Oral Solution is a medicine used in th...


In [4]:
from nltk.tokenize import sent_tokenize

# Split every description into sentences and collect them all in one flat
# list, keeping the order in which they appear in the dataset.
sentences = [
    sentence
    for description in df['Introduction']
    for sentence in sent_tokenize(description)
]
sentences[:5]

['Acnesol Gel is an antibiotic that fights bacteria.',
 'It is used to treat acne, which appears as spots or pimples on your face, chest or back.',
 'This medicine works by attacking the bacteria that cause these pimples.Acnesol Gel is only meant for external use and should be used as advised by your doctor.',
 'You should normally wash and dry the affected area before applying a thin layer of the medicine.',
 'It should not be applied to broken or damaged skin.']

In [5]:

# GloVe is an unsupervised learning algorithm for obtaining vector representations for words.
# These word embeddings will be used to create vectors for our sentences.

# Extract word vectors: each line of the file holds a word followed by its
# coefficients, separated by whitespace.
word_embeddings = {}
# The context manager closes the file even if parsing a line raises.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [6]:
# Remove punctuation, numbers and special characters.
# regex=True is required: since pandas 2.0, str.replace treats the pattern as a
# literal string by default, which would leave the sentences unchanged.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

# Convert every sentence to lowercase.
clean_sentences = [s.lower() for s in clean_sentences]

# Stopwords are words that are not useful to include in the summary;
# fetch the nltk stopword corpus so they can be filtered out.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords           # stopword lists from the nltk library
# English stopwords; remove_stopwords checks tokens against this collection.
stop_words = stopwords.words('english')

# function to remove stopwords

def remove_stopwords(sen):
    """Return the tokens of ``sen`` that are not stopwords, joined by spaces.

    ``sen`` is an iterable of word tokens. Each token is checked against the
    module-level ``stop_words`` collection and kept only if it is absent.
    """
    kept_tokens = (token for token in sen if token not in stop_words)
    return " ".join(kept_tokens)

# Remove stopwords from every cleaned sentence.
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

# The GloVe vectors are already held in `word_embeddings`, loaded by the
# earlier "Extract word vectors" cell. The file is not parsed a second time
# here: doing so would only rebuild an identical dictionary at the cost of
# re-reading the whole embedding file.

In [8]:
# Build one vector per cleaned sentence: the sum of its word vectors divided by
# (word count + 0.001). Words without an embedding contribute a zero vector,
# and an empty sentence maps to the zero vector.
sentence_vectors = []
for cleaned in clean_sentences:
    if len(cleaned) == 0:
        vec = np.zeros((100,))
    else:
        tokens = cleaned.split()
        word_vecs = [word_embeddings.get(tok, np.zeros((100,))) for tok in tokens]
        vec = sum(word_vecs) / (len(tokens) + 0.001)
    sentence_vectors.append(vec)

In [9]:
# Square matrix, one row and one column per sentence, that will hold the
# pairwise sentence similarities; every entry starts at zero.
sim_mat = np.zeros([len(sentences), len(sentences)])

In [10]:
#now I am using Cosine Similarity to compute the similarity between a pair of sentences

from sklearn.metrics.pairwise import cosine_similarity


In [11]:

# Fill the matrix with cosine similarity scores.
# A single vectorised call scores every pair of sentences at once instead of
# invoking cosine_similarity once per pair from a nested Python loop.
if sentence_vectors:
    sim_mat[:, :] = cosine_similarity(np.vstack(sentence_vectors))
    # A sentence is not compared with itself, so the diagonal stays at zero.
    np.fill_diagonal(sim_mat, 0.0)

In [12]:
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)     # convert the similarity matrix sim_mat into a graph
scores = nx.pagerank(nx_graph)              # score the graph's nodes with the PageRank algorithm

In [14]:
# Pair every sentence with its score, then order the pairs from the highest
# score to the lowest.
ranked_sentences = [(scores[position], text) for position, text in enumerate(sentences)]
ranked_sentences.sort(reverse=True)

In [48]:

# Demonstration of the model on the first description only.
# The author reports that the approach also works on the other descriptions,
# but that processing the whole dataset was slow on their machine.

# Row shown in the demo. It must be the first row because the summaries below
# are built from sentences[0:3], the opening sentences of that description.
example_row = 0

print("Initial Data :-")
# Use an explicit row index rather than the stray name `j`, which this cell
# never defines.
print(df['Introduction'].iloc[example_row])
print("\n")
print("Short Summary :- ")     # of around 20 words
# Join with a space so consecutive sentences do not run together.
print(" ".join(sentences[:2]))
print("\n")
print("Long Summary :- ")     # of around 70 words
print(" ".join(sentences[:3]))
# Append the 2 highest-ranked sentences to the long summary, since they cover
# important precautions.
# NOTE(review): ranked_sentences is ranked over the sentences of every
# description, so these may come from a different description - confirm intent.
for _, top_sentence in ranked_sentences[:2]:
    print(top_sentence)


Initial Data :-
Acnesol Gel is an antibiotic that fights bacteria. It is used to treat acne, which appears as spots or pimples on your face, chest or back. This medicine works by attacking the bacteria that cause these pimples.Acnesol Gel is only meant for external use and should be used as advised by your doctor. You should normally wash and dry the affected area before applying a thin layer of the medicine. It should not be applied to broken or damaged skin. Avoid any contact with your eyes, nose, or mouth. Rinse it off with water if you accidentally get it in these areas. It may take several weeks for your symptoms to improve, but you should keep using this medicine regularly. Do not stop using it as soon as your acne starts to get better. Ask your doctor when you should stop treatment.Common side effects like minor itching, burning, or redness of the skin and oily skin may be seen in some people. These are usually temporary and resolve on their own. Consult your doctor if they both