In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import re
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from numpy.linalg import svd
import networkx as nx
from rouge import Rouge

In [2]:
# Load the article to summarise. The context manager closes the handle as soon
# as the contents are read; the original left the file open for the kernel's lifetime.
with open('ext_file.txt', 'r') as file:
    text = file.read()
text

"The Ghaziabad municipal corporation plans to create 20 'mini forests' across the city by the end of this year in a bid to improve the greenery and forest cover, said officials on Wednesday. Officials added that four other plantation sites which were developed as a pilot project have shown a plant survival rate of about 95%. The corporation took up the four sites at Sai Upwan (city forest near GT Road where it planted 2,100 trees), Pratap Vihar (5,000 trees), UP Gate (5,000 trees) and Dayanand Park in Kavi Nagar industrial area (about 25,000 trees). The plantation drive at these four sites was initiated in 2021 as part of urban forestry and to spruce up the green cover. “Overall, the sites have shown a high plant survival rate of about 95% and one of the sites at Sai Upwan has almost 100% survival rate due to its proximity to the Hindon river. The plantation drive at these sites was taken up under the 'Miyawaki' method (planting two to four trees per square metre area). Further, we hav

In [3]:
# Reference summary that the generated summaries are scored against with ROUGE;
# kept as a one-element list because that is how it is passed to evaluation().
reference = ["""The Ghaziabad municipal corporation is planning to create 20 'mini-forests' across the city by the end of this year to improve the greenery and forest cover, officials said. Officials added that four other plantation sites which were developed as a pilot project, showed a plant survival rate of about 95%. The plantation drive at these sites was initiated in 2021."""]
reference

["The Ghaziabad municipal corporation is planning to create 20 'mini-forests' across the city by the end of this year to improve the greenery and forest cover, officials said. Officials added that four other plantation sites which were developed as a pilot project, showed a plant survival rate of about 95%. The plantation drive at these sites was initiated in 2021."]

In [4]:
# Split the article into sentences; the bare name on the last line displays the list.
sentences = sent_tokenize(text)
sentences

["The Ghaziabad municipal corporation plans to create 20 'mini forests' across the city by the end of this year in a bid to improve the greenery and forest cover, said officials on Wednesday.",
 'Officials added that four other plantation sites which were developed as a pilot project have shown a plant survival rate of about 95%.',
 'The corporation took up the four sites at Sai Upwan (city forest near GT Road where it planted 2,100 trees), Pratap Vihar (5,000 trees), UP Gate (5,000 trees) and Dayanand Park in Kavi Nagar industrial area (about 25,000 trees).',
 'The plantation drive at these four sites was initiated in 2021 as part of urban forestry and to spruce up the green cover.',
 '“Overall, the sites have shown a high plant survival rate of about 95% and one of the sites at Sai Upwan has almost 100% survival rate due to its proximity to the Hindon river.',
 "The plantation drive at these sites was taken up under the 'Miyawaki' method (planting two to four trees per square metre a

In [5]:
# Length of every sentence, counted in whitespace-separated tokens.
leng = [len(sentence.split()) for sentence in sentences]
leng

[33,
 23,
 39,
 23,
 34,
 22,
 23,
 22,
 37,
 36,
 30,
 18,
 25,
 40,
 17,
 15,
 18,
 21,
 19,
 15,
 11,
 26,
 15,
 38]

In [6]:
# Word budget for each summary. It is raised to the length of the longest
# sentence when the user asks for less than that.
n = max(int(input('number of words needed in summary')), max(leng))

In [7]:
def remove_sw(text):
    """Tokenise ``text`` and drop stop words.

    The text is lower-cased, every character outside ``a-z``/``A-Z`` is
    replaced by a space, and the result is tokenised with ``word_tokenize``.
    Tokens found in ``stopwords.words()`` are discarded, as are tokens that do
    not occur as a substring of the lower-cased input.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Remaining tokens in document order; repeated words are kept.
    """
    lowered = text.lower()
    letters_only = re.sub('[^a-zA-Z]', ' ', lowered)
    text_tokens = word_tokenize(letters_only)

    # Build the stop-word lookup once. The original evaluated
    # stopwords.words() again for every token, re-reading the corpus each time.
    # NOTE(review): words() is called without a language, which presumably
    # returns the lists of every installed language -- confirm this is intended.
    stop_words = set(stopwords.words())

    # Filter in a single pass into a new list. The original removed items from
    # the list it was iterating over, which skips the element after each removal.
    return [
        word for word in text_tokens
        if word not in stop_words and word in lowered
    ]

In [8]:
def word_freq(tokens_without_sw):
    """Count how many times each token occurs.

    Parameters
    ----------
    tokens_without_sw : iterable of str
        Tokens to count.

    Returns
    -------
    dict
        Maps each distinct token to its number of occurrences, in order of
        first appearance.
    """
    counts = {}
    for token in tokens_without_sw:
        counts[token] = counts.get(token, 0) + 1
    return counts

In [9]:
def sentence_rank(sentences_list, freq_dict):
    """Score each sentence by summing the weights of the words it contains.

    A word counts for a sentence when it occurs as a substring of the
    lower-cased sentence; it contributes its weight once, however many times
    it occurs. Sentences that match no word are left out of the result.

    Parameters
    ----------
    sentences_list : iterable of str
        Sentences to score.
    freq_dict : dict
        Maps a word to its weight.

    Returns
    -------
    dict
        Maps each matched sentence to its accumulated weight.
    """
    rank = {}
    for sentence in sentences_list:
        lowered = sentence.lower()
        for word, weight in freq_dict.items():
            if word not in lowered:
                continue
            if sentence in rank:
                rank[sentence] = rank[sentence] + weight
            else:
                rank[sentence] = weight
    return rank

In [10]:
def sentence_vector(sentences,vocabulary):
    """Build a binary presence vector over ``vocabulary`` for every sentence.

    Entry ``j`` of a sentence's vector is 1 when ``vocabulary[j]`` occurs as a
    substring of the lower-cased sentence and 0 otherwise.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to encode.
    vocabulary : sequence of str
        Words that define the vector positions; it is traversed once per sentence.

    Returns
    -------
    list of list of int
        One 0/1 vector per sentence, in input order.
    """
    vectors = []
    for sentence in sentences:
        lowered = sentence.lower()
        vectors.append([1 if term in lowered else 0 for term in vocabulary])
    return vectors

### Frequency Method

In [11]:
def frequency_method(text, sentences, n):
    """Summarise by ranking sentences on the raw frequency of their words.

    Word frequencies are counted over the stop-word-free tokens of ``text``;
    each sentence is scored with ``sentence_rank`` and the highest-scoring
    sentences are concatenated while the running word count stays within ``n``.

    Parameters
    ----------
    text : str
        Full document text.
    sentences : list of str
        Sentences of the document.
    n : int
        Maximum number of whitespace-separated words in the summary.

    Returns
    -------
    str
        Selected sentences, each preceded by a single space; ``''`` when no
        sentence is ranked or the top sentence alone exceeds ``n``.
    """
    tokens_without_sw = remove_sw(text)
    freq = word_freq(tokens_without_sw)
    rank = sentence_rank(sentences, freq)

    ranked = sorted(rank, key=rank.get, reverse=True)

    # Take sentences in rank order until the next one would exceed the budget.
    # Iterating over the list avoids the IndexError the original hit when
    # nothing was ranked or when every ranked sentence fitted within n.
    summary = ''
    words_used = 0
    for sentence in ranked:
        words_used += len(sentence.split())
        if words_used > n:
            break
        summary += " " + sentence
    return summary

### Text Rank Method

In [12]:
def text_rank_method(text, sentences, n):
    """Summarise by running PageRank on a sentence-similarity graph.

    Each sentence is encoded as a binary vector over the sorted stop-word-free
    tokens of ``text`` (repeated tokens give repeated columns). Pairwise cosine
    similarity between those vectors is used as the weighted adjacency matrix
    for ``nx.pagerank``, and sentences are taken in descending score order
    while the running word count stays within ``n``.

    Parameters
    ----------
    text : str
        Full document text.
    sentences : list of str
        Sentences of the document.
    n : int
        Maximum number of whitespace-separated words in the summary.

    Returns
    -------
    str
        Selected sentences, each preceded by a single space; ``''`` when there
        is nothing to rank or the top sentence alone exceeds ``n``.
    """
    tokens_without_sw = remove_sw(text)
    vocabulary = sorted(tokens_without_sw)
    sens_vec = sentence_vector(sentences, vocabulary)

    m = np.array(sens_vec)
    if m.size == 0:
        # No sentences or no vocabulary: there is no similarity graph to build.
        return ''
    dis = 1 - pairwise_distances(m, metric="cosine")

    sen_similarity_graph = nx.from_numpy_array(dis)
    score = nx.pagerank(sen_similarity_graph).values()

    # Node i of the graph is sentence i, so scores line up with `sentences`.
    similarity = dict(zip(sentences, score))
    ranked = sorted(similarity, key=similarity.get, reverse=True)

    # Take sentences in rank order until the next one would exceed the budget.
    # Iterating over the list avoids the IndexError the original hit when every
    # sentence fitted within n.
    summary = ''
    words_used = 0
    for sentence in ranked:
        words_used += len(sentence.split())
        if words_used > n:
            break
        summary += " " + sentence
    return summary

### TF-IDF Method

In [13]:
def tf_idf_method(text, sentences, n):
    """Summarise by ranking sentences on the TF-IDF weight of their words.

    Term frequency is a word's share of all stop-word-free tokens in ``text``.
    Inverse document frequency is ``log(len(sentences) / k)`` where ``k`` is
    the number of sentences containing the word as a substring of their
    lower-cased form. Sentences are scored with ``sentence_rank`` and taken in
    descending order while the running word count stays within ``n``.

    Parameters
    ----------
    text : str
        Full document text.
    sentences : list of str
        Sentences of the document.
    n : int
        Maximum number of whitespace-separated words in the summary.

    Returns
    -------
    str
        Selected sentences, each preceded by a single space; ``''`` when no
        sentence is ranked or the top sentence alone exceeds ``n``.
    """
    tokens_without_sw = remove_sw(text)
    freq = word_freq(tokens_without_sw)
    total_words = sum(freq.values())

    # Lower-case each sentence once instead of once per (word, sentence) pair.
    lowered_sentences = [s.lower() for s in sentences]

    score = {}
    for word, count in freq.items():
        num = sum(1 for s in lowered_sentences if word in s)
        if num == 0:
            # The word occurs in none of the supplied sentences, so it cannot
            # contribute to any sentence score; skipping it also avoids the
            # division by zero the original would raise here.
            continue
        tf = count / total_words
        idf = np.log(len(sentences) / num)
        score[word] = tf * idf

    rank = sentence_rank(sentences, score)
    ranked = sorted(rank, key=rank.get, reverse=True)

    # Take sentences in rank order until the next one would exceed the budget.
    # Iterating over the list avoids the IndexError the original hit when
    # nothing was ranked or when every ranked sentence fitted within n.
    summary = ''
    words_used = 0
    for sentence in ranked:
        words_used += len(sentence.split())
        if words_used > n:
            break
        summary += " " + sentence
    return summary

### TF-IDF Method without removing stop words

In [14]:
def tf_idf_method_wsw(text, sentences, n):
    """Summarise with TF-IDF sentence scores, keeping stop words.

    ``text`` is lower-cased, every character outside ``a-z``/``A-Z`` is
    replaced by a space, and the result is tokenised with ``word_tokenize``;
    no stop-word filtering is applied. Term frequency is a token's share of
    all tokens, and inverse document frequency is ``log(len(sentences) / k)``
    where ``k`` is the number of sentences containing the token as a substring
    of their lower-cased form. Sentences are scored with ``sentence_rank`` and
    taken in descending order while the running word count stays within ``n``.

    Parameters
    ----------
    text : str
        Full document text.
    sentences : list of str
        Sentences of the document.
    n : int
        Maximum number of whitespace-separated words in the summary.

    Returns
    -------
    str
        Selected sentences, each preceded by a single space; ``''`` when no
        sentence is ranked or the top sentence alone exceeds ``n``.
    """
    lowered = text.lower()
    letters_only = re.sub('[^a-zA-Z]', ' ', lowered)

    # Filter into a new list. The original removed items from the list it was
    # iterating over, which skips the element after each removal.
    tokens_with_sw = [
        word for word in word_tokenize(letters_only) if word in lowered
    ]

    freq = word_freq(tokens_with_sw)
    total_words = sum(freq.values())

    # Lower-case each sentence once instead of once per (word, sentence) pair.
    lowered_sentences = [s.lower() for s in sentences]

    score = {}
    for word, count in freq.items():
        num = sum(1 for s in lowered_sentences if word in s)
        if num == 0:
            # The word occurs in none of the supplied sentences, so it cannot
            # contribute to any sentence score; skipping it also avoids the
            # division by zero the original would raise here.
            continue
        tf = count / total_words
        idf = np.log(len(sentences) / num)
        score[word] = tf * idf

    rank = sentence_rank(sentences, score)
    ranked = sorted(rank, key=rank.get, reverse=True)

    # Take sentences in rank order until the next one would exceed the budget.
    # Iterating over the list avoids the IndexError the original hit when
    # nothing was ranked or when every ranked sentence fitted within n.
    summary = ''
    words_used = 0
    for sentence in ranked:
        words_used += len(sentence.split())
        if words_used > n:
            break
        summary += " " + sentence
    return summary

### Low dimensional reduction Method

In [15]:
def low_dim_red(text, sentences, n, k=None):
    """Summarise by running PageRank on sentences projected into a reduced SVD space.

    Each sentence is encoded as a binary vector over the sorted stop-word-free
    tokens of ``text``. The term-by-sentence matrix is decomposed with SVD and
    every sentence is represented by its coordinates on the ``k`` strongest
    singular directions, scaled by the singular values. Cosine similarity
    between those reduced vectors (negative values clipped to 0) is used as the
    weighted adjacency matrix for ``nx.pagerank``. Sentences are taken in
    descending score order while the running word count stays within ``n``.

    The original compared the rows of the full ``VT`` matrix. That matrix is
    square and orthogonal, so the row-wise cosine similarity is the identity up
    to rounding error and the ranking was decided by floating-point noise;
    nothing was actually reduced.

    Parameters
    ----------
    text : str
        Full document text.
    sentences : list of str
        Sentences of the document.
    n : int
        Maximum number of whitespace-separated words in the summary.
    k : int, optional
        Number of singular directions to keep. Defaults to half of the
        available directions (at least 1), a heuristic choice; the value is
        clamped to the valid range.

    Returns
    -------
    str
        Selected sentences, each preceded by a single space; ``''`` when there
        is nothing to rank or the top sentence alone exceeds ``n``.
    """
    tokens_without_sw = remove_sw(text)
    vocabulary = sorted(tokens_without_sw)
    sens_vec = sentence_vector(sentences, vocabulary)
    m = np.array(sens_vec)
    if m.size == 0:
        # No sentences or no vocabulary: there is nothing to decompose.
        return ''

    # Economy-size SVD of the (terms x sentences) matrix. Column j of VT
    # belongs to sentence j; U is not needed, so the full square factor is
    # not computed.
    U, S, VT = svd(np.transpose(m), full_matrices=False)

    if k is None:
        k = len(S) // 2
    k = max(1, min(int(k), len(S)))

    # Sentence coordinates in the truncated space, one row per sentence.
    reduced = np.transpose(S[:k, np.newaxis] * VT[:k, :])

    dis = 1 - pairwise_distances(reduced, metric="cosine")
    # Reduced coordinates can be negative, so cosine similarity can be too;
    # clip to keep the edge weights passed to PageRank non-negative.
    dis = np.clip(dis, 0, None)

    sen_similarity_graph = nx.from_numpy_array(dis)
    score = nx.pagerank(sen_similarity_graph).values()

    # Node i of the graph is sentence i, so scores line up with `sentences`.
    similarity = dict(zip(sentences, score))
    ranked = sorted(similarity, key=similarity.get, reverse=True)

    # Take sentences in rank order until the next one would exceed the budget.
    # Iterating over the list avoids the IndexError the original hit when every
    # sentence fitted within n.
    summary = ''
    words_used = 0
    for sentence in ranked:
        words_used += len(sentence.split())
        if words_used > n:
            break
        summary += " " + sentence
    return summary

### Word rank Method (significant words are chosen with a similarity check, and sentences are then ranked accordingly)

In [16]:
def word_rank_method(text, sentences, n):
    """Summarise by scoring words with PageRank and ranking sentences on those scores.

    Each sentence is encoded as a binary vector over the sorted stop-word-free
    tokens of ``text``; the matrix is transposed so that every row describes
    one token by the sentences it occurs in. Cosine similarity between those
    rows is used as the weighted adjacency matrix for ``nx.pagerank``. The
    resulting word scores are passed to ``sentence_rank`` and sentences are
    taken in descending order while the running word count stays within ``n``.

    Parameters
    ----------
    text : str
        Full document text.
    sentences : list of str
        Sentences of the document.
    n : int
        Maximum number of whitespace-separated words in the summary.

    Returns
    -------
    str
        Selected sentences, each preceded by a single space; ``''`` when there
        is nothing to rank or the top sentence alone exceeds ``n``.
    """
    tokens_without_sw = remove_sw(text)
    vocabulary = sorted(tokens_without_sw)
    sens_vec = sentence_vector(sentences, vocabulary)

    m = np.transpose(np.array(sens_vec))
    if m.size == 0:
        # No sentences or no vocabulary: there is no word graph to build.
        return ''
    dis = 1 - pairwise_distances(m, metric="cosine")

    word_similarity_graph = nx.from_numpy_array(dis)
    score = nx.pagerank(word_similarity_graph).values()

    # Node i of the graph is vocabulary[i]. The vocabulary keeps repeated
    # tokens, so for a repeated word the dict keeps the score of its last node.
    similarity = dict(zip(vocabulary, score))

    rank = sentence_rank(sentences, similarity)
    ranked = sorted(rank, key=rank.get, reverse=True)

    # Take sentences in rank order until the next one would exceed the budget.
    # Iterating over the list avoids the IndexError the original hit when
    # nothing was ranked or when every ranked sentence fitted within n.
    summary = ''
    words_used = 0
    for sentence in ranked:
        words_used += len(sentence.split())
        if words_used > n:
            break
        summary += " " + sentence
    return summary

### Comparison

In [17]:
def methods(text, sentences, n):
    """Run every summarisation method on the same input.

    Parameters
    ----------
    text : str
        Full document text.
    sentences : list of str
        Sentences of the document.
    n : int
        Word budget handed to each method.

    Returns
    -------
    dict
        Maps a method label to the summary string that method produced.
    """
    return {
        'tfidf': tf_idf_method(text, sentences, n),
        'frequency': frequency_method(text, sentences, n),
        'testrank': text_rank_method(text, sentences, n),
        'tfidfwsw': tf_idf_method_wsw(text, sentences, n),
        'lowdim': low_dim_red(text, sentences, n),
        'wordrank': word_rank_method(text, sentences, n),
    }

In [18]:
def evaluation(reference, model_out):
    """Score generated summaries against references with ROUGE.

    Parameters
    ----------
    reference : list of str
        Reference summaries.
    model_out : list of str
        Generated summaries, passed as the hypotheses.

    Returns
    -------
    The result of ``Rouge.get_scores`` called with ``avg=True``.
    """
    scorer = Rouge()
    return scorer.get_scores(model_out, reference, avg=True)


In [19]:
# Run every summarisation method with the same word budget n; the bare name on
# the last line displays the label -> summary mapping.
k = methods(text, sentences, n)
k

{'tfidf': ' The corporation took up the four sites at Sai Upwan (city forest near GT Road where it planted 2,100 trees), Pratap Vihar (5,000 trees), UP Gate (5,000 trees) and Dayanand Park in Kavi Nagar industrial area (about 25,000 trees).',
 'frequency': ' The corporation took up the four sites at Sai Upwan (city forest near GT Road where it planted 2,100 trees), Pratap Vihar (5,000 trees), UP Gate (5,000 trees) and Dayanand Park in Kavi Nagar industrial area (about 25,000 trees).',
 'testrank': " The plantation drive at these four sites was initiated in 2021 as part of urban forestry and to spruce up the green cover. The Ghaziabad municipal corporation plans to create 20 'mini forests' across the city by the end of this year in a bid to improve the greenery and forest cover, said officials on Wednesday.",
 'tfidfwsw': ' A similar activity of about 30,000 trees has been planned at Shakti Khand legacy-waste cleared site in Indirapuram, while 10 other sites include industrial areas whe

In [20]:
# ROUGE scores per method label. Each summary is wrapped in a one-element list
# so that it pairs up with the one-element reference list.
rog_score = {}
for label, summary_text in k.items():
    model_out = [summary_text]
    rog_score[label] = evaluation(reference, model_out)

In [21]:
# Show the ROUGE scores collected for each method label.
print(rog_score)

{'tfidf': {'rouge-1': {'r': 0.17307692307692307, 'p': 0.25, 'f': 0.20454544971074393}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.1346153846153846, 'p': 0.19444444444444445, 'f': 0.1590909042561985}}, 'frequency': {'rouge-1': {'r': 0.17307692307692307, 'p': 0.25, 'f': 0.20454544971074393}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.1346153846153846, 'p': 0.19444444444444445, 'f': 0.1590909042561985}}, 'testrank': {'rouge-1': {'r': 0.6538461538461539, 'p': 0.723404255319149, 'f': 0.6868686818814408}, 'rouge-2': {'r': 0.4576271186440678, 'p': 0.4909090909090909, 'f': 0.4736842055324716}, 'rouge-l': {'r': 0.5961538461538461, 'p': 0.6595744680851063, 'f': 0.6262626212753801}}, 'tfidfwsw': {'rouge-1': {'r': 0.17307692307692307, 'p': 0.2571428571428571, 'f': 0.20689654691504833}, 'rouge-2': {'r': 0.01694915254237288, 'p': 0.027777777777777776, 'f': 0.021052626872023215}, 'rouge-l': {'r': 0.1346153846153846, 'p': 0.2, 'f': 0.16091953542079548}}, 'lowdim':