In [1]:
import nltk
import spacy
import numpy as np
import pandas as pd
import networkx as nx
from nltk.corpus import stopwords
from sumy.utils import get_stop_words
from sumy.nlp.stemmers import Stemmer
from __future__ import absolute_import
from nltk.tokenize import sent_tokenize
from sumy.parsers.html import HtmlParser
from gensim.summarization import keywords
from sumy.nlp.tokenizers import Tokenizer
from gensim.summarization import summarize
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sklearn.metrics.pairwise import cosine_similarity
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from __future__ import division, print_function, unicode_literals

In [2]:
file = "Our_Past_1_Chapter2.txt"

In [3]:
para_data = [[]]
para_name = ''
chapter_name = ''
EndOfChap = 0
with open(file, encoding="utf8") as f:
    for count, line in enumerate(f):
        if EndOfChap < 2:
            if 'CHAPTER' in line:
                chapter_name = line.split(':')[1].strip()
                continue
                
            if line == '\n':
                para_data.append([])
                EndOfChap +=1
                
            elif para_data[-1] == [] and para_name == '':
                para_name = line
                
            else:
                para_descr = line
                para_data[-1] = [chapter_name, para_name, para_descr]
                para_name = ''
                EndOfChap  = 0
                

para_data = [i for i in para_data if i != []]
for o_index in range(len(para_data)):
    for i_index in range(len(para_data[o_index])):
        para_data[o_index][i_index] = para_data[o_index][i_index].strip()

In [4]:
df_para = pd.DataFrame(para_data, columns = ['Chapter Name', 'Paragraph Title', 'Description'])
df_para.index = np.arange(1,len(df_para)+1)

In [5]:
df_para.head(5)

Unnamed: 0,Chapter Name,Paragraph Title,Description
1,FROM HUNTING-GATHERING TO GROWING FOOD,Tushar’s train journey,Tushar was going from Delhi to Chennai for his...
2,FROM HUNTING-GATHERING TO GROWING FOOD,The earliest people: why were they on the move ?,We know about people who lived in the subconti...
3,FROM HUNTING-GATHERING TO GROWING FOOD,How do we know about this place ?,Archaeologists have found some of the things h...
4,FROM HUNTING-GATHERING TO GROWING FOOD,Choose a place to live in,Look at Map 2 below. All the places marked wit...
5,FROM HUNTING-GATHERING TO GROWING FOOD,Rock paintings and what they tell us,Many of the caves in which these early people ...


In [6]:
def remove_stopwords(sen):
    stop_words = stopwords.words('english')
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

def Custom_Summary(sentences,n):
    #print('No. of Sents: ',len(sentences))
    word_embeddings = {}
    sentence_vectors = []
    summary = ''
    #print('Loading Word Embeddings...')
    
    f1 = open('glove.6B.100d.txt', encoding='utf-8')
    for line in f1:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs
    f.close()
    #print('Word Embeddings Loaded...')
    
    clean_sentences = pd.Series(sentences).str.strip().replace("[^a-zA-Z]", " ")   
    clean_sentences = [s.lower() for s in clean_sentences]
    stop_words = stopwords.words('english')
    clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]
    #print('Sentence Cleaning Completed....')
    
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)
    #print('Vectorization Completed....')
    
    sim_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]
    #print('Similarity Mattrix Formed....')
    
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
    #print('Graph Formed....')
    for i in range(n):
        summary = summary + ranked_sentences[i][1] + ' '

    return summary.strip()

In [7]:
def Gensim_Summary(sentences,ratio):
    contents = ' '.join(sentences)
    return summarize(contents, ratio=ratio)

In [8]:
def LexRank_Summary(sentences,n):
    LANG = "english"
    ans = ''
    contents = ' '.join(sentences)
    parser = PlaintextParser.from_string(contents, Tokenizer(LANG))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, n) 
    for sentence in summary:
        ans = ans + str(sentence) + ' '
    return ans

In [9]:
def Luhn_Summary(sentences,n):
    LANG = "english"
    ans = ''
    contents = ' '.join(sentences)
    parser = PlaintextParser.from_string(contents, Tokenizer(LANG))
    summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, n) 
    for sentence in summary:
        ans = ans + str(sentence) + ' '
    return ans

In [10]:
def LSA_Summary(sentences,n):
    LANG = "english"
    ans = ''
    contents = ' '.join(sentences)
    parser = PlaintextParser.from_string(contents, Tokenizer(LANG))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, n) 
    for sentence in summary:
        ans = ans + str(sentence) + ' '
    return ans

In [11]:
def make_sent(text_data):
    sents = nltk.sent_tokenize(text_data)
    return sents

In [12]:
df_summarized = df_para
df_summarized['Sentence List'] = df_para['Description'].apply(make_sent)

In [13]:
df_summarized['Custom Summarizer'] = df_summarized['Sentence List'].apply(Custom_Summary, args = (3,))
df_summarized['Gensim Summarizer'] = df_summarized['Sentence List'].apply(Gensim_Summary, args = (0.4,))
df_summarized['LexRank Summarizer'] = df_summarized['Sentence List'].apply(LexRank_Summary, args = (3,))
df_summarized['Luhn Summarizer'] = df_summarized['Sentence List'].apply(Custom_Summary, args = (3,))
df_summarized['LSA Summarizer'] = df_summarized['Sentence List'].apply(LSA_Summary, args = (3,))

In [14]:
df_summarized

Unnamed: 0,Chapter Name,Paragraph Title,Description,Sentence List,Custom Summarizer,Gensim Summarizer,LexRank Summarizer,Luhn Summarizer,LSA Summarizer
1,FROM HUNTING-GATHERING TO GROWING FOOD,Tushar’s train journey,Tushar was going from Delhi to Chennai for his...,[Tushar was going from Delhi to Chennai for hi...,"As he watched trees and houses fly past, his u...","As he watched trees and houses fly past, his u...",Tushar was going from Delhi to Chennai for his...,"As he watched trees and houses fly past, his u...",Tushar was going from Delhi to Chennai for his...
2,FROM HUNTING-GATHERING TO GROWING FOOD,The earliest people: why were they on the move ?,We know about people who lived in the subconti...,[We know about people who lived in the subcont...,"Second, animals move from place to place — eit...","Generally, they hunted wild animals, caught fi...","Therefore, they would have had to go elsewhere...","Second, animals move from place to place — eit...","Therefore, they would have had to go elsewhere..."
3,FROM HUNTING-GATHERING TO GROWING FOOD,How do we know about this place ?,Archaeologists have found some of the things h...,[Archaeologists have found some of the things ...,It is likely that people made and used tools o...,"Other tools were used to chop wood, which was ...",Archaeologists have found some of the things h...,It is likely that people made and used tools o...,Archaeologists have found some of the things h...
4,FROM HUNTING-GATHERING TO GROWING FOOD,Choose a place to live in,Look at Map 2 below. All the places marked wit...,"[Look at Map 2 below., All the places marked w...",All the places marked with red triangles are s...,All the places marked with red triangles are s...,Look at Map 2 below. All the places marked wit...,All the places marked with red triangles are s...,All the places marked with red triangles are s...
5,FROM HUNTING-GATHERING TO GROWING FOOD,Rock paintings and what they tell us,Many of the caves in which these early people ...,[Many of the caves in which these early people...,Many of the caves in which these early people ...,,Many of the caves in which these early people ...,Many of the caves in which these early people ...,Many of the caves in which these early people ...
6,FROM HUNTING-GATHERING TO GROWING FOOD,Sites,Sites are places where the remains of things (...,[Sites are places where the remains of things ...,These may be found on the surface of the earth...,You will learn more about different sites in l...,Sites are places where the remains of things (...,These may be found on the surface of the earth...,Sites are places where the remains of things (...
7,FROM HUNTING-GATHERING TO GROWING FOOD,Finding out about fire,Find the Kurnool caves on Map 2 (page13). Trac...,"[Find the Kurnool caves on Map 2 (page13)., Tr...",Fire could have been used for many things: as ...,,Find the Kurnool caves on Map 2 (page13). Trac...,Fire could have been used for many things: as ...,This suggests that people were familiar with t...
8,FROM HUNTING-GATHERING TO GROWING FOOD,Names and dates,Archaeologists have given lengthy names for th...,[Archaeologists have given lengthy names for t...,"Very often, we use present day names of the pl...",The Palaeolithic period extends from 2 million...,The name points to the importance of finds of ...,"Very often, we use present day names of the pl...",Archaeologists have given lengthy names for th...
9,FROM HUNTING-GATHERING TO GROWING FOOD,A changing environment,"Around 12,000 years ago, there were major chan...","[Around 12,000 years ago, there were major cha...",It is likely that this helped people to start ...,,"Around 12,000 years ago, there were major chan...",It is likely that this helped people to start ...,"In many areas, this led to the development of ..."
10,FROM HUNTING-GATHERING TO GROWING FOOD,The beginning of farming and herding,This was also a time when several grain bearin...,[This was also a time when several grain beari...,People could also attract and then tame animal...,People could also attract and then tame animal...,"Men, women and children probably collected the...",People could also attract and then tame animal...,This was also a time when several grain bearin...


In [15]:
df_summarized.to_csv('Paragraph-Wise Summarization.csv')

In [16]:
df_summarized['Luhn Summarizer'][2]

'Second, animals move from place to place — either in search of smaller prey, or, in the case of deer and wild cattle, in search of grass and leaves. People living on their banks would have had to go in search of water during the dry seasons (winter and summer). First, if they had stayed at one place for a long time, they would have eaten up all the available plant and animal resources.'

In [17]:
df_summarized['LSA Summarizer'][2]

'Therefore, they would have had to go elsewhere in search of food. That is why those who hunted them had to follow their movements. People living on their banks would have had to go in search of water during the dry seasons (winter and summer). '