# Extractive Summarization

#### Required Libraries

In [86]:
import os
import sys
import json
import pickle
from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
import operator
import numpy as np
import pandas as pd
import gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

In [87]:
# Intializing the Word2Vec Model, download the file from https://nlp.stanford.edu/projects/glove/
# Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, & 300d vectors, 822 MB download): glove.6B.zip 
# Unzip the file then run: python3 -m gensim.scripts.glove2word2vec --input  glove.6B.300d.txt --output glove.6B.300d.w2vformat.txt
model = gensim.models.KeyedVectors.load_word2vec_format('data/glove.6B.300d.w2vformat.txt')
word_freq_map = {}
# with open("vocab", 'r') as vocab_file:
with open("data/arxiv-release/vocab", 'r') as vocab_file:
    lines = vocab_file.readlines()
    for line in lines:
        word_freq_map[line.split()[0]] = int(line.split()[1])
    
stop_list = sorted(word_freq_map.items(), key=operator.itemgetter(1), reverse=True)[:150]
cache = {}

#### Read the full training data and split it into smaller chunks

In [88]:
def read_data_and_split(train_corpus_path):
    train_data_map = {}
    file_no = 0
    with open(train_corpus_path, 'r') as train_data_file:
        line_count = 0
        while file_no < 11:
            if line_count < 20000:
                line_data = train_data_file.readline()
                if line_data:
                    line_map = json.loads(line_data)
                    article_id = line_map['article_id']
                    del line_map['article_id']
                    train_data_map[article_id] = line_map
                    line_count += 1
                else:
                    break
            else:
                with open(train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain/" + "train_" + str(file_no) + ".pickle", 'wb') as train_file:
                    pickle.dump(train_data_map, train_file, protocol=pickle.HIGHEST_PROTOCOL)
                print("File ", file_no, " Done")
                train_data_map.clear()
                file_no += 1
                line_count = 0
        with open(train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain/" + "train_" + str(file_no) + ".pickle", 'wb') as train_file:
            pickle.dump(train_data_map, train_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Loads a particular pickle file of the training data into memory

In [89]:
def load_data_from_pickle(train_corpus_path):
    data_path = train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain"
    data_map = {}
    with open(data_path + "/" + "train_0.pickle", 'rb') as handle:
        data_map = pickle.load(handle)
    return data_map

#### Gets all the sentences of the article along with its metadata

In [90]:
def get_sentences_with_metadata(data_map):
    full_text = []
    sentence_metadata = []
    list_of_sentences = []
    c = 0
    for article_id, data in data_map.items():
        if c == 60:
            section_data = data['sections']
            section_names = data['section_names']
            for i, section in enumerate(section_data):
                for line in section:
                    split_line = line.split('.')
                    for l in split_line:
                        list_of_sentences.append(l)
                        sentence_metadata.append(section_names[i])
        c += 1
    return list_of_sentences, sentence_metadata

#### The Following 2 functions are used for Preprocessing of a given text

In [91]:
def is_ascii(word):
    """
    Checks if word is ascii or not
    :param word: token
    :return: Boolean
    """
    valid = True
    try:
        word = word.encode('ascii')
    except UnicodeEncodeError:
        valid = False
    return valid

In [92]:
def get_processed_tokens(sentence):
    punc_map = {}
    punc_map = punc_map.fromkeys('!"\'()*+,;<>[]^`{|}~:=%&_#?-$/', ' ')
    table = str.maketrans(punc_map)
    tokens = sentence.lower().translate(table).split()
    stop_words = set(stopwords.words('english')) 
    stop_words = list(stop_words) + stop_list
    cleaned_tokens = [word for word in tokens if word not in stop_words and is_ascii(word) and '@' not in word and '\\' not in word and len(word) > 1]            
    return cleaned_tokens

#### Gets the processed sentences for each sentence of the article

In [93]:
def make_processed_sentences(list_of_sentences):
    processed_sentences = []
    for sentence in list_of_sentences:
        if isinstance(sentence, list):
            sentence = " ".join(sentence)
        processed_sentences.append(get_processed_tokens(sentence))
    return processed_sentences

#### Gives the number of words common between given 2 sentences

In [94]:
def get_no_of_common_word(sentence1, sentence2):
    common_count = 0
    for s1 in sentence1:
        for s2 in sentence2:
            if s1 == s2:
                common_count += 1
    return common_count

In [95]:
def get_word_vec_sim(sentence1, sentence2):
    score = 0
    for word1 in sentence1:
        for word2 in sentence2:
            try:
                temp = cache[word1+word2]
            except:
                try:
                    temp = model.similarity(word1, word2)
                    cache[word1+word2] = temp
                    cache[word2+word1] = temp
                except:
                    cache[word1+word2] = 0
                    cache[word2+word1] = 0
                    temp = 0
            score += temp
    return score

#### Generic scoring function which gives a score between 2 sentences

In [96]:
def scoring(sentence1, sentence2, metadata):
    len_normalize = len(sentence1) + len(sentence2) + 1 # Normalizing by length of vector
    common_words = get_no_of_common_word(sentence1, sentence2)
    word_vec_score = get_word_vec_sim(sentence1, sentence2)
    score = common_words / 2*len_normalize + word_vec_score / len_normalize
    return score

#### Makes the graph which has relations between every pair of sentences

In [97]:
def make_graph(processed_sentences, metadata):
    sentence_graph = np.zeros(shape=(len(processed_sentences), len(processed_sentences)))
    sentence_common_graph = np.zeros(shape=(len(processed_sentences), len(processed_sentences)))
    
    for i in range(len(processed_sentences)):
        for j in range(len(processed_sentences)):
            sentence1 = processed_sentences[i]
            sentence2 = processed_sentences[j]
            if i == j:
                sentence_graph[i][j] = 0
                sentence_common_graph[i][j] = 0
            else:
                sentence_graph[i][j] = scoring(sentence1, sentence2, metadata)
                sentence_common_graph[i][j] = get_no_of_common_word(sentence1, sentence2)
    return sentence_graph, sentence_common_graph

#### Following functions are different ways to give a score to a sentence

##### (1) Aggregation

In [98]:
def calculate_scores(sentence_graph):
    scores = np.zeros(len(sentence_graph))
    for i,sentence in enumerate(sentence_graph):
        scores[i] = sum(sentence_graph[i])
    return scores

##### (2) Page Rank

In [99]:
def calculate_pagerank_scores(sentence_graph):
    N = len(sentence_graph)
    d = 0.15   # PageRank Hyperparameter
    pagerank_scores = np.ones(N)
    
    out_degree = np.zeros(N)
    for i in range(N):
        for j in range(N):
            if sentence_graph[i][j]:
                out_degree[i] += sentence_graph[i][j]
    
    for i in range(N):
        score = 0
        for j in range(N):
            if sentence_graph[j][i]:
                score += (pagerank_scores[j] / out_degree[j])
        pagerank_scores[i] = (d / N) + (1 - d) * score
    return pagerank_scores    

#### Ranks the sentences based on any one of the above scoring methods and return the Summary

In [100]:
def rank_sentences_and_make_summary2(sentences, processed_sentences, sentence_graph, scores):
    scores_indices = np.argsort(scores)
    ordered_sentences = scores_indices[::-1]
    summary = []
    for i in range(5):
        summary.append(sentences[ordered_sentences[i]])
#         print(ordered_sentences[i], scores[ordered_sentences[i]])
#         print(processed_sentences[ordered_sentences[i]])
    return summary

In [101]:
def rank_sentences_and_make_summary(sentences, processed_sentences, sentence_graph, scores, summary_length):
    summary = []
    for i in range(5): # Number of Sentences we want in the summary
        score_indices = np.argsort(scores)
        selected_index = score_indices[-1]
        summary.append(sentences[selected_index]) # Adding highest score sentence to summary
        mean_score = np.mean(sentence_graph)
        to_decrease = []
        # Calculated mean similarity score. If selected sentence and another sentence have
        # high similarity, the score of the second sentence should be reduced.
        # Here, have chosen to use 1.5 * mean_score as the threshold, and divided score in half.
        for iterator in range(len(processed_sentences)):
            if sentence_graph[iterator][selected_index] > 1.5 * mean_score:
                to_decrease.append(iterator)
            if sentence_graph[selected_index][iterator] > 1.5 * mean_score:
                to_decrease.append(iterator)
        for sentence in set(to_decrease):
            # Should be changed based on the number of sentences needed in the summary
            scores[sentence] /= (1 + 1.0 / summary_length) # Reduced score by half, to on average prevent from being picked.
        scores[selected_index] = 0
    return summary
        

#### Main Program which calls the above defined functions

In [102]:
train_corpus_path = "data/arxiv-release/train.txt"
# train_corpus_path = "/media/kaushik/Studies/IIITH/3_ThirdSem/IRE/Major Project/arxiv-release/arxiv-release/train.txt"
#read_data_and_split(train_corpus_path)

In [103]:
data_map = load_data_from_pickle(train_corpus_path)

In [104]:
list_of_sentences, sentence_metadata = get_sentences_with_metadata(data_map)
list_of_sentences = [sentence.strip() for sentence in list_of_sentences if len(sentence) > 1]

In [105]:
processed_sentences = make_processed_sentences(list_of_sentences)

In [106]:
sentence_graph, sentence_common_graph = make_graph(processed_sentences, sentence_metadata)

  if np.issubdtype(vec.dtype, np.int):


In [107]:
sentence_scores = calculate_scores(sentence_graph)

In [108]:
sentence_page_scores = calculate_pagerank_scores(sentence_common_graph)

In [109]:
sentence_score_final = [sentence_scores[i] * (sentence_page_scores[i]+1)  for i in range(len(sentence_scores))]

In [110]:
summary_length = 5
summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_scores, summary_length)
summary

['the hamiltonian modeling this system consists of the ground and one excited state of the molecule and a quasi - continuum describing the conduction band together with one vibrational coordinate @xmath15 here @xmath16 can be equal to @xmath17 for the ground state , @xmath18 for the excited state , and @xmath19 for the quasi - continuum',
 'first , the vibrational populations in the excited state of the molecule no longer only decay into the quasi - continuum states but also relax within the excited state ( see fig',
 'therefore the total hamiltonian consists of three terms  the system part @xmath1 , the bath part @xmath2 , and the system - bath interaction @xmath3 : @xmath4 the rdm @xmath5 is obtained from the density matrix of the full system by tracing out the degrees of freedom of the environment',
 'the electronic probabilities in the quasi - continuum are given as @xmath30 where @xmath31 is the initial vibronic distribution in the excited state and @xmath32 and @xmath33 are the v

In [111]:
summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_score_final, summary_length)
summary

['the hamiltonian modeling this system consists of the ground and one excited state of the molecule and a quasi - continuum describing the conduction band together with one vibrational coordinate @xmath15 here @xmath16 can be equal to @xmath17 for the ground state , @xmath18 for the excited state , and @xmath19 for the quasi - continuum',
 'therefore the total hamiltonian consists of three terms  the system part @xmath1 , the bath part @xmath2 , and the system - bath interaction @xmath3 : @xmath4 the rdm @xmath5 is obtained from the density matrix of the full system by tracing out the degrees of freedom of the environment',
 'first , the vibrational populations in the excited state of the molecule no longer only decay into the quasi - continuum states but also relax within the excited state ( see fig',
 'recently @xcite the electron injection from a chromophore to a semiconductor conduction band was described using the time - dependent schrdinger equation , thus neglecting relaxation p

In [112]:
summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_page_scores, summary_length)
summary
# make_processed_sentences(summary)

['if one assumes bilinear system - bath coupling with system part @xmath8 and bath part @xmath9 @xmath10 one can take advantage of the following decomposition @xcite : @xmath11   +    [ \\lambda\\rho , k]+   [ k,\\rho\\lambda^{\\dagger } ]',
 'recently @xcite the electron injection from a chromophore to a semiconductor conduction band was described using the time - dependent schrdinger equation , thus neglecting relaxation processes',
 'we are aware that this is only a minimal model but hope that it catches the effects of dissipation on the electron injection process',
 'the hamiltonian modeling this system consists of the ground and one excited state of the molecule and a quasi - continuum describing the conduction band together with one vibrational coordinate @xmath15 here @xmath16 can be equal to @xmath17 for the ground state , @xmath18 for the excited state , and @xmath19 for the quasi - continuum',
 'to be able to study the effects of dissipation']