# Extractive Summarization

#### Required Libraries

In [1]:
import os
import sys
import json
import pickle
from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
import operator
import numpy as np
import pandas as pd
import gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

In [2]:
# Intializing the Word2Vec Model, download the file from https://nlp.stanford.edu/projects/glove/
# Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, & 300d vectors, 822 MB download): glove.6B.zip 
# Unzip the file then run: python3 -m gensim.scripts.glove2word2vec --input  glove.6B.300d.txt --output glove.6B.300d.w2vformat.txt
model = gensim.models.KeyedVectors.load_word2vec_format('data/glove.6B.300d.w2vformat.txt')

In [3]:
word_freq_map = {}
with open("data/arxiv-release/vocab", 'r') as vocab_file:
    lines = vocab_file.readlines()
    for line in lines:
        word_freq_map[line.split()[0]] = int(line.split()[1])
    
stop_list = sorted(word_freq_map.items(), key=operator.itemgetter(1), reverse=True)[:150]
cache = {}

#### Read the full training data and split it into smaller chunks

In [4]:
def read_data_and_split(train_corpus_path):
    train_data_map = {}
    file_no = 0
    with open(train_corpus_path, 'r') as train_data_file:
        line_count = 0
        while file_no < 11:
            if line_count < 20000:
                line_data = train_data_file.readline()
                if line_data:
                    line_map = json.loads(line_data)
                    article_id = line_map['article_id']
                    del line_map['article_id']
                    train_data_map[article_id] = line_map
                    line_count += 1
                else:
                    break
            else:
                with open(train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain/" + "train_" + str(file_no) + ".pickle", 'wb') as train_file:
                    pickle.dump(train_data_map, train_file, protocol=pickle.HIGHEST_PROTOCOL)
                print("File ", file_no, " Done")
                train_data_map.clear()
                file_no += 1
                line_count = 0
        with open(train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain/" + "train_" + str(file_no) + ".pickle", 'wb') as train_file:
            pickle.dump(train_data_map, train_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Loads a particular pickle file of the training data into memory

In [5]:
def load_data_from_pickle(train_corpus_path):
    data_path = train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain"
    data_map = {}
    with open(data_path + "/" + "train_0.pickle", 'rb') as handle:
        data_map = pickle.load(handle)
    return data_map

#### Gets all the sentences of the article along with its metadata

In [42]:
def get_sentences_with_metadata(data_map):
    full_text = []
    sentence_metadata = []
    list_of_sentences = []
    summary_list = []
    abstract_list = []
    c = 0
    file_number = 1
    for article_id, data in tqdm(data_map.items()):
        if c < 10:
            abstract_list.append(data['abstract_text'])
            section_data = data['sections']
            section_names = data['section_names']
            for i, section in enumerate(section_data):
                for line in section:
                    split_line = line.split('.')
                    for l in split_line:
                        list_of_sentences.append(l)
                        sentence_metadata.append(section_names[i])
            summary_list.append(do_stuff_and_get_summary(list_of_sentences, sentence_metadata))
            list_of_sentences.clear()
        else:
            c = 0
            write_summary_and_abstract_to_file(summary_list, abstract_list, file_number)
            print("File ", file_number, " done")
            file_number += 1
            summary_list.clear()
            abstract_list.clear()
            break
        c += 1
        
    return list_of_sentences, sentence_metadata

#### The Following 2 functions are used for Preprocessing of a given text

In [7]:
def is_ascii(word):
    """
    Checks if word is ascii or not
    :param word: token
    :return: Boolean
    """
    valid = True
    try:
        word = word.encode('ascii')
    except UnicodeEncodeError:
        valid = False
    return valid

In [8]:
def get_processed_tokens(sentence):
    punc_map = {}
    punc_map = punc_map.fromkeys('!"\'()*+,;<>[]^`{|}~:=%&_#?-$/', ' ')
    table = str.maketrans(punc_map)
    tokens = sentence.lower().translate(table).split()
    stop_words = set(stopwords.words('english')) 
    stop_words = list(stop_words) + stop_list
    cleaned_tokens = [word for word in tokens if word not in stop_words and is_ascii(word) and '@' not in word and '\\' not in word and len(word) > 1]            
    return cleaned_tokens

#### Gets the processed sentences for each sentence of the article

In [9]:
def make_processed_sentences(list_of_sentences):
    processed_sentences = []
    for sentence in list_of_sentences:
        if isinstance(sentence, list):
            sentence = " ".join(sentence)
        processed_sentences.append(get_processed_tokens(sentence))
    return processed_sentences

#### Gives the number of words common between given 2 sentences

In [10]:
def get_no_of_common_word(sentence1, sentence2):
    common_count = 0
    for s1 in sentence1:
        for s2 in sentence2:
            if s1 == s2:
                common_count += 1
    return common_count

In [11]:
def get_word_vec_sim(sentence1, sentence2):
    score = 0
    for word1 in sentence1:
        for word2 in sentence2:
            try:
                temp = cache[word1+word2]
            except:
                try:
                    temp = model.similarity(word1, word2)
                    cache[word1+word2] = temp
                    cache[word2+word1] = temp
                except:
                    cache[word1+word2] = 0
                    cache[word2+word1] = 0
                    temp = 0
            score += temp
    return score

#### Generic scoring function which gives a score between 2 sentences

In [12]:
def scoring(sentence1, sentence2, metadata):
    len_normalize = len(sentence1) + len(sentence2) + 1 # Normalizing by length of vector
    common_words = get_no_of_common_word(sentence1, sentence2)
    word_vec_score = get_word_vec_sim(sentence1, sentence2)
    score = common_words / 2*len_normalize + word_vec_score / len_normalize
    return score

#### Makes the graph which has relations between every pair of sentences

In [13]:
def make_graph(processed_sentences, metadata):
    sentence_graph = np.zeros(shape=(len(processed_sentences), len(processed_sentences)))
    sentence_common_graph = np.zeros(shape=(len(processed_sentences), len(processed_sentences)))
    
    for i in range(len(processed_sentences)):
        for j in range(len(processed_sentences)):
            sentence1 = processed_sentences[i]
            sentence2 = processed_sentences[j]
            if i == j:
                sentence_graph[i][j] = 0
                sentence_common_graph[i][j] = 0
            else:
                sentence_graph[i][j] = scoring(sentence1, sentence2, metadata)
                sentence_common_graph[i][j] = get_no_of_common_word(sentence1, sentence2)
    return sentence_graph, sentence_common_graph

#### Following functions are different ways to give a score to a sentence

##### (1) Aggregation

In [14]:
def calculate_scores(sentence_graph):
    scores = np.zeros(len(sentence_graph))
    for i,sentence in enumerate(sentence_graph):
        scores[i] = sum(sentence_graph[i])
    return scores

##### (2) Page Rank

In [15]:
def calculate_pagerank_scores(sentence_graph):
    N = len(sentence_graph)
    d = 0.15   # PageRank Hyperparameter
    pagerank_scores = np.ones(N)
    
    out_degree = np.zeros(N)
    for i in range(N):
        for j in range(N):
            if sentence_graph[i][j]:
                out_degree[i] += sentence_graph[i][j]
    
    for i in range(N):
        score = 0
        for j in range(N):
            if sentence_graph[j][i]:
                score += (pagerank_scores[j] / out_degree[j])
        pagerank_scores[i] = (d / N) + (1 - d) * score
    return pagerank_scores    

#### Ranks the sentences based on any one of the above scoring methods and return the Summary

In [16]:
def rank_sentences_and_make_summary2(sentences, processed_sentences, sentence_graph, scores):
    scores_indices = np.argsort(scores)
    ordered_sentences = scores_indices[::-1]
    summary = []
    for i in range(5):
        summary.append(sentences[ordered_sentences[i]])
#         print(ordered_sentences[i], scores[ordered_sentences[i]])
#         print(processed_sentences[ordered_sentences[i]])
    return summary

In [39]:
def rank_sentences_and_make_summary(sentences, processed_sentences, sentence_graph, scores, summary_length):
    summary = []
    for i in range(summary_length): # Number of Sentences we want in the summary
        score_indices = np.argsort(scores)
        if len(score_indices < 1):
            break
        selected_index = score_indices[-1]
        summary.append(sentences[selected_index]) # Adding highest score sentence to summary
        mean_score = np.mean(sentence_graph)
        to_decrease = []
        # Calculated mean similarity score. If selected sentence and another sentence have
        # high similarity, the score of the second sentence should be reduced.
        # Here, have chosen to use 1.5 * mean_score as the threshold, and divided score in half.
        for iterator in range(len(processed_sentences)):
            if sentence_graph[iterator][selected_index] > 1.5 * mean_score:
                to_decrease.append(iterator)
            if sentence_graph[selected_index][iterator] > 1.5 * mean_score:
                to_decrease.append(iterator)
        for sentence in set(to_decrease):
            # Should be changed based on the number of sentences needed in the summary
            scores[sentence] /= (1 + 1.0 / summary_length) # Reduced score by half, to on average prevent from being picked.
        scores[selected_index] = 0
    return summary
        

#### Main Program which calls the above defined functions

In [48]:
train_corpus_path = "data/arxiv-release/train.txt"
# train_corpus_path = "/media/kaushik/Studies/IIITH/3_ThirdSem/IRE/Major Project/arxiv-release/arxiv-release/train.txt"
# read_data_and_split(train_corpus_path)

In [49]:
data_map = load_data_from_pickle(train_corpus_path)

In [50]:
len(data_map)

20000

In [45]:
def do_stuff_and_get_summary(list_of_sentences, sentence_metadata):
    list_of_sentences = [sentence.strip() for sentence in list_of_sentences if len(sentence) > 1]
    processed_sentences = make_processed_sentences(list_of_sentences)
    sentence_graph, sentence_common_graph = make_graph(processed_sentences, sentence_metadata)
    sentence_scores = calculate_scores(sentence_graph)
    sentence_page_scores = calculate_pagerank_scores(sentence_common_graph)
    sentence_score_final = [sentence_scores[i] * (sentence_page_scores[i]+1)for i in range(len(sentence_scores))]
    summary_length = 10
    summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_score_final, summary_length)
    return summary

In [46]:
def write_summary_and_abstract_to_file(summary_list, abstract_list, file_number):
    map_of_abstract_summary = {}
    
    abstract_map = {}
    for i, abstract in enumerate(abstract_list):
        abstract_map[i] = abstract
    
    summary_map = {}
    for i, summary in enumerate(summary_list):
        summary_map[i] = summary
    
    with open("data/map/abstract_file_" + str(file_number) + ".pickle", 'wb') as abs_file:
        pickle.dump(abstract_map, abs_file, protocol=pickle.HIGHEST_PROTOCOL)
    with open("data/map/summary_file_" + str(file_number) + ".pickle", 'wb') as sum_file:
        pickle.dump(summary_map, sum_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Call this function and this function is now changed to generate summaries for all articles and store them to files

In [51]:
list_of_sentences, sentence_metadata = get_sentences_with_metadata(data_map)



  0%|          | 0/20000 [00:00<?, ?it/s][A[A

  0%|          | 1/20000 [00:02<14:17:21,  2.57s/it][A[A

  0%|          | 2/20000 [00:04<12:34:21,  2.26s/it][A[A

  0%|          | 3/20000 [00:07<14:42:03,  2.65s/it][A[A

  0%|          | 4/20000 [00:09<13:08:51,  2.37s/it][A[A

  0%|          | 5/20000 [00:17<22:47:52,  4.10s/it][A[A

  0%|          | 6/20000 [00:22<23:25:12,  4.22s/it][A[A

  0%|          | 7/20000 [00:23<19:36:12,  3.53s/it][A[A

  0%|          | 8/20000 [00:26<17:10:33,  3.09s/it][A[A

  0%|          | 9/20000 [00:26<13:10:22,  2.37s/it][A[A

  0%|          | 10/20000 [00:35<23:29:35,  4.23s/it][A[A

File  1  done


In [24]:
list_of_sentences, sentence_metadata = get_sentences_with_metadata(data_map)
list_of_sentences = [sentence.strip() for sentence in list_of_sentences if len(sentence) > 1]


  0%|          | 0/20000 [00:00<?, ?it/s][A
  0%|          | 1/20000 [00:01<9:28:52,  1.71s/it][A
  0%|          | 2/20000 [00:02<8:35:29,  1.55s/it][A
  0%|          | 3/20000 [00:05<11:10:21,  2.01s/it][A
  0%|          | 4/20000 [00:07<10:16:29,  1.85s/it][A
  0%|          | 5/20000 [00:14<19:29:25,  3.51s/it][A
  0%|          | 6/20000 [00:18<20:31:07,  3.69s/it][A
  0%|          | 7/20000 [00:20<17:16:36,  3.11s/it][A
  0%|          | 8/20000 [00:22<14:54:40,  2.69s/it][A
  0%|          | 9/20000 [00:22<11:23:55,  2.05s/it][A
  0%|          | 10/20000 [00:30<20:26:34,  3.68s/it][A

File  1  done


In [38]:
processed_sentences = make_processed_sentences(list_of_sentences)

[]

In [26]:
sentence_graph, sentence_common_graph = make_graph(processed_sentences, sentence_metadata)

In [27]:
sentence_scores = calculate_scores(sentence_graph)

In [28]:
sentence_page_scores = calculate_pagerank_scores(sentence_common_graph)

In [29]:
sentence_score_final = [sentence_scores[i] * (sentence_page_scores[i]+1)  for i in range(len(sentence_scores))]

In [30]:
summary_length = 10
summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_scores, summary_length)
summary

IndexError: index -1 is out of bounds for axis 0 with size 0

In [36]:
summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_score_final, summary_length)
summary

IndexError: index -1 is out of bounds for axis 0 with size 0

In [None]:
summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_page_scores, summary_length)
summary
# make_processed_sentences(summary)

In [47]:
# Loading function
data_path = './data/map'
with open(data_path + "/" + "summary_file_1.pickle", 'rb') as handle:
    summary_map = pickle.load(handle)
len(summary_map)

10