# Extractive Summarization

#### Required Libraries

In [103]:
import os
import sys
import json
import pickle
from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
import numpy as np
import pandas as pd

#### Read the full training data and split it into smaller chunks

In [104]:
def read_data_and_split(train_corpus_path):
    train_data_map = {}
    file_no = 0
    with open(train_corpus_path, 'r') as train_data_file:
        line_count = 0
        while file_no < 11:
            if line_count < 20000:
                line_data = train_data_file.readline()
                if line_data:
                    line_map = json.loads(line_data)
                    article_id = line_map['article_id']
                    del line_map['article_id']
                    train_data_map[article_id] = line_map
                    line_count += 1
                else:
                    break
            else:
                with open(train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain/" + "train_" + str(file_no) + ".pickle", 'wb') as train_file:
                    pickle.dump(train_data_map, train_file, protocol=pickle.HIGHEST_PROTOCOL)
                print("File ", file_no, " Done")
                train_data_map.clear()
                file_no += 1
                line_count = 0
        with open(train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain/" + "train_" + str(file_no) + ".pickle", 'wb') as train_file:
            pickle.dump(train_data_map, train_file, protocol=pickle.HIGHEST_PROTOCOL)

#### Loads a particular pickle file of the training data into memory

In [125]:
def load_data_from_pickle(train_corpus_path):
    data_path = train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain"
    data_map = {}
    with open(data_path + "/" + "train_0.pickle", 'rb') as handle:
        data_map = pickle.load(handle)
    return data_map

#### Gets all the sentences of the article along with its metadata

In [107]:
def get_sentences_with_metadata(data_map):
    full_text = []
    sentence_metadata = []
    list_of_sentences = []
    c = 0
    for article_id, data in data_map.items():
        if c == 50:
            section_data = data['sections']
            section_names = data['section_names']
            for i, section in enumerate(section_data):
                for line in section:
                    split_line = line.split('.')
                    for l in split_line:
                        list_of_sentences.append(l)
                        sentence_metadata.append(section_names[i])
        c += 1
    return list_of_sentences, sentence_metadata

#### The Following 2 functions are used for Preprocessing of a given text

In [108]:
def is_ascii(word):
    """
    Checks if word is ascii or not
    :param word: token
    :return: Boolean
    """
    valid = True
    try:
        word = word.encode('ascii')
    except UnicodeEncodeError:
        valid = False
    return valid

In [109]:
def get_processed_tokens(sentence):
    punc_map = {}
    punc_map = punc_map.fromkeys('!"\'()*+,;<>[\\]^`{|}~:=%&_#?-$/', ' ')
    table = str.maketrans(punc_map)
    tokens = sentence.lower().translate(table).split()
    stop_words = set(stopwords.words('english')) 
    cleaned_tokens = [word for word in tokens if word not in stop_words and is_ascii(word) and '@' not in word and len(word) > 1]            
    return cleaned_tokens

#### Gets the processed sentences for each sentence of the article

In [110]:
def make_processed_sentences(list_of_sentences):
    processed_sentences = []
    for sentence in list_of_sentences:
        if isinstance(sentence, list):
            sentence = " ".join(sentence)
        processed_sentences.append(get_processed_tokens(sentence))
    return processed_sentences

#### Gives the number of words common between given 2 sentences

In [111]:
def get_no_of_common_word(sentence1, sentence2):
    common_count = 0
    for s1 in sentence1:
        for s2 in sentence2:
            if s1 == s2:
                common_count += 1
    return common_count

#### Generic scoring function which gives a score between 2 sentences

In [112]:
def scoring(sentence1, sentence2, metadata):
    common_words = get_no_of_common_word(sentence1, sentence2)
    score = common_words
    return score

#### Makes the graph which has relations between every pair of sentences

In [113]:
def make_graph(processed_sentences, metadata):
    sentence_graph = np.zeros(shape=(len(processed_sentences), len(processed_sentences)))
    for i in range(len(processed_sentences)):
        for j in range(len(processed_sentences)):
            sentence1 = processed_sentences[i]
            sentence2 = processed_sentences[j]
            if i == j:
                sentence_graph[i][j] = 0
            else:
                sentence_graph[i][j] = scoring(sentence1, sentence2, metadata)
    return sentence_graph

#### Following functions are different ways to give a score to a sentence

##### (1) Aggregation

In [122]:
def calculate_scores(sentence_graph):
    scores = np.zeros(len(sentence_graph))
    for i,sentence in enumerate(sentence_graph):
        scores[i] = sum(sentence_graph[i])
    return scores

##### (2) Page Rank

In [142]:
def calculate_pagerank_scores(sentence_graph):
    N = len(sentence_graph)
    d = 0.15   # PageRank Hyperparameter
    pagerank_scores = np.ones(N)
    
    out_degree = np.zeros(N)
    for i in range(N):
        for j in range(N):
            if sentence_graph[i][j]:
                out_degree[i] += sentence_graph[i][j]
    
    for i in range(N):
        score = 0
        for j in range(N):
            if sentence_graph[j][i]:
                score += (pagerank_scores[j] / out_degree[j])
        pagerank_scores[i] = (d / N) + (1 - d) * score
    return pagerank_scores    

#### Ranks the sentences based on any one of the above scoring methods and return the Summary

In [123]:
def rank_sentences_and_make_summary(sentences, processed_sentences, sentence_graph, scores):
    scores_indices = np.argsort(scores)
    ordered_sentences = scores_indices[::-1]
    summary = []
    for i in range(5):
        summary.append(sentences[ordered_sentences[i]])
#         print(ordered_sentences[i], scores[ordered_sentences[i]])
#         print(processed_sentences[ordered_sentences[i]])
    return summary

#### Main Program which calls the above defined functions

In [133]:
train_corpus_path = "/media/kaushik/Studies/IIITH/3_ThirdSem/IRE/Major Project/arxiv-release/arxiv-release/train.txt"
#read_data_and_split(train_corpus_path)

In [127]:
data_map = load_data_from_pickle(train_corpus_path)

In [128]:
list_of_sentences, sentence_metadata = get_sentences_with_metadata(data_map)

In [129]:
processed_sentences = make_processed_sentences(list_of_sentences)

In [130]:
sentence_graph = make_graph(processed_sentences, sentence_metadata)

In [143]:
sentence_scores = calculate_scores(sentence_graph)

In [136]:
summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_scores)
summary

['we see that for @xmath129 there exists an intermediate region between the critical point @xmath130 at which @xmath131 for the * af * phase , characterizing a second - order transition , and the point @xmath132 at which the @xmath133 order parameter presents a discontinuity for the * caf * phase , characterizing a first - order transition ',
 'et al__@xcite , by using coupled cluster treatment found the surprising and novel result that there exists a quantum triple point ( * qtp * ) with coordinates at ( @xmath43 ) , below which there is a second - order phase transition between the * af * and * caf * phases while above this * qtp * are these two ordered phases separated by the intermediate magnetically disordered phase ( vbs or rvb ) ',
 'we have observed , by analyzing the order parameters of the * af * and * caf * phases , that the phase transitions are of second and first - order between the * af - qp * and * caf - qp * , respectively ',
 'the frustration contributes significantly

In [144]:
sentence_scores = calculate_pagerank_scores(sentence_graph)

[6.50041675e-01 4.96688742e-04 6.23719663e-01 4.96688742e-04
 9.96786503e-01 4.96688742e-04 1.25145334e+00 4.96688742e-04
 1.32921149e+00 8.43446648e-01 4.96688742e-04 2.86605820e-01
 4.96688742e-04 7.94181331e-01 1.22868249e-01 4.96688742e-04
 7.11333475e-01 4.96688742e-04 1.50210974e-01 8.62790609e-01
 4.96688742e-04 6.37235565e-01 4.96688742e-04 4.88506231e-01
 4.96688742e-04 1.05118405e-01 5.83354531e-01 4.96688742e-04
 5.21756948e-01 2.85519537e-01 3.16063686e-01 4.96688742e-04
 4.96688742e-04 5.38624408e-02 4.08360140e-01 4.96688742e-04
 1.39058776e-01 6.04603047e-02 4.46673305e-01 4.96688742e-04
 6.79820418e-01 4.96688742e-04 3.50772298e-01 4.96688742e-04
 3.96716551e-01 1.99973208e-01 3.75375581e-01 4.96688742e-04
 3.08033432e-01 4.96688742e-04 3.19192524e-01 4.96688742e-04
 2.66493943e-01 4.96688742e-04 1.10119312e+00 4.96688742e-04
 5.31617901e-01 1.28955312e+00 4.96688742e-04 5.89912943e-01
 4.96688742e-04 2.77980302e-01 4.96688742e-04 8.67117240e-01
 4.96688742e-04 1.341977

In [145]:
summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_scores)
summary

[' richter , _ phys ',
 ' therefore , in the limit of the not frustrated ( @xmath136 ) square lattice ( @xmath67 ) antiferromagnetic , solving the equations ( 12 ) and applying the corrections factor we found @xmath146 which is consistent with the numerical results obtained by various methods such as series expansion , quantum monte carlo simulation , and others@xcite , and can also be compared with experimental results for the k@xmath24nif@xmath25 , k@xmath24mnf@xmath25 , and rb@xmath147mnf@xmath25 compounds@xcite ',
 'these results are in accordance with results obtained by starykh and balentes@xcite , that predicted not the * qtp * in the ground - state phase diagram recently observed by bishop , _ _ et al',
 'there are two magnetically long - range ordered phases at small and at large values of @xmath6 separated by an intermediate quantum paramagnetic phase without magnetic long - range order in the region between @xmath14 and @xmath15 , where the properties of these disordered pha