In [103]:
import os
import sys
import json
import pickle
from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
import numpy as np
import pandas as pd

In [104]:
def read_data_and_split(train_corpus_path, chunk_size=20000, max_files=11):
    """
    Read the full training data and split it into smaller pickled chunks.

    Every non-blank line of the corpus is parsed as a JSON object. Its
    'article_id' value becomes the key in the chunk dictionary and the
    remaining fields become the value. Chunks are written as
    ``train_<n>.pickle`` into a ``SplitTrain`` directory next to the corpus
    file; the directory is created if it does not exist yet.

    :param train_corpus_path: input path of the JSON-lines training corpus
    :param chunk_size: maximum number of corpus lines stored in one chunk file
    :param max_files: maximum number of non-empty chunk files to write; corpus
        lines beyond ``chunk_size * max_files`` are ignored
    :return: None
    :raises ValueError: if ``chunk_size`` is smaller than 1
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # os.path copes with a bare file name (no '/' in the path); splitting on
    # '/' would have used the file name itself as the parent directory.
    output_dir = os.path.join(os.path.dirname(train_corpus_path), "SplitTrain")
    os.makedirs(output_dir, exist_ok=True)

    def _dump_chunk(chunk, chunk_no):
        # Persist one chunk dictionary as train_<chunk_no>.pickle.
        chunk_path = os.path.join(output_dir, "train_" + str(chunk_no) + ".pickle")
        with open(chunk_path, 'wb') as train_file:
            pickle.dump(chunk, train_file, protocol=pickle.HIGHEST_PROTOCOL)

    train_data_map = {}
    file_no = 0
    line_count = 0
    with open(train_corpus_path, 'r') as train_data_file:
        for line_data in train_data_file:
            if file_no >= max_files:
                # File cap reached: the rest of the corpus is deliberately ignored.
                break
            if not line_data.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            line_map = json.loads(line_data)
            article_id = line_map.pop('article_id')
            train_data_map[article_id] = line_map
            line_count += 1
            if line_count == chunk_size:
                _dump_chunk(train_data_map, file_no)
                print("File ", file_no, " Done")
                train_data_map.clear()
                file_no += 1
                line_count = 0

    # Flush the trailing partial chunk. Nothing is written when the corpus
    # ended exactly on a chunk boundary, except that an empty corpus still
    # produces an (empty) train_0.pickle so the first chunk always exists.
    if line_count or file_no == 0:
        _dump_chunk(train_data_map, file_no)

In [105]:
# Absolute, machine-specific location of the training corpus (one JSON object
# per line, as read by read_data_and_split). Adjust before running elsewhere.
train_corpus_path = "/media/kaushik/Studies/IIITH/3_ThirdSem/IRE/Major Project/arxiv-release/arxiv-release/train.txt"
# One-off preprocessing step, left disabled: uncomment to (re)generate the
# pickled chunks that the following cell loads.
# read_data_and_split(train_corpus_path)

In [106]:
# Chunk directory: the "SplitTrain" folder located next to the corpus file.
data_path = train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain"
# Placeholder value; replaced by the unpickled chunk below.
data_map = {}
# Only the first chunk (train_0.pickle) is loaded here.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load chunk files that were produced locally by read_data_and_split.
with open(data_path + "/" + "train_0.pickle", 'rb') as handle:
    data_map = pickle.load(handle)

In [107]:
def get_sentences_with_metadata(data_map, article_index=50):
    """
    Split one article of ``data_map`` into sentence fragments and tag each
    fragment with the name of the section it came from.

    Only the article found at position ``article_index`` (zero-based, in the
    iteration order of ``data_map``) is processed; iteration stops as soon as
    that article has been handled.

    :param data_map: mapping of article id -> article data. The article data
        must provide 'sections' (an iterable of sections, each an iterable of
        text lines) and 'section_names' (indexable in parallel with 'sections')
    :param article_index: position of the article to process (default 50)
    :return: tuple ``(list_of_sentences, sentence_metadata)`` of two parallel
        lists: the text fragments and the section name of each fragment. Both
        lists are empty when ``data_map`` has no article at ``article_index``
    """
    list_of_sentences = []
    sentence_metadata = []
    for position, data in enumerate(data_map.values()):
        if position != article_index:
            continue
        section_names = data['section_names']
        for i, section in enumerate(data['sections']):
            for line in section:
                # Naive sentence split: every '.' counts as a boundary, so a
                # line ending in '.' also contributes an empty last fragment.
                for fragment in line.split('.'):
                    list_of_sentences.append(fragment)
                    sentence_metadata.append(section_names[i])
        # The requested article has been processed; no need to scan the rest.
        break
    return list_of_sentences, sentence_metadata

In [108]:
def is_ascii(word):
    """
    Tell whether a token can be encoded with the ASCII codec.
    :param word: token (string) to check
    :return: True if encoding to ASCII succeeds, False if it raises UnicodeEncodeError
    """
    try:
        word.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True

In [109]:
# Translation table that replaces every listed punctuation character with a
# space. It never changes, so it is built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys('!"\'()*+,;<>[\\]^`{|}~:=%&_#?-$/', ' '))

# Lazily filled cache for the English stop-word set used by get_processed_tokens.
_STOP_WORDS = None


def get_processed_tokens(sentence):
    """
    Turn a sentence into a list of cleaned, lower-cased tokens.

    The sentence is lower-cased, the punctuation characters of the translation
    table are replaced by spaces, and the result is split on whitespace.
    A token is kept only if it is not an English stop word, is pure ASCII,
    does not contain '@' and is longer than one character.

    :param sentence: sentence text (string)
    :return: list of cleaned tokens, in their original order
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Loaded on first use rather than at definition time, and then reused:
        # this function runs once per sentence, so rebuilding the set on every
        # call is wasted work.
        _STOP_WORDS = frozenset(stopwords.words('english'))

    tokens = sentence.lower().translate(_PUNCTUATION_TABLE).split()
    return [
        word for word in tokens
        if word not in _STOP_WORDS and is_ascii(word) and '@' not in word and len(word) > 1
    ]

In [110]:
def make_processed_sentences(list_of_sentences):
    """
    Run get_processed_tokens over every sentence.

    :param list_of_sentences: iterable of sentences; an entry that is a list
        is first joined into a single space-separated string
    :return: list with one token list per input sentence, in input order
    """
    def _as_text(entry):
        # Entries given as lists are flattened to text before tokenising.
        return " ".join(entry) if isinstance(entry, list) else entry

    return [get_processed_tokens(_as_text(entry)) for entry in list_of_sentences]

In [111]:
def get_no_of_common_word(sentence1, sentence2):
    """
    Count the matching token pairs between two token sequences.

    Every (token of sentence1, token of sentence2) pair that compares equal
    adds one to the result, so repeated tokens are counted once per pairing.

    :param sentence1: first sequence of tokens
    :param sentence2: second sequence of tokens
    :return: number of equal pairs (int)
    """
    return sum(1 for left in sentence1 for right in sentence2 if left == right)

In [112]:
def scoring(sentence1, sentence2, metadata):
    """
    Score the similarity of two tokenised sentences.

    The score is the value returned by get_no_of_common_word for the pair.

    :param sentence1: first token list
    :param sentence2: second token list
    :param metadata: accepted for interface compatibility; currently not used
    :return: similarity score
    """
    return get_no_of_common_word(sentence1, sentence2)

In [113]:
def make_graph(processed_sentences, metadata):
    """
    Build the pairwise similarity matrix of the processed sentences.

    :param processed_sentences: list of token lists
    :param metadata: passed through unchanged to scoring for every pair
    :return: square numpy array whose entry [i][j] is the scoring result for
        sentences i and j; the diagonal is left at 0
    """
    size = len(processed_sentences)
    sentence_graph = np.zeros(shape=(size, size))
    for row, left in enumerate(processed_sentences):
        for col, right in enumerate(processed_sentences):
            # Diagonal cells keep their initial 0: a sentence is not scored against itself.
            if row != col:
                sentence_graph[row][col] = scoring(left, right, metadata)
    return sentence_graph

In [114]:
def calculate_scores(sentence_graph):
    """
    Compute one score per sentence as the sum of its row in the graph.

    :param sentence_graph: square similarity matrix (one row per sentence)
    :return: 1-D numpy float array with the row sums
    """
    row_totals = np.zeros(len(sentence_graph))
    for position, row in enumerate(sentence_graph):
        row_totals[position] = sum(row)
    return row_totals

In [115]:
def rank_sentences_and_make_summary(sentences, processed_sentences, sentence_graph, scores, top_n=3):
    """
    Rank the sentences by score, print the best ones and return them.

    Prints the full score array and the number of sentences, then for each of
    the ``top_n`` highest-scoring sentences its index and score, the original
    sentence and its processed tokens.

    :param sentences: original sentences, indexable in parallel with scores
    :param processed_sentences: token lists, indexable in parallel with scores
    :param sentence_graph: similarity matrix; not used by the ranking itself
    :param scores: one score per sentence
    :param top_n: number of sentences to select (default 3); fewer are
        returned when fewer sentences are available
    :return: list of the selected original sentences, best score first
    """
    # argsort is ascending, so reverse it to get the best-scoring index first.
    ranked_indices = np.argsort(scores)[::-1]
    print(scores)
    print(len(sentences))

    summary = []
    # Slicing clamps to the available sentences, avoiding an IndexError when
    # there are fewer than top_n of them.
    for index in ranked_indices[:max(top_n, 0)]:
        print(index, scores[index])
        print(sentences[index])
        print(processed_sentences[index])
        summary.append(sentences[index])
    return summary

In [116]:
# Run the whole pipeline on the loaded chunk.
# 1. Collect the raw sentence fragments and the section name of each one.
list_of_sentences, sentence_metadata = get_sentences_with_metadata(data_map)
# 2. Tokenise and filter every fragment.
processed_sentences = make_processed_sentences(list_of_sentences)
# 3. Build the pairwise similarity matrix between the fragments.
sentence_graph = make_graph(processed_sentences, sentence_metadata)
# 4. Score each fragment by the sum of its row in the matrix.
sentence_scores = calculate_scores(sentence_graph)
# 5. Print the scores and the three highest-scoring fragments.
summary = rank_sentences_and_make_summary(list_of_sentences, processed_sentences, sentence_graph, sentence_scores)
# summary = rank_sentences_and_make_summary(processed_sentences, sentence_graph, sentence_scores)

[123.   0.  48.   0. 149.   0.  58.   0. 259. 136.   0.  39.   0. 126.
  17.   0. 278.   0.   6. 110.   0.  66.   0.   6.   0.   2. 189.   0.
  83.  53.  16.   0.   0.   4.  77.   0.  15.   5.  63.   0. 127.   0.
  22.   0.  82.  19.  46.   0.  24.   0.  44.   0.  46.   0. 199.   0.
   5. 357.   0. 227.   0. 127.   0. 184.   0. 150.   0.   0.  77.   0.
  41.   0.  12.   0.  13.   0.   4.   7.   0.  39.  77.  33.   0.  41.
   0.  61.   0.  42.   0. 180.  24. 165.   2.  62.   0.   3.   0.  33.
   3.   0.  39.   0.  16.   0.   1.   3.   0.   2.   0. 200.   0.  36.
   0.  31.   0. 188. 106.   0.  63.  39. 281.   0. 142.   0.  75.   0.
  55.   6.  68.   0. 158.   0.  26.  21. 145.   0.  71.  14.   0. 137.
  21.   0. 217.   0. 203.   0.  44.   0.  60.   0.   0.   0.  25.  18.
  51.   0.  38.   0.  16.   0.   0.   7. 125.   0.  45.   0.   0. 169.
   0. 275. 180.   9.   0.  75.  55.  46.   0.  26.  50. 105.  22. 127.
  92.   0. 200.   0.  94. 226.   0. 253.   0. 100.   0.  76.   0. 137.
   0. 