In [69]:
import json
import os
import pickle
import sys
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from tqdm import tqdm

In [70]:
def read_data_and_split(train_corpus_path, chunk_size=20000, max_files=11):
    """
    Read the full training data and split it into smaller pickled chunks.

    Every non-blank line of the corpus is parsed as a JSON object. Its
    'article_id' value becomes the key of the chunk dictionary and the rest
    of the object becomes the value. Chunks are written as
    ``SplitTrain/train_<n>.pickle`` in the directory containing the corpus
    file; the ``SplitTrain`` directory is created if it does not exist.

    :param train_corpus_path: input path of the JSON-lines corpus
    :param chunk_size: number of corpus lines stored per pickle file
    :param max_files: stop reading after this many full chunks have been
        written (pass None to split the whole file regardless of size)
    :return: None
    :raises ValueError: if chunk_size is smaller than 1
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # os.path.dirname also copes with a bare file name (no '/' in the path),
    # where rsplit('/', 1)[0] would have returned the file name itself.
    output_dir = os.path.join(os.path.dirname(train_corpus_path), "SplitTrain")
    os.makedirs(output_dir, exist_ok=True)

    def _dump_chunk(chunk, chunk_no):
        # Serialise one chunk dictionary to SplitTrain/train_<chunk_no>.pickle.
        chunk_path = os.path.join(output_dir, "train_" + str(chunk_no) + ".pickle")
        with open(chunk_path, 'wb') as train_file:
            pickle.dump(chunk, train_file, protocol=pickle.HIGHEST_PROTOCOL)

    train_data_map = {}
    file_no = 0
    # Lines are counted separately from len(train_data_map) because a repeated
    # article_id overwrites its earlier entry but still consumes a line.
    line_count = 0
    with open(train_corpus_path, 'r', encoding='utf-8') as train_data_file:
        for line_data in train_data_file:
            if not line_data.strip():
                # A blank line is not valid JSON; skip it instead of crashing.
                continue
            line_map = json.loads(line_data)
            article_id = line_map.pop('article_id')
            train_data_map[article_id] = line_map
            line_count += 1
            if line_count >= chunk_size:
                _dump_chunk(train_data_map, file_no)
                print("File ", file_no, " Done")
                train_data_map = {}
                file_no += 1
                line_count = 0
                if max_files is not None and file_no >= max_files:
                    break

    # Write the final partial chunk. An empty chunk is only written when
    # nothing else was, so train_0.pickle always exists after a call.
    if line_count > 0 or file_no == 0:
        _dump_chunk(train_data_map, file_no)

In [71]:
# Absolute location of the raw training corpus on the authoring machine.
# The loading cell below derives the SplitTrain directory from this value,
# so this is the one line to edit when running the notebook elsewhere.
train_corpus_path = "/media/kaushik/Studies/IIITH/3_ThirdSem/IRE/Major Project/arxiv-release/arxiv-release/train.txt"
# One-off preprocessing step, left disabled: uncomment to (re)generate the
# SplitTrain/train_<n>.pickle chunks from the raw corpus.
# read_data_and_split(train_corpus_path)

In [72]:
# Load the first pickled chunk (train_0.pickle) from the SplitTrain directory
# that sits next to the raw corpus file.
data_path = train_corpus_path.rsplit('/', 1)[0] + "/SplitTrain"
data_map = {}
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load chunk files that were produced locally by read_data_and_split.
with open(data_path + "/" + "train_0.pickle", 'rb') as handle:
    data_map = pickle.load(handle)

In [73]:
def get_sentences_with_metadata(data_map, article_id=None):
    """
    Flatten a single article into its sentences and their section names.

    Only one article is processed per call. By default this is the first
    article in the iteration order of ``data_map``; pass ``article_id`` to
    select a specific one.

    :param data_map: dict mapping article id -> article dict holding
        'sections' (an iterable of sections, each an iterable of lines) and
        'section_names' (indexable in parallel with 'sections')
    :param article_id: key of the article to flatten, or None for the first
    :return: tuple (list_of_sentences, sentence_metadata); the two lists have
        equal length and sentence_metadata[k] is the section name of
        list_of_sentences[k]. Both are empty when data_map is empty.
    :raises KeyError: if article_id is given but missing from data_map
    """
    if article_id is None:
        if not data_map:
            return [], []
        article = next(iter(data_map.values()))
    else:
        article = data_map[article_id]

    section_data = article['sections']
    section_names = article['section_names']

    list_of_sentences = []
    sentence_metadata = []
    for i, section in enumerate(section_data):
        for line in section:
            list_of_sentences.append(line)
            # Indexing (rather than zip) keeps a missing section name a loud
            # IndexError instead of silently dropping sentences.
            sentence_metadata.append(section_names[i])
    return list_of_sentences, sentence_metadata

In [74]:
def is_ascii(word):
    """
    Tell whether a token can be encoded as ASCII
    :param word: token
    :return: True if encoding to ASCII succeeds, False otherwise
    """
    try:
        word.encode('ascii')
    except UnicodeEncodeError:
        # At least one character falls outside the ASCII range.
        return False
    else:
        return True

In [75]:
# Translation table that turns each listed punctuation character into a space.
# Built once at definition time instead of on every call. Note that '.' and
# '@' are not in the list, so they survive translation.
_PUNCTUATION_TABLE = str.maketrans(dict.fromkeys('!"\'()*+,;<>[\\]^`{|}~:=%&_#?-$/', ' '))

# Lazily filled cache of the English stopword set (see _english_stop_words).
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the English stopword set, loading it from NLTK only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def get_processed_tokens(sentence):
    """
    Tokenise a sentence and drop tokens that carry no content.

    The sentence is lower-cased, the listed punctuation characters are
    replaced by spaces and the result is split on whitespace. A token is kept
    only if it is not an English stopword, is pure ASCII, contains no '@'
    and is longer than one character.

    :param sentence: sentence as a single string
    :return: list of cleaned tokens, in their original order
    """
    tokens = sentence.lower().translate(_PUNCTUATION_TABLE).split()
    stop_words = _english_stop_words()
    cleaned_tokens = [
        word for word in tokens
        if word not in stop_words and is_ascii(word) and '@' not in word and len(word) > 1
    ]
    return cleaned_tokens

In [76]:
def make_processed_sentences(list_of_sentences):
    """
    Convert every sentence into its list of cleaned tokens
    :param list_of_sentences: sentences, each either a string or a list of
        strings (a list is joined with single spaces before tokenising)
    :return: list with one token list per input sentence, in input order
    """
    def _as_text(item):
        # Lists of fragments are glued into one string; anything else is
        # passed through untouched.
        return " ".join(item) if isinstance(item, list) else item

    return [get_processed_tokens(_as_text(item)) for item in list_of_sentences]

In [77]:
def get_no_of_common_word(sentence1, sentence2):
    """
    Count the matching token pairs between two tokenised sentences.

    Every (token from sentence1, token from sentence2) pair with equal tokens
    counts once, so a word occurring a times in sentence1 and b times in
    sentence2 contributes a * b. The result is symmetric in its arguments.

    :param sentence1: iterable of hashable tokens
    :param sentence2: iterable of hashable tokens
    :return: number of equal token pairs (0 if either sentence is empty)
    """
    # Multiplying per-word frequencies gives the same total as comparing
    # every pair, in linear instead of quadratic time.
    counts1 = Counter(sentence1)
    counts2 = Counter(sentence2)
    return sum(count * counts2[word] for word, count in counts1.items())

In [78]:
def make_graph(processed_sentences):
    """
    Build the sentence-similarity graph as a dense adjacency matrix.

    Entry [i][j] holds get_no_of_common_word for sentences i and j; the
    diagonal stays 0 so a sentence never scores against itself.

    :param processed_sentences: list of token lists, one per sentence
    :return: float numpy array of shape (n, n), n = len(processed_sentences)
    """
    sentence_total = len(processed_sentences)
    sentence_graph = np.zeros(shape=(sentence_total, sentence_total))
    for i in range(sentence_total):
        sentence1 = processed_sentences[i]
        # The overlap count does not depend on argument order, so each pair
        # is evaluated once and mirrored across the diagonal.
        for j in range(i + 1, sentence_total):
            weight = get_no_of_common_word(sentence1, processed_sentences[j])
            sentence_graph[i][j] = weight
            sentence_graph[j][i] = weight
    return sentence_graph

In [79]:
def calculate_scores(sentence_graph):
    """
    Score every sentence by the total weight of its row in the graph
    :param sentence_graph: square adjacency matrix (one row per sentence)
    :return: 1-D float numpy array, scores[i] = sum of row i
    """
    row_totals = [sum(row) for row in sentence_graph]
    return np.array(row_totals, dtype=np.float64)

In [86]:
def rank_sentences_and_make_summary(sentences, sentence_graph, scores, top_n=3):
    """
    Rank sentences by score and return the best ones as the summary.

    The full score vector is printed, followed by index, score and text of
    each selected sentence, highest score first.

    :param sentences: sentences, indexable in parallel with scores
    :param sentence_graph: adjacency matrix of the sentence graph (not used
        by the ranking itself; kept in the signature for existing callers)
    :param scores: one score per sentence
    :param top_n: maximum number of sentences to put in the summary; fewer
        are returned when fewer sentences exist
    :return: list of the selected sentences, best first
    """
    # argsort is ascending, so reverse it to get the highest scores first.
    ordered_sentences = np.argsort(scores)[::-1]
    print(scores)
    summary = []
    # Slicing clamps to the number of available sentences, so short
    # documents no longer raise IndexError.
    for index in ordered_sentences[:max(top_n, 0)]:
        print(index, scores[index])
        print(sentences[index])
        summary.append(sentences[index])
    return summary

In [87]:
# End-to-end run over the loaded chunk:
# raw sentences -> cleaned tokens -> overlap graph -> per-sentence scores -> ranking.
list_of_sentences, sentence_metadata = get_sentences_with_metadata(data_map)
processed_sentences = make_processed_sentences(list_of_sentences)
sentence_graph = make_graph(processed_sentences)
sentence_scores = calculate_scores(sentence_graph)
# The original (unprocessed) sentences are passed in, so the printed summary is readable text.
summary = rank_sentences_and_make_summary(list_of_sentences, sentence_graph, sentence_scores)
# Alternative kept for debugging: rank the token lists instead of the raw sentences.
# summary = rank_sentences_and_make_summary(processed_sentences, sentence_graph, sentence_scores)

[166. 106. 154. 474. 292. 223. 503. 371.  90.  18.   8. 125.  51. 113.
 108. 102.  42. 235. 100.  69. 149.  71.  17.  64.  76.   8. 295.  37.
  25. 329.  25.  90.  66.  18. 202.  46.  15. 229.   9.  67.  20. 179.
  38.  78.  64. 242. 149.  48.  25.  50.  42.  56.  82. 127. 107. 111.
 112.  28.  42.  79.  12. 434. 127. 145.  47. 154. 137.  80.  51. 396.
  19.  15.  98.  14. 186.  80.  52.  57. 139.  60. 409.  87.  60.  88.
  39.  62.  63.  73. 236. 391. 129. 260.  23. 195.  14. 124. 102.  46.
  56. 319.  59.  77. 177. 258. 116. 227.  13.   7.  79.  72.   1.   7.
  53.  21.   0. 305.  77.  17.  10.  51.  12.  35. 315. 102.   4.  52.
   1.   0.]
6 503.0
such regularized kernel based methods are now often called support vector machines ( svms ) , although the notation was historically used for such methods based on the special hinge loss function and for special kernels only , we refer to @xcite .    in this paper we address the open question , whether an svm with an additive kernel can pr