In [1]:
import os
import sys
import json
import pickle
from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
import numpy as np
import pandas as pd

In [3]:
def read_data_and_split(train_corpus_path, chunk_size=20000, max_files=11):
    """
    Read the full training data and split it into smaller pickled chunks.

    Every non-blank line of the input is parsed as a JSON object that must
    contain an 'article_id' key. Records are grouped into dictionaries mapping
    article_id to the remaining fields of the record, and each group is pickled
    to <directory of the input>/SplitTrain/train_<n>.pickle, n starting at 0.
    The SplitTrain directory is created when it does not exist.

    :param train_corpus_path: input path (one JSON object per line)
    :param chunk_size: number of records stored in each full chunk
    :param max_files: maximum number of full chunks to write; records beyond
        chunk_size * max_files are not read. None removes the limit.
    :return: None
    :raises ValueError: if chunk_size or max_files is smaller than 1
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if max_files is not None and max_files < 1:
        raise ValueError("max_files must be at least 1 or None")

    output_dir = os.path.join(os.path.dirname(train_corpus_path), "SplitTrain")
    os.makedirs(output_dir, exist_ok=True)

    def dump_chunk(chunk, chunk_no):
        # Single place that knows the chunk naming scheme.
        chunk_path = os.path.join(output_dir, "train_" + str(chunk_no) + ".pickle")
        with open(chunk_path, 'wb') as train_file:
            pickle.dump(chunk, train_file, protocol=pickle.HIGHEST_PROTOCOL)

    train_data_map = {}
    file_no = 0
    line_count = 0
    with open(train_corpus_path, 'r') as train_data_file:
        for line_data in train_data_file:
            # Blank lines (e.g. a trailing newline) carry no record.
            if not line_data.strip():
                continue
            line_map = json.loads(line_data)
            article_id = line_map.pop('article_id')
            train_data_map[article_id] = line_map
            line_count += 1
            if line_count == chunk_size:
                dump_chunk(train_data_map, file_no)
                print("File ", file_no, " Done")
                train_data_map = {}
                file_no += 1
                line_count = 0
                if max_files is not None and file_no >= max_files:
                    break

    # Write the remainder. An empty remainder is only written when nothing has
    # been written at all, so an empty input still produces train_0.pickle
    # while no empty trailing chunk is created after the last full one.
    if train_data_map or file_no == 0:
        dump_chunk(train_data_map, file_no)

In [46]:
# Location of the training corpus. Set the ARXIV_TRAIN_PATH environment variable
# to point at another copy; otherwise the original local path is used.
train_corpus_path = os.environ.get(
    "ARXIV_TRAIN_PATH",
    "/media/kaushik/Studies/IIITH/3_ThirdSem/IRE/Major Project/arxiv-release/arxiv-release/train.txt",
)
# One-off preprocessing step: uncomment to (re)create the SplitTrain/train_<n>.pickle chunks.
# read_data_and_split(train_corpus_path)

In [6]:
# Folder with the pickled chunks: a "SplitTrain" directory next to the corpus file.
data_path = "/".join((train_corpus_path.rsplit('/', 1)[0], "SplitTrain"))
# Bind the name to an empty mapping first so it exists even if the load below fails.
data_map = {}
first_chunk_path = "/".join((data_path, "train_0.pickle"))
with open(first_chunk_path, 'rb') as chunk_file:
    # Only the first chunk (train_0) is loaded here.
    data_map = pickle.load(chunk_file)

In [8]:
# Gather every part of one article except its 'labels' and 'section_names' entries.
full_text = []
for article_id, data in data_map.items():
    full_text.extend(
        article_data
        for part, article_data in data.items()
        if part not in ('labels', 'section_names')
    )
    # Only the first article of the chunk is used.
    break

# Flatten one level: each element of each collected part becomes one entry.
list_of_sentences = []
for line in full_text:
    list_of_sentences.extend(line)

In [9]:
def is_ascii(word):
    """
    Tell whether a token can be encoded as ASCII.
    :param word: token (string)
    :return: True when every character is ASCII, False otherwise
    """
    try:
        word.encode('ascii')
    except UnicodeEncodeError:
        # At least one character falls outside the ASCII range.
        return False
    return True

In [10]:
# Translation table mapping each listed punctuation character to a space.
# Built once instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(dict.fromkeys('!"\'()*+,;<>[\\]^`{|}~:=%&_#?-$/', ' '))

# English stop words, loaded from NLTK on first use and then reused.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def get_processed_tokens(sentence):
    """
    Turn a sentence into a list of cleaned, lower-cased tokens.

    The sentence is lower-cased, the punctuation characters in the translation
    table are replaced by spaces and the result is split on whitespace. Tokens
    are dropped when they are English stop words, are not pure ASCII, contain
    '@', or are a single character long.
    :param sentence: sentence as a string
    :return: list of remaining tokens, in their original order
    """
    stop_words = _english_stop_words()
    tokens = sentence.lower().translate(_PUNCTUATION_TO_SPACE).split()
    cleaned_tokens = [
        word for word in tokens
        if word not in stop_words and is_ascii(word) and '@' not in word and len(word) > 1
    ]
    return cleaned_tokens

In [11]:
def make_processed_sentences(list_of_sentences):
    """
    Tokenise and clean every sentence with get_processed_tokens.
    :param list_of_sentences: iterable of sentences; an entry that is a list
        is first joined with single spaces into one string
    :return: list with one token list per input entry, in input order
    """
    return [
        get_processed_tokens(" ".join(entry) if isinstance(entry, list) else entry)
        for entry in list_of_sentences
    ]

In [22]:
def get_no_of_common_word(sentence1, sentence2):
    """
    Count the matching token pairs between two tokenised sentences.

    Every (token of sentence1, token of sentence2) pair that compares equal
    adds one, so a word repeated in both sentences is counted once per pair.
    :param sentence1: first sequence of tokens
    :param sentence2: second sequence of tokens
    :return: number of equal pairs
    """
    return sum(1 for left in sentence1 for right in sentence2 if left == right)

In [43]:
def make_graph(processed_sentences):
    """
    Build the sentence similarity matrix.

    Entry [i][j] holds get_no_of_common_word for sentences i and j; the
    diagonal stays 0. The overlap count does not depend on argument order, so
    each pair is computed once and mirrored into both halves of the matrix.
    :param processed_sentences: list of token lists
    :return: square numpy array with one row and column per sentence
    """
    sentence_count = len(processed_sentences)
    sentence_graph = np.zeros(shape=(sentence_count, sentence_count))
    for i in range(sentence_count):
        sentence1 = processed_sentences[i]
        # Only the upper triangle is computed; the lower one is its mirror image.
        for j in range(i + 1, sentence_count):
            weight = get_no_of_common_word(sentence1, processed_sentences[j])
            sentence_graph[i][j] = weight
            sentence_graph[j][i] = weight
    return sentence_graph

In [54]:
def calculate_scores(sentence_graph):
    """
    Score every sentence as the sum of its row in the sentence graph.
    :param sentence_graph: square matrix (numpy array or nested lists) of
        pairwise sentence weights
    :return: 1-D float numpy array with one score per row
    """
    graph = np.asarray(sentence_graph, dtype=float)
    if graph.size == 0:
        # No weights to add up: every row present scores zero.
        return np.zeros(len(sentence_graph))
    return graph.sum(axis=1)

[26. 21. 18. 33.]


In [58]:
def rank_sentences_and_make_summary(processed_sentences, sentence_graph, scores):
    """
    Placeholder: not implemented yet.

    The arguments are accepted but ignored.
    :param processed_sentences: unused for now
    :param sentence_graph: unused for now
    :param scores: unused for now
    :return: None
    """
    # TODO: implement the ranking and summary construction.
    return None

In [60]:
# Run the pipeline: clean the sentences, build the overlap graph, score, summarise.
processed_sentences = make_processed_sentences(list_of_sentences)
sentence_graph = make_graph(processed_sentences)
# Use calculate_scores, the scoring function defined in this notebook;
# calculate_scores_and_rank is not defined and fails on a fresh kernel.
sentence_scores = calculate_scores(sentence_graph)
summary = rank_sentences_and_make_summary(processed_sentences, sentence_graph, sentence_scores)

[4.120e+02 2.460e+02 3.560e+02 1.095e+03 6.460e+02 5.040e+02 1.146e+03
 8.220e+02 2.000e+02 5.000e+01 2.000e+01 2.800e+02 1.120e+02 2.520e+02
 2.400e+02 2.190e+02 1.000e+02 5.360e+02 2.150e+02 1.500e+02 3.300e+02
 1.540e+02 3.900e+01 1.380e+02 1.880e+02 3.200e+01 6.720e+02 8.500e+01
 5.400e+01 7.650e+02 5.500e+01 2.120e+02 1.440e+02 3.900e+01 4.630e+02
 1.030e+02 3.500e+01 5.010e+02 2.300e+01 1.500e+02 4.700e+01 4.010e+02
 8.600e+01 1.700e+02 1.360e+02 5.290e+02 3.420e+02 1.050e+02 5.600e+01
 1.060e+02 9.200e+01 1.220e+02 1.750e+02 2.750e+02 2.330e+02 2.480e+02
 2.430e+02 6.100e+01 9.100e+01 1.760e+02 2.800e+01 1.011e+03 2.800e+02
 3.200e+02 1.100e+02 3.370e+02 2.970e+02 1.730e+02 1.070e+02 8.860e+02
 5.400e+01 3.200e+01 2.100e+02 2.900e+01 4.040e+02 1.750e+02 1.170e+02
 1.520e+02 3.020e+02 1.340e+02 9.140e+02 1.920e+02 1.410e+02 1.870e+02
 8.700e+01 1.350e+02 1.430e+02 1.680e+02 5.200e+02 8.800e+02 2.770e+02
 5.660e+02 4.900e+01 4.360e+02 3.600e+01 2.700e+02 2.330e+02 9.900e+01
 1.270