# In [15]:
# summarize.py

# -*- coding: utf-8 -*-
"""summarize.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1y4HxABEY_nyJrfR-6zG015fyu-Z0zakJ
"""

import numpy as np
import heapq
import logging
import nltk
import argparse
import re
import os
from summarizer.model_processors import SingleModel

class ModelSelector(object):
    """Summarize every document found under ``../Data/summarized_data/content``.

    Instantiating the class runs the whole pipeline immediately: each file in
    each category sub-directory of the content directory is summarized and
    the result is written under ``../Data/summarized_data/summaries`` using
    the same ``<category>/<file name>`` layout. Both directories are resolved
    relative to the current working directory.
    """

    # Documents whose size on disk or decoded length is below this value are
    # not summarized; a placeholder text is written instead.
    min_length = 40

    # BERT summarizer, created on first use and then reused for every document
    # handled by the same instance.
    _bert_model = None

    def __init__(self):
        """Run the summarizer selected by the hard-coded choice below."""
        # '1' selects the BERT summarizer, '2' the vanilla (frequency-based) one.
        choose_bert_or_vanilla = '1'
        # choose_bert_or_vanilla = input('Please enter 1 to use the BERT summarizer or 2 for the Vanilla summarizer:\n')

        # BERT summarizer
        if choose_bert_or_vanilla == '1':
            print('Welcome to the BERT Summarizer!\n')
            self.process_documents("bert")
            print('Processing finished...!')

        # Vanilla summarizer
        elif choose_bert_or_vanilla == '2':
            print('Welcome to the Vanilla Summarizer!\n')
            self.process_documents("vanilla")
            print('Processing finished...!')
        else:
            print('\nMust choose from 1 or 2')

    def process_documents(self, model_type):
        """Summarize every document of every category directory.

        Args:
            model_type: ``"bert"`` to use the BERT summarizer; any other value
                selects the vanilla summarizer.
        """
        content_dir = os.path.join(
            os.getcwd(), '..', 'Data', 'summarized_data', 'content')
        for category in os.listdir(content_dir):
            category_path = os.path.join(content_dir, category)
            # Plain files sitting next to the category directories are skipped.
            if os.path.isfile(category_path):
                continue
            for document in os.listdir(category_path):
                print('Doc ', document)
                self.summarize_text(model_type, document, category_path)

    def summarize_text(self, model_type, doc, path):
        """Summarize one document and write the result to the summaries tree.

        Args:
            model_type: ``"bert"`` to use the BERT summarizer; any other value
                selects the vanilla summarizer.
            doc: File name of the document inside ``path``.
            path: Category directory containing ``doc``; its last component is
                reused as the output sub-directory name.
        """
        document = os.path.join(path, doc)
        with open(document, 'r') as source:
            text_data = source.read()

        too_short = (os.path.getsize(document) < self.min_length
                     or len(text_data) < self.min_length)
        if too_short:
            summary = "File was empty!"
        elif model_type == "bert":
            # Build the model once instead of once per document.
            # NOTE(review): assumes a SingleModel instance can be called
            # repeatedly on different texts - confirm against the library.
            if self._bert_model is None:
                self._bert_model = SingleModel()
            summary = self._bert_model(text_data)
        else:
            summary = self.vanilla(text_data)

        category = os.path.basename(os.path.normpath(path))
        output_dir = os.path.join(
            os.getcwd(), '..', 'Data', 'summarized_data', 'summaries', category)
        # The category directory may not exist yet on the output side.
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, doc), 'w') as summary_output:
            summary_output.write('\n\nSUMMARY:\n' + summary)

    def vanilla(self, text_data):
        """Return an extractive summary based on normalized word frequencies.

        Each sentence shorter than 30 space-separated tokens is scored by the
        sum of the relative frequencies of its non-stopword words; the four
        best-scoring sentences are returned, highest score first.

        Args:
            text_data: Full text of the document.

        Returns:
            The selected sentences joined by single spaces, or an empty
            string when the text contains no scorable word.
        """
        # Drop bracketed numeric references such as "[12]" and collapse whitespace.
        text_data = re.sub(r'\[[0-9]*\]', ' ', text_data)
        text_data = re.sub(r'\s+', ' ', text_data)

        # Letters-only copy of the text, used solely for counting words.
        processed_article = re.sub('[^a-zA-Z]', ' ', text_data)
        processed_article = re.sub(r'\s+', ' ', processed_article)

        sentence_list = nltk.sent_tokenize(text_data)
        stopwords = set(nltk.corpus.stopwords.words('english'))

        # Count on lower-cased tokens: the stopword list is lower-case and the
        # scoring loop below looks words up in lower case as well.
        word_frequencies = {}
        for word in nltk.word_tokenize(processed_article.lower()):
            if word not in stopwords:
                word_frequencies[word] = word_frequencies.get(word, 0) + 1

        # Nothing to score (empty text, or only stopwords / non-letters).
        if not word_frequencies:
            return ''

        maximum_frequency = max(word_frequencies.values())
        word_frequencies = {
            word: count / maximum_frequency
            for word, count in word_frequencies.items()
        }

        sentence_scores = {}
        for sent in sentence_list:
            # Long sentences are never candidates for the summary.
            if len(sent.split(' ')) >= 30:
                continue
            for word in nltk.word_tokenize(sent.lower()):
                if word in word_frequencies:
                    sentence_scores[sent] = (
                        sentence_scores.get(sent, 0) + word_frequencies[word])

        summary_sentences = heapq.nlargest(
            4, sentence_scores, key=sentence_scores.get)
        return ' '.join(summary_sentences)

# ModelSelector()