In [1]:
### This is text summarization.
### Objective: to summarize an article and make sense of it.

In [2]:
from nltk.corpus import stopwords
import nltk
import re
import collections

In [3]:
def calculate_sentence_frequency(sentence, average_sentence_word_count):
    """
    Calculates the weighted frequency of every word in a single sentence.
    Parameters:
    1. sentence. A string containing multiple words.
    2. average_sentence_word_count. Threshold on the sentence length; the
       length is measured as len(sentence.split(" ")).
    Returns : word_frequencies (type = dict) mapping each token of the
    sentence to its count divided by the highest count in the sentence.
    An empty dict is returned when the sentence is not strictly shorter
    than average_sentence_word_count, or when it contains no tokens.
    """
    # Only sentences strictly shorter than the average are scored.
    if len(sentence.split(" ")) >= average_sentence_word_count:
        return {}

    counts = collections.Counter(nltk.word_tokenize(sentence))
    # An empty / whitespace-only sentence yields no tokens; calling max() on
    # an empty sequence would raise ValueError, so bail out early.
    if not counts:
        return {}

    max_word_frequency = max(counts.values())
    return {word: count / max_word_frequency for word, count in counts.items()}

In [4]:
def get_text_weighted_score(paragraph, average_word_count):
    """
    Scores every sentence of a text by adding up its word weights.
    Uses calculate_sentence_frequency(sentence, average_word_count).
    Parameters:
    1. paragraph. A list of sentences.
    2. average_word_count. Passed through unchanged to
       calculate_sentence_frequency for every sentence.
    Returns:
    1. sentence_scores (type = dict) mapping a sentence to the sum of its
       word weights. A sentence whose frequency dict is empty does not
       appear in the result; identical sentences share one accumulated entry.
    """
    totals = {}
    for sentence in paragraph:
        weights = calculate_sentence_frequency(sentence, average_word_count)
        # Add the weights one at a time, in dict order.
        for weight in weights.values():
            try:
                totals[sentence] += weight
            except KeyError:
                # First weight seen for this sentence.
                totals[sentence] = weight
    return totals

In [5]:
def main(text=None):
    """
    Ranks the sentences of a text by their weighted score.
    Uses get_text_weighted_score(cleaned_sentences, average_sentence_word_count).
    Parameters:
    1. text (optional). A string whose sentences are separated by ". ".
       When omitted, the built-in sample abstract below is used.
    Returns:
    1. sorted_sentences (type = list) of (original sentence, score) tuples,
       highest score first. Only sentences for which
       get_text_weighted_score returned a score are included.
    """
    STOPWORDS = set(stopwords.words('english'))
    STOPWORDS.add("-")

    ORIGINAL_TEXT = """
    A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution. Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work. The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power. As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers. The network itself requires minimal structure. Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.
    """
    if text is not None:
        ORIGINAL_TEXT = text

    # Split once and clean each sentence on its own, so that the original and
    # the cleaned version of a sentence always share the same list index.
    original_sentences = ORIGINAL_TEXT.split(". ")
    cleaned_sentences = []
    sum_word_count = 0
    for sentence in original_sentences:
        # Lower-case, replace every non-letter with a space, drop stop words.
        letters_only = re.sub('[^a-zA-Z]', ' ', sentence.lower())
        cleaned = ' '.join(word for word in letters_only.split() if word not in STOPWORDS)
        cleaned_sentences.append(cleaned)
        # Same length measure (split on a single space) that
        # calculate_sentence_frequency applies to each sentence.
        sum_word_count += len(cleaned.split(" "))

    average_sentence_word_count = sum_word_count / len(original_sentences)

    sentence_scores = get_text_weighted_score(cleaned_sentences, average_sentence_word_count)

    # Look scores up by the cleaned sentence rather than by position in
    # sentence_scores: unscored sentences are absent from that dict, so its
    # positions do not line up with the original sentence order.
    original_dict = {}
    for original, cleaned in zip(original_sentences, cleaned_sentences):
        if cleaned in sentence_scores:
            original_dict[original] = sentence_scores[cleaned]

    sorted_sentences = sorted(original_dict.items(), key=lambda x: x[1], reverse=True)
    return sorted_sentences

In [6]:
if __name__ == "__main__":
    # main() returns (sentence, score) pairs; keep only the sentence text,
    # in the order main() returned them.
    sorted_sentences = main()
    final_list = [entry[0] for entry in sorted_sentences]