In [5]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF

import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
import re
import os
import math
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet

from collections import defaultdict
from rouge import Rouge
from pprint import pprint

In [2]:
def preprocess_text(text, tokens=True, stop_words=True):
    """Strip punctuation from ``text``, tokenize it and lowercase the tokens.

    Parameters
    ----------
    text : str
        Raw input text.
    tokens : bool, default True
        If true, return the list of tokens; otherwise return the tokens
        joined by single spaces.
    stop_words : bool, default True
        If true, drop NLTK English stop words; otherwise keep every token.

    Returns
    -------
    list of str or str
        Lowercased tokens, as a list or as one space-joined string
        depending on ``tokens``.
    """
    # Delete every run of characters other than ASCII letters, digits and spaces.
    clean_text = re.sub(r'[^a-zA-Z0-9 ]+', '', text)
    lowered = [word.lower() for word in word_tokenize(clean_text)]

    if stop_words:
        # Keep the stop-word set under its own name: rebinding ``stop_words``
        # to the (non-empty) set would make the flag always truthy, so
        # callers could never switch the filtering off.
        english_stop_words = set(stopwords.words('english'))
        tokens_list = [word for word in lowered if word not in english_stop_words]
    else:
        tokens_list = lowered

    return tokens_list if tokens else ' '.join(tokens_list)

In [8]:
def compute_word_distribution(words):
    """Return a dict mapping each distinct word to its relative frequency.

    The frequency of a word is its number of occurrences in ``words``
    divided by ``len(words)``.
    """
    total = float(len(words))
    vocabulary, occurrences = np.unique(words, return_counts=True)
    frequencies = occurrences / total
    return {word: freq for word, freq in zip(vocabulary, frequencies)}

def get_KL_summary(text, eps=1e-10, summary_size=3):
    """Build an extractive summary from the sentences closest to the document.

    Each sentence is scored by the KL divergence ``KL(doc || sentence)``
    between the word distribution of the whole text and the word
    distribution of that sentence. The ``summary_size`` sentences with the
    lowest divergence are selected.

    Parameters
    ----------
    text : str
        Document to summarise.
    eps : float, default 1e-10
        Smoothing constant added to every sentence probability so that words
        absent from a sentence do not cause a division by zero.
    summary_size : int, default 3
        Maximum number of sentences in the summary.

    Returns
    -------
    str
        The selected sentences joined by single spaces, ordered from lowest
        to highest divergence (not by position in the text). Empty string
        when the text contains no sentences.
    """
    doc_word_dist = compute_word_distribution(preprocess_text(text))

    sentences = sent_tokenize(text)
    sentence_scores = []

    for sentence in sentences:
        sentence_word_dist = compute_word_distribution(preprocess_text(sentence))

        # Sum over the document *vocabulary*, one term per distinct word.
        # Iterating over the raw token list would count each word once per
        # occurrence, weighting its term by count * p_doc instead of p_doc.
        # ``eps`` is applied exactly once, whether or not the word occurs in
        # the sentence.
        sentence_score = sum(
            p_doc * math.log(p_doc / (sentence_word_dist.get(word, 0.0) + eps))
            for word, p_doc in doc_word_dist.items()
        )
        sentence_scores.append(sentence_score)

    # Lowest divergence first; a stable sort makes ties resolve to the
    # sentence that appears earlier in the text.
    best_sentences_idx = np.argsort(sentence_scores, kind='stable')[:summary_size]
    return ' '.join(sentences[i] for i in best_sentences_idx)

In [43]:
# Read 'cleaned_output.csv' (path relative to the working directory) into a DataFrame.
df = pd.read_csv('cleaned_output.csv')
# Select the value at positional row 250 of the 'text' column as the sample text.
a = df['text'].iloc[250]
# Bare expression as the last line so the notebook displays the selected value.
a


'DYNAMITE LOS ANGELES "TIMES" Many Employes Killed and Los Angeles Times Build ing a Total Loss With Property Damage to Amount of Half a Million—Loss is Accredited to Labor Unions With Which it Had Waged an Unceasing Warfare. Los Angeles, CaL, Oct. I.—The building and plain of the Los Angeles Times, one of the best known newspa pers of the southwest, of which General Harrison Gray Otis is editor and principal owner, was eompeltely de stroyed by an explosion and tire shortly after 1 oYlock this morning, resulting in the death of upwards of twen ty employes and a financial loss of nearly half a million dollars. The management of the paper lays the blame of the explosion on the labor unions, with which organ izations the paper lias long been engaged in a bitter warfare. It is charged that unknown par-1 ties set off a heavy charge of dvna-1 mite in a blind alley which ran into j the center of the building, complete ly wrecking the interior and setting it on fire from roof to basement. The 

In [44]:
# Number of sentences to keep in the KL summary.
num_sentences = 1

# Summarise the sample text `a` with an explicit smoothing constant.
predicted_summary_kl = get_KL_summary(a, eps=0.000001, summary_size=num_sentences)

# The header carries no running index: this cell has no loop variable, and
# relying on an `i` left over from some other cell breaks Restart & Run All
# and labels the text with an unrelated number.
print(f"Original Text: \n{a}...")
print(len(a))
print("\n---------------------------------\n")
print("\nKL Summary:")
print(predicted_summary_kl)
print(len(predicted_summary_kl))


Original Text 13521: 
DYNAMITE LOS ANGELES "TIMES" Many Employes Killed and Los Angeles Times Build ing a Total Loss With Property Damage to Amount of Half a Million—Loss is Accredited to Labor Unions With Which it Had Waged an Unceasing Warfare. Los Angeles, CaL, Oct. I.—The building and plain of the Los Angeles Times, one of the best known newspa pers of the southwest, of which General Harrison Gray Otis is editor and principal owner, was eompeltely de stroyed by an explosion and tire shortly after 1 oYlock this morning, resulting in the death of upwards of twen ty employes and a financial loss of nearly half a million dollars. The management of the paper lays the blame of the explosion on the labor unions, with which organ izations the paper lias long been engaged in a bitter warfare. It is charged that unknown par-1 ties set off a heavy charge of dvna-1 mite in a blind alley which ran into j the center of the building, complete ly wrecking the interior and setting it on fire from r