In [1]:
import pandas as pd

In [36]:
import nltk
from nltk.cluster.util import cosine_distance
from scipy.special import comb
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize

In [4]:
from sklearn.metrics.pairwise import pairwise_kernels

In [48]:
import lexrank
import itertools

from nltk.stem.wordnet import WordNetLemmatizer

import re

from sklearn.metrics.pairwise import cosine_similarity

In [43]:
## Lemmatizer can be replaced by PorterStemmer fot better time-efficiency
lemmatizer = WordNetLemmatizer()

def normalize_and_tokenize(text, stemmer = lemmatizer.lemmatize):
    tokenized_words = word_tokenize(text)
    filtered_words = [stemmer(word.replace(r'[^a-zA-Z]', '')) for word in tokenized_words]
    
    return (' '.join(filtered_words)).split('.')

In [59]:
def word_vectorize(sents, tfidf = True, ngram_range = (1, 1)):
    stop_words = nltk.corpus.stopwords.words('english')
    
    if tfidf:
        vectorizer = TfidfVectorizer(sublinear_tf = True, stop_words = stop_words, 
                                     analyzer = 'word', lowercase = True, ngram_range = ngram_range)
        return vectorizer.fit_transform(sents)

    else:
        vectorizer = CountVectorizer(tokenizer = word_tokenize, lowercase = True, 
                                     stop_words = stop_words, decode_error = 'ignore', 
                                     ngram_range = ngram_range)
        return vectorizer.fit_transform(sents)

In [1]:
def text_summarize(file_location, term_doc_matrix = None, req_sentences = 9, tfidf = False, ngram_range = (1,1), verbose = False, stopwords = None):
    with open(file_location, 'r') as file:
        data = file.readlines()
    text = data[0]
    if term_doc_matrix is None:
        if verbose: print('Reading document...')
        sents = sent_tokenize(text)
        if verbose: print('Fitting word vector...')
        term_doc_matrix = word_vectorize(sents, tfidf = tfidf, ngram_range = ngram_range)
    if verbose: print('Building similarity matrix...')
    similarity_matrix = pairwise_kernels(term_doc_matrix, metric = 'cosine')
    lexrank_object = lexrank.LexRank(sents, stopwords = lexrank.STOPWORDS['en'])
    text_summary = ' '.join(lexrank_object.get_summary(sents, summary_size = req_sentences))
    print('Text Summary of', req_sentences, 'lines :', '\n', text_summary)
    scores = sorted(lexrank_object.rank_sentences(sents), reverse = True)
    return {'term_doc_matrix' : term_doc_matrix, 'scores' : scores, 'text_summary' : text_summary}

In [63]:
text_summarize('./sample_text.txt', tfidf = True, verbose = True)

Reading document...
Fitting word vector...
Building similarity matrix...
Text Summary of 9 lines : 
 Data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning.Some implementations of machine learning use data and neural networks in a way that mimics the working of a biological brain.In its application across business problems, machine learning is also referred to as predictive analytics. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. It is seen as a part of artificial intelligence.Machine learning algorithms build a model based on sample data, known as training data, in order to make predictions or decisions without being explicitly programmed to do so.Machine learning algorithms are used in a wide variety of applications, such as in medicine, email filtering, speech recognition, agriculture, and computer vision, where it is difficult or unfeasible to develop co