#text summarisation using word frequency

In [None]:
#word frequency text summarization
from collections import Counter
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords 
from string import punctuation
from heapq import nlargest

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# compute the word frequency
# compute_word_frequencies() expresses the frequency of each word as a
# proportion of the frequency of the most common word
# MIN_WORD_PROP - lower bound on that proportion, used when filtering words
# MAX_WORD_PROP - upper bound on that proportion, used when filtering words
# STOPWORDS - English stop words plus every punctuation character
STOPWORDS=set(stopwords.words('english'))|set(punctuation)
MIN_WORD_PROP=0.1
MAX_WORD_PROP=0.9
def compute_word_frequencies(word_sentences):
  """Return the relative frequency of every non-stopword token.

  Each word's count is divided by the count of the most common word, so the
  relative frequencies lie in (0, 1]. Words whose relative frequency is not
  strictly between MIN_WORD_PROP and MAX_WORD_PROP are dropped.

  Args:
    word_sentences: iterable of sentences, each an iterable of word tokens.

  Returns:
    dict mapping word -> relative frequency. Empty when there are no
    non-stopword tokens or when no word falls inside the allowed band.
  """
  counter=Counter(
    word
    for sentence in word_sentences
    for word in sentence
    if word not in STOPWORDS
  )
  # max() raises ValueError on an empty sequence, so bail out early when
  # every token was a stopword (or there was no input at all).
  if not counter:
    return {}
  limit=float(max(counter.values()))
  word_frequencies={word:count/limit for word,count in counter.items()}
  # drop words if too common or too uncommon; the filter must run on the
  # normalised proportions, not on the raw counts, otherwise nothing with a
  # count >= 1 could ever be below MAX_WORD_PROP.
  return {
    word:freq
    for word,freq in word_frequencies.items()
    if MIN_WORD_PROP<freq<MAX_WORD_PROP
  }

In [None]:
# score a sentence by adding up the frequencies of its words
def sentence_score(word_sentence,word_frequencies):
  """Return the sum of word_frequencies values for the tokens in word_sentence.

  Tokens that are not keys of word_frequencies contribute 0.
  """
  total=0
  for token in word_sentence:
    total+=word_frequencies.get(token,0)
  return total

In [None]:
# summarize the text and return the top-scoring sentences (three by default)
def summarize(text:str,num_sentences=3):
  """Return the num_sentences highest-scoring sentences of text.

  Sentences are scored by summing the relative frequencies of their words
  (see compute_word_frequencies and sentence_score).

  Args:
    text: the document to summarize.
    num_sentences: how many sentences to return (default 3).

  Returns:
    list of sentences in their original casing, ordered from highest to
    lowest score. Empty when text contains no sentences.
  """
  # Split into sentences BEFORE lower-casing: the sentence splitter can use
  # capitalisation as a boundary cue, and the caller gets readable sentences
  # back instead of lower-cased ones.
  sentences=sent_tokenize(text)
  if not sentences:
    # Nothing to rank; also avoids computing frequencies over no words.
    return []
  # Lower-case only for counting, so "The" and "the" are the same word.
  word_sentences=[word_tokenize(sentence.lower()) for sentence in sentences]
  word_frequencies=compute_word_frequencies(word_sentences)
  scores=[sentence_score(word_sentence,word_frequencies) for word_sentence in word_sentences]
  # rank the sentences by score
  top_sentence_scores=nlargest(num_sentences,zip(sentences,scores),key=lambda pair:pair[1])
  # return only the sentence text of the winners
  return [sentence for sentence,_ in top_sentence_scores]

In [None]:
# load the whole articles file into memory as one string
with open('/content/articles.txt') as source:
  article=source.read()

In [None]:
# show the loaded article text (a bare expression is displayed by the notebook cell)
article



In [None]:
# count the sentences found in the article
len(sent_tokenize(article))

33714

In [None]:
# summarize the article with the default num_sentences (3)
summarize(article)

["barclays' defiance of us fines has merit barclays disgraced itself in many ways during the pre-financial crisis boom years.",
 'so it is tempting to think the bank, when asked by us department of justice to pay a large bill for polluting the financial system with mortgage junk between 2005 and 2007, should cough up, apologise and learn some humility.',
 'that is not the view of the chief executive, jes staley.']

In [None]:
# ask for a single-sentence summary
summarize(article,num_sentences=1)

["barclays' defiance of us fines has merit barclays disgraced itself in many ways during the pre-financial crisis boom years."]