In [2]:
# One-time setup: fetch the NLTK data packages this notebook uses.
# nltk.download() is a no-op (reports "already up-to-date") when the package
# is already present in the local nltk_data directory.
import nltk
nltk.download('punkt')      
nltk.download('stopwords') 


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\udday\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\udday\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import string


# NOTE(review): these two downloads repeat the ones in the previous cell.
# They are harmless (NLTK reports the packages as already up-to-date) and let
# this cell run on its own, but one of the two copies could be dropped.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\udday\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\udday\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def preprocess_text(text):
    """Split text into sentences and reduce each sentence to its content words.

    Args:
        text: Raw input string.

    Returns:
        A ``(sentences, words)`` tuple. ``sentences`` is the list of sentence
        strings returned by ``sent_tokenize``. ``words`` is a parallel list:
        ``words[i]`` holds the lower-cased tokens of ``sentences[i]`` with
        English stop words and punctuation-only tokens removed.
    """
    sentences = sent_tokenize(text)

    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    def is_content_word(token):
        # `token in string.punctuation` would be a *substring* test, which
        # misses multi-character punctuation tokens that word_tokenize emits
        # (e.g. "``", "''", "...", "--"). Check character by character so any
        # token made up solely of punctuation is dropped.
        if all(char in punctuation_chars for char in token):
            return False
        return token not in stop_words

    words = [
        [token for token in word_tokenize(sentence.lower()) if is_content_word(token)]
        for sentence in sentences
    ]
    return sentences, words


In [5]:
def calculate_word_frequencies(words):
    """Count how often each token occurs across all sentences.

    Args:
        words: Iterable of token lists, one list per sentence.

    Returns:
        A ``Counter`` mapping each token to its total number of occurrences.
    """
    frequencies = Counter()
    for sentence_tokens in words:
        # Counter.update adds counts rather than replacing them.
        frequencies.update(sentence_tokens)
    return frequencies


In [6]:
def identify_significant_words(word_freq, threshold=1):
    """Select the words that occur more often than ``threshold``.

    Args:
        word_freq: Mapping of word -> occurrence count.
        threshold: A word is kept only when its count is strictly greater
            than this value.

    Returns:
        A set of the words whose count exceeds ``threshold``.
    """
    selected = set()
    for token, count in word_freq.items():
        if count > threshold:
            selected.add(token)
    return selected


In [7]:
def score_sentences(sentences, words, significant_words):
    """Score each sentence by its runs of consecutive significant words.

    A run is a maximal stretch of adjacent tokens that are all in
    ``significant_words``. A sentence's score is the sum of the squared
    lengths of its runs, so longer runs weigh more than scattered hits.

    Args:
        sentences: Sentence strings; ``sentences[i]`` pairs with ``words[i]``.
        words: Token lists, one per sentence.
        significant_words: Container of tokens considered significant.

    Returns:
        A list of ``(score, sentence)`` tuples in the same order as ``words``.
    """
    scored = []
    for index, tokens in enumerate(words):
        total = 0
        run_length = 0
        for token in tokens:
            if token in significant_words:
                run_length += 1
                continue
            # A non-significant token closes the current run (if any).
            total += run_length * run_length
            run_length = 0
        # Account for a run that reaches the end of the sentence.
        total += run_length * run_length
        scored.append((total, sentences[index]))
    return scored


In [8]:
def luhn_summarize(text, summary_length=3, threshold=1):
    """Build an extractive summary from the highest-scoring sentences.

    The text is tokenized, word frequencies are counted, words occurring more
    than ``threshold`` times are treated as significant, and every sentence is
    scored by ``score_sentences``. The ``summary_length`` best sentences are
    then joined with single spaces.

    Args:
        text: Raw input string to summarize.
        summary_length: Maximum number of sentences in the summary. Values
            less than or equal to zero produce an empty summary.
        threshold: Frequency a word must exceed to count as significant.

    Returns:
        The summary string. The selected sentences appear in the order they
        occur in ``text``; it is empty when no sentences are selected.
    """
    # A negative length would otherwise slice from the end ([:-k]) and return
    # almost the whole document instead of nothing.
    if summary_length <= 0:
        return ''

    sentences, words = preprocess_text(text)
    word_freq = calculate_word_frequencies(words)
    significant_words = identify_significant_words(word_freq, threshold)
    sentence_scores = score_sentences(sentences, words, significant_words)

    # Rank sentence positions by score. sorted() is stable, so equally scored
    # sentences keep their document order.
    ranked_positions = sorted(
        range(len(sentence_scores)),
        key=lambda position: sentence_scores[position][0],
        reverse=True,
    )

    # Emit the chosen sentences in document order so the summary reads
    # coherently rather than in descending-score order.
    chosen_positions = sorted(ranked_positions[:summary_length])
    return ' '.join(sentence_scores[position][1] for position in chosen_positions)
