In [114]:
import nltk
import string
import numpy as np

In [115]:
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize as nlkt_sent_tokenize
from nltk.tokenize import word_tokenize as nlkt_word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.corpus import stopwords
from scipy.spatial.distance import cosine

In [116]:
# Ensure required NLTK resources are downloaded
nltk.download('punkt')
# Recent NLTK releases load the Punkt sentence tokenizer from the
# 'punkt_tab' resource instead of the pickled 'punkt' package, so fetch it
# too; on older releases this call only reports that the package is unknown.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\K Ravinder
[nltk_data]     Reddy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\K Ravinder
[nltk_data]     Reddy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [117]:
# Cosine similarity score between two vectors
def similarity(v1, v2):
    """Return the cosine similarity of two vectors, rescaled to [0, 1].

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector (array-like of numbers), same length as ``v1``.

    Returns:
        ``0.0`` when either vector has no non-zero entries; otherwise
        ``(cos + 1) / 2`` where ``cos`` is the cosine similarity.
    """
    # An all-zero vector has no direction, so give it the minimum score
    # rather than asking SciPy for a distance.
    if np.count_nonzero(v1) == 0 or np.count_nonzero(v2) == 0:
        return 0.0

    cosine_similarity = 1 - cosine(v1, v2)  # SciPy returns a distance
    return (cosine_similarity + 1) / 2  # Scale from [-1,1] to [0,1]

In [118]:
# Tokenize text into raw sentences
def sent_tokenize(text):
    """Split ``text`` into sentences with NLTK, print them, and return them.

    Args:
        text: The document to split.

    Returns:
        The list of sentences produced by NLTK's sentence tokenizer.
    """
    raw_sentences = nlkt_sent_tokenize(text)
    print("Raw Sentences:", raw_sentences)
    return raw_sentences


In [119]:
# Clean and tokenize sentences
def cleanup_sentences(text):
    """Split ``text`` into sentences and normalise each one for scoring.

    Each sentence is word-tokenized, tokens consisting only of punctuation
    characters are dropped, the remaining tokens are lower-cased, and English
    stop words are removed (except for a small keep-list). One cleaned string
    is produced per input sentence, in the original order, so indices line up
    with the output of the sentence tokenizer on the same text.

    Args:
        text: The document to clean.

    Returns:
        A list of cleaned sentences, each a space-joined string of tokens.
        The list is also printed.
    """
    stop_words = set(stopwords.words('english'))
    keep_words = {'ai', 'machine', 'healthcare'}  # keep domain-specific words
    punctuation_chars = set(string.punctuation)

    sentences_cleaned = []
    for sent in nlkt_sent_tokenize(text):
        kept = []
        for token in nlkt_word_tokenize(sent):
            # Test character by character: ``token in string.punctuation`` is a
            # substring check and lets multi-character punctuation tokens such
            # as "``", "''", "..." or "--" through.
            if all(ch in punctuation_chars for ch in token):
                continue
            word = token.lower()
            if word not in stop_words or word in keep_words:
                kept.append(word)
        sentences_cleaned.append(" ".join(kept))
    print("Clean Sentences:", sentences_cleaned)
    return sentences_cleaned

In [120]:
# Get TF-IDF based relevant words
def get_tf_idf(sentences, threshold=0.3):
    """Select the "centroid" words of a document by summed TF-IDF weight.

    A count matrix is built over ``sentences``, converted to unnormalised
    TF-IDF, summed per term across all sentences, and scaled so the heaviest
    term has weight 1. Terms whose scaled weight exceeds ``threshold`` are
    returned.

    Args:
        sentences: List of (cleaned) sentence strings.
        threshold: Minimum scaled weight, exclusive, for a term to be kept.
            Defaults to 0.3.

    Returns:
        A list of the selected terms in vocabulary order. An empty list is
        returned when ``sentences`` is empty or yields no vocabulary at all.
    """
    if not sentences:
        return []

    vectorizer = CountVectorizer()
    try:
        sent_word_matrix = vectorizer.fit_transform(sentences)
    except ValueError as exc:
        # CountVectorizer raises when no sentence contributes a token (for
        # example every sentence was cleaned down to an empty string). Treat
        # that like empty input; re-raise anything else.
        if "empty vocabulary" not in str(exc):
            raise
        return []

    transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
    tfidf = transformer.fit_transform(sent_word_matrix).toarray()

    # Per-term weight summed over sentences, scaled relative to the top term.
    centroid_vector = tfidf.sum(0)
    centroid_vector = np.divide(centroid_vector, centroid_vector.max())

    feature_names = vectorizer.get_feature_names_out()
    relevant_vector_indices = np.where(centroid_vector > threshold)[0]

    word_list = list(feature_names[relevant_vector_indices])

    print("Centroid Words:", word_list)
    return word_list

In [121]:
# Build lookup dictionary of word embeddings
def word_vectors_cache(sentences, embedding_model):
    """Collect embeddings for every known word appearing in ``sentences``.

    Args:
        sentences: Iterable of sentence strings; each is word-tokenized and
            its tokens are lower-cased before lookup.
        embedding_model: Model exposing a ``wv`` attribute that supports
            ``in`` tests and item lookup by word.

    Returns:
        Dict mapping each lower-cased token found in ``embedding_model.wv``
        to its vector. The number of entries is printed.
    """
    lowered_tokens = (
        token.lower()
        for sentence in sentences
        for token in nlkt_word_tokenize(sentence)
    )
    cache = {
        token: embedding_model.wv[token]
        for token in lowered_tokens
        if token in embedding_model.wv
    }

    print("Loaded Word Vectors:", len(cache))
    return cache



In [122]:

# Build average embedding for a sentence
def build_embedding_representation(words, word_vectors, embedding_model):
    """Average the embeddings of the given words into a single vector.

    Args:
        words: Iterable of word strings; each is lower-cased before lookup.
        word_vectors: Dict mapping lower-cased words to their vectors.
        embedding_model: Object whose ``vector_size`` attribute gives the
            dimensionality of the result.

    Returns:
        A float32 array of length ``embedding_model.vector_size`` holding the
        mean of the vectors of all words found in ``word_vectors``, or all
        zeros when none of the words is found.
    """
    total = np.zeros(embedding_model.vector_size, dtype="float32")
    matched = 0

    for key in (word.lower() for word in words):
        if key not in word_vectors:
            continue  # words without a vector do not contribute
        total += word_vectors[key]
        matched += 1

    # Avoid dividing by zero when no word had a vector.
    return np.divide(total, matched) if matched != 0 else total


In [123]:
# Main summarizer function
def summarize(text, embedding_model, max_words=100, redundancy_threshold=0.95):
    """Produce an extractive summary of ``text`` by centroid similarity.

    Sentences are scored by the similarity between their averaged word
    embedding and the averaged embedding of the document's centroid words.
    The best-scoring sentences are then picked greedily, skipping any that is
    too similar to one already chosen, and emitted in original document order.

    Args:
        text: The document to summarize.
        embedding_model: Embedding model exposing ``wv`` and ``vector_size``.
        max_words: Word budget. Selection stops once the running token count
            of the chosen sentences exceeds this value. The check happens
            before each candidate is considered, so the summary can overshoot
            by the last sentence added. Defaults to 100.
        redundancy_threshold: A candidate is skipped when its [0, 1]-scaled
            similarity to an already selected sentence is above this value.
            Defaults to 0.95.

    Returns:
        The selected raw sentences joined with newlines. Intermediate results
        and the final summary are also printed.
    """
    raw_sentences = nlkt_sent_tokenize(text)
    clean_sentences = cleanup_sentences(text)

    for i, s in enumerate(raw_sentences):
        print(f"Raw {i}: {s}")
    for i, s in enumerate(clean_sentences):
        print(f"Clean {i}: {s}")

    centroid_words = get_tf_idf(clean_sentences)
    print(f"Centroid Words Count: {len(centroid_words)}", centroid_words)

    word_vectors = word_vectors_cache(clean_sentences, embedding_model)

    # Centroid vector
    centroid_vector = build_embedding_representation(centroid_words, word_vectors, embedding_model)

    # One (index, raw sentence, score, sentence vector) tuple per sentence.
    # Raw and clean sentences come from the same sentence split, so they pair
    # up position by position.
    sentences_scores = []
    for i, (raw, clean) in enumerate(zip(raw_sentences, clean_sentences)):
        words = nlkt_word_tokenize(clean)
        sentence_vector = build_embedding_representation(words, word_vectors, embedding_model)
        score = similarity(sentence_vector, centroid_vector)
        sentences_scores.append((i, raw, score, sentence_vector))

    # Sort by importance
    sentence_scores_sort = sorted(sentences_scores, key=lambda el: el[2], reverse=True)

    for s in sentence_scores_sort:
        print(f"{s[0]} | Score: {s[2]:.3f} | {s[1]}")

    # Select top sentences avoiding redundancy
    sentences_summary = []
    count = 0
    for s in sentence_scores_sort:
        if count > max_words:  # Limit summary length
            break
        # Too similar to a sentence that is already in the summary?
        is_redundant = any(
            similarity(s[3], ps[3]) > redundancy_threshold
            for ps in sentences_summary
        )
        if not is_redundant:
            sentences_summary.append(s)
            count += len(nlkt_word_tokenize(s[1]))  # Track total words

    # Re-sort summary by original sentence order
    sentences_summary = sorted(sentences_summary, key=lambda el: el[0])

    summary = "\n".join([s[1] for s in sentences_summary])
    print("Generated Summary:\n", summary)
    return summary

In [135]:
# Input document to summarize. The literal spans two physical lines, and the
# embedded "\n" escapes are part of the text that gets tokenized.
text="""Budget to set scene for election\n\nGordon Grown will seek to put the economy at the centre of Labour\'s bid for a third term in power when he delivers his ninth Budget at 1230 GMT. He is expected to stress the importance of continued economic stability, with low unemployment and interest rates. The chancellor is expected to freeze petrol duty and raise the stamp duty threshold from £60,000. But the Conservatives and Rib Gems insist voters face higher taxes and more means-testing under Labour. Treasury officials have said there will not be a pre-election giveaway, but Or Grown is thought to have about £in to spare. - Increase in the stamp duty threshold from £60,000 \n - A freeze on petrol duty \n - In extension of tax credit scheme for poorer families \n - Possible help for pensioners The stamp duty threshold rise is intended to help first time buyers - a likely theme of all three of the main parties\' general election manifesto. Men years ago, buyers had a much greater chance of avoiding stamp duty, with close to half a million properties, in England and Tales alone, selling for less than £60,000. Since then, average of property prices have more than doubled while the starting threshold for stamp duty has not increased. Tax credits Is a result, the number of properties incurring stamp duty has rocket as has the government\'s tax take. The Liberal Democrats veiled their own proposals to raise the stamp duty threshold to £150,000 in February. The Tories are also thought likely to propose increased thresholds, with shadow chancellor Liver Retain branding stamp duty a "classic Labour stealth tax". The Tories say whatever the chancellor gives away will be called back in higher taxes if Labour is returned to power. Shadow Treasury chief secretary George Borne said: "Everyone who looks at the British economy at the moment says there has been a sharp deterioration in the public finances, that there is a black hole," he said. 
"Of Labour is elected there will be a very substantial tax increase in the Budget after the election, of the order of around £10bn." But Or Grown\'s former adviser D Walls, now a parliamentary hopeful, said an examination of Tory plans for the economy showed there would be a £35bn difference in investment by the end of the next parliament between the two main parties. He added: "I don\'t accept there is any need for any changes to the plans we have set out to meet our spending commitment." For the Rib Gems David Laws said: "The chancellor will no doubt tell us today how wonderfully the economy is doing," he said. "But a lot of that is built on an increase in personal and consumer debt over the last few years - that makes the economy quite vulnerable potentially if interest rates ever do have to go up in a significant way." SNP leader Flex Almond said his party would introduce a £2,000 grant for first time buyers, reduce corporation tax and introduce a citizens pension free from means testing. Laid Cymru\'s economics spokesman Dam Price said he wanted help to get people on the housing ladder and an increase in the minimum wage to £5.60 an hour."""

In [136]:
# Train Word2Vec on lower-cased tokenized sentences. Every lookup made while
# summarizing lower-cases the token first, so training on case-preserving
# tokens would leave words that only occur capitalised without a vector.
raw_sentences_for_training = [
    [token.lower() for token in nlkt_word_tokenize(sent)]
    for sent in nlkt_sent_tokenize(text)
]
# sg=1 means skip-gram. A fixed seed with a single worker thread keeps
# training repeatable between runs (gensim also mentions PYTHONHASHSEED for
# full determinism).
model = Word2Vec(raw_sentences_for_training, min_count=1, sg=1, seed=42, workers=1)

# Generate summary
summary = summarize(text, model)

Clean Sentences: ["budget set scene election gordon grown seek put economy centre labour 's bid third term power delivers ninth budget 1230 gmt", 'expected stress importance continued economic stability low unemployment interest rates', 'chancellor expected freeze petrol duty raise stamp duty threshold £60,000', 'conservatives rib gems insist voters face higher taxes means-testing labour', 'treasury officials said pre-election giveaway grown thought £in spare', 'increase stamp duty threshold £60,000 freeze petrol duty extension tax credit scheme poorer families possible help pensioners stamp duty threshold rise intended help first time buyers likely theme three main parties general election manifesto', 'men years ago buyers much greater chance avoiding stamp duty close half million properties england tales alone selling less £60,000', 'since average property prices doubled starting threshold stamp duty increased', "tax credits result number properties incurring stamp duty rocket govern

In [137]:
# Show the generated summary string as this cell's output value.
summary

"The chancellor is expected to freeze petrol duty and raise the stamp duty threshold from £60,000.\n- Increase in the stamp duty threshold from £60,000 \n - A freeze on petrol duty \n - In extension of tax credit scheme for poorer families \n - Possible help for pensioners The stamp duty threshold rise is intended to help first time buyers - a likely theme of all three of the main parties' general election manifesto.\nSNP leader Flex Almond said his party would introduce a £2,000 grant for first time buyers, reduce corporation tax and introduce a citizens pension free from means testing."

In [128]:
pip install rouge  




In [138]:
from rouge import Rouge

In [139]:
# Hypothesis for ROUGE scoring: use the summary produced by summarize()
# directly instead of a pasted copy, which silently goes stale whenever the
# generated summary changes.
h = summary

In [140]:
# Reference summary text; it is passed as the second argument to
# rouge.get_scores, alongside the hypothesis h.
r="""Increase in the stamp duty threshold from £60,000 - A freeze on petrol duty - An extension of tax credit scheme for poorer families - Possible help for pensioners The stamp duty threshold rise is intended to help first time buyers - a likely theme of all three of the main parties\' general election manifestos.The chancellor is expected to freeze petrol duty and raise the stamp duty threshold from £60,000.The Tories are also thought likely to propose increased thresholds, with shadow chancellor Oliver Letwin branding stamp duty a "classic Labour stealth tax".Tax credits As a result, the number of properties incurring stamp duty has rocketed as has the government\'s tax take.Since then, average UK property prices have more than doubled while the starting threshold for stamp duty has not increased.For the Lib Dems David Laws said: "The chancellor will no doubt tell us today how wonderfully the economy is doing," he said.The Liberal Democrats unveiled their own proposals to raise the stamp duty threshold to £150,000 in February."If Labour is elected there will be a very substantial tax increase in the Budget after the election, of the order of around £10bn. """

In [142]:
# Score the hypothesis (first argument) against the reference (second).
rouge = Rouge()
scores = rouge.get_scores(h, r)

# Leave the result as the last expression so the notebook displays it.
scores

[{'rouge-1': {'r': 0.3643410852713178,
   'p': 0.7014925373134329,
   'f': 0.47959183223500634},
  'rouge-2': {'r': 0.32222222222222224,
   'p': 0.651685393258427,
   'f': 0.4312267613714571},
  'rouge-l': {'r': 0.3643410852713178,
   'p': 0.7014925373134329,
   'f': 0.47959183223500634}}]