In [11]:
import nltk 
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords  
from nltk.stem import WordNetLemmatizer  
import numpy as np 
from langchain.embeddings import HuggingFaceEmbeddings 
from sklearn.metrics.pairwise import cosine_similarity  
import re

nltk.download('punkt')  
nltk.download('stopwords') 
nltk.download('wordnet') 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shivani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shivani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shivani\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
def preprocess_text(text):
    sentences = sent_tokenize(text)
    cleaned_sentences = []
    
    stop_words = set(stopwords.words('english'))
    important_words = {"is", "as", "it", "are", "has", "was", "were", "be", "in", "to", "the", "a", "an", "that", "which", "by", "from", "for"}
    filtered_stop_words = stop_words - important_words 
    
    lemmatizer = WordNetLemmatizer() 
    
    for sentence in sentences:
        words = word_tokenize(clean_sentence)
        words = [lemmatizer.lemmatize(word) if word.lower() not in important_words else word for word in words]
        filtered_words = [word for word in words if word.lower() not in filtered_stop_words]
        cleaned_sentences.append(' '.join(filtered_words))
        
    return cleaned_sentences, sentences  

In [13]:
def rank_sentences(text, processed_sentences, original_sentences, model_name="sentence-transformers/all-mpnet-base-v2"):
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    text_embedding = embeddings.embed_query(text)  # Embed the entire text
    sentence_embeddings = embeddings.embed_documents(processed_sentences)  # Embed each processed sentence
    
    position_weights = [
        1.2 if i == 0 or i == len(processed_sentences) - 1 else  
        1.1 if i == 1 or i == len(processed_sentences) - 2 else  
        1.0 for i in range(len(processed_sentences))  
    ]
    
    sentence_scores = []  
    for i, sentence_embedding in enumerate(sentence_embeddings):
        similarity = np.dot(text_embedding, sentence_embedding) / (np.linalg.norm(text_embedding) * np.linalg.norm(sentence_embedding))
        weighted_score = similarity * position_weights[i]
        sentence_scores.append(weighted_score) 
    
    ranked_indices = sorted(range(len(sentence_scores)), key=lambda i: sentence_scores[i], reverse=True)
    ranked_sentences = [original_sentences[i] for i in ranked_indices]  
    
    return ranked_sentences  


In [14]:
def retrieve_top_sentences(text, top_k=5, ensure_diversity=True, diversity_threshold=0.7):
    processed_sentences, original_sentences = preprocess_text(text) 
    
    if top_k > len(processed_sentences):
        top_k = len(processed_sentences)  
    
    ranked_sentences = rank_sentences(text, processed_sentences, original_sentences)  
    
    if ensure_diversity:
        summary_sentences = []
        used_embeddings = []
        embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        
        for sentence in ranked_sentences:
            sentence_embedding = embeddings.embed_query(sentence)  
    
            is_redundant = any(np.dot(sentence_embedding, used_embedding) / (np.linalg.norm(sentence_embedding) * np.linalg.norm(used_embedding)) > diversity_threshold for used_embedding in used_embeddings)
            
            if not is_redundant:
                summary_sentences.append(sentence)  
                used_embeddings.append(sentence_embedding) 
                if len(summary_sentences) >= top_k:
                    break  
    else:
        summary_sentences = ranked_sentences[:top_k]  
    
    return summary_sentences 

In [18]:
def generate_summary(text, summary_length=5, ensure_diversity=True, diversity_threshold=0.7):
    if summary_length <= 0:
        raise ValueError("summary_length must be a positive integer.")
    
    if not text.strip():
        return ""  
    
    summary_sentences = retrieve_top_sentences(text, top_k=summary_length, ensure_diversity=ensure_diversity, diversity_threshold=diversity_threshold)  
    
    original_sentences = sent_tokenize(text)  
    ordered_summary = []
    
    for orig_sent in original_sentences:
        clean_orig = re.sub(r'[^a-zA-Z\s]', '', orig_sent)  
        if any(re.sub(r'[^a-zA-Z\s]', '', sum_sent) == clean_orig for sum_sent in summary_sentences):
            ordered_summary.append(orig_sent)  
    
    if not ordered_summary:
        ordered_summary = summary_sentences 
    summary = " ".join(ordered_summary) 
    
    return summary  

In [19]:
def calculate_accuracy(generated_summary, reference_summary, model_name="sentence-transformers/all-mpnet-base-v2"):
    embeddings = HuggingFaceEmbeddings(model_name=model_name)  
    generated_embedding = embeddings.embed_query(generated_summary) 
    reference_embedding = embeddings.embed_query(reference_summary) 
    
    similarity = cosine_similarity([generated_embedding], [reference_embedding])  
    
    return similarity[0][0]  

In [20]:
text = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. Colloquially, the term "artificial intelligence" is often used to describe machines that mimic cognitive functions that humans associate with the human mind, such as learning and problem-solving. As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI, a phenomenon known as the AI effect. A quip in Tesler's Theorem says "AI is whatever hasn't been done yet." For instance, optical character recognition is frequently excluded from things considered to be AI, having become a routine technology.
"""

summary_length = 5  
summary = generate_summary(text, summary_length=summary_length, ensure_diversity=True, diversity_threshold=0.7)
print("Summary:\n", summary)

reference_summary = "AI demonstrates intelligence by machines, contrasting human intelligence. It is defined as the study of intelligent agents that maximize their chances of achieving goals. The term is often used to describe machines mimicking cognitive functions like learning and problem-solving. As machines improve, tasks requiring intelligence are often redefined, known as the AI effect. Optical character recognition is an example of a technology that has become routine and is no longer considered AI."
accuracy = calculate_accuracy(summary, reference_summary)
print("Accuracy (Cosine Similarity):", accuracy)

NameError: name 'clean_sentence' is not defined