In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sentence_transformers import SentenceTransformer, util
import numpy as np
import torch.nn.functional as F

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shivani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shivani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shivani\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
MODEL_PATH = "saved_model/"
try:
    model = SentenceTransformer(MODEL_PATH)
    print("Loaded model from saved path.")
except:
    model = SentenceTransformer("all-mpnet-base-v2")
    model.save(MODEL_PATH)
    print("Downloaded and saved model.")

Loaded model from saved path.


In [15]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text.lower())
    processed_words = [word for word in words if word.isalnum() and word not in stop_words]
    return " ".join(processed_words)

In [16]:
from sentence_transformers import SentenceTransformer

def rank_sentences(text, sentences):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    text_embedding = model.encode(text)
    sentence_scores = {}
    for sentence in sentences:
        sentence_embedding = model.encode(sentence)
        similarity = cosine_similarity([text_embedding], [sentence_embedding])[0][0]
        sentence_scores[sentence] = similarity
    ranked_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    return ranked_sentences

In [18]:
def mmr(doc_embedding, sentence_embeddings, sentences, diversity=0.5):
    selected_indices = []
    remaining_indices = list(range(len(sentences)))
    while len(selected_indices) < len(sentences):
        best_score = -float('inf')
        best_idx = None
        for idx in remaining_indices:
            similarity_to_doc = cosine_similarity([doc_embedding], [sentence_embeddings[idx]])[0][0]
            if not selected_indices:
                score = similarity_to_doc
            else:
                similarities_to_selected = cosine_similarity([sentence_embeddings[idx]], 
                                                            [sentence_embeddings[i] for i in selected_indices])
                max_similarity = max(similarities_to_selected)[0]
                score = diversity * similarity_to_doc - (1 - diversity) * max_similarity
            if score > best_score:
                best_score = score
                best_idx = idx
        selected_indices.append(best_idx)
        remaining_indices.remove(best_idx)
    return [sentences[i] for i in selected_indices]

In [19]:
from transformers import BartForConditionalGeneration, BartTokenizer

def abstractive_summary(text, max_length=100):
    """
    Generate an abstractive summary using BART.
    """
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [22]:
text = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals. Colloquially, the term "artificial intelligence" is often used to describe machines that mimic cognitive functions that humans associate with the human mind, such as learning and problem-solving. As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI, a phenomenon known as the AI effect. A quip in Tesler's Theorem says "AI is whatever hasn't been done yet." For instance, optical character recognition is frequently excluded from things considered to be AI, having become a routine technology.
"""

reference_summary = "AI demonstrates intelligence by machines, contrasting human intelligence. It is defined as the study of intelligent agents that maximize their chances of achieving goals. The term is often used to describe machines mimicking cognitive functions like learning and problem-solving. As machines improve, tasks requiring intelligence are often redefined, known as the AI effect. Optical character recognition is an example of a technology that has become routine and is no longer considered AI."


In [23]:
summary_length = 3
summary = abstractive_summary(text)
print("\nGenerated Summary:\n", summary)


Generated Summary:
 Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals. Leading AI textbooks define the field as the study of "intelligent agents" As machines become increasingly capable, tasks considered to require "intelligence" are often removed from the definition of AI.


In [24]:
from rouge_score import rouge_scorer

def evaluate_summary(reference_summary, generated_summary):
    """
    Evaluate the generated summary using ROUGE scores.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_summary, generated_summary)
    return scores

reference_summary = "AI demonstrates intelligence by machines, contrasting human intelligence..."
generated_summary = summary

scores = evaluate_summary(reference_summary, generated_summary)

print("ROUGE Scores:")
for metric, score in scores.items():
    print(f"{metric}: Precision={score.precision:.4f}, Recall={score.recall:.4f}, F-measure={score.fmeasure:.4f}")

ROUGE Scores:
rouge1: Precision=0.1633, Recall=1.0000, F-measure=0.2807
rouge2: Precision=0.0208, Recall=0.1429, F-measure=0.0364
rougeL: Precision=0.1429, Recall=0.8750, F-measure=0.2456
