In [None]:
!pip install rouge_score

In [None]:
!python -m spacy download en_core_web_md

**Text Summarizer with spaCy word-vector sentence embeddings + KMeans Clustering, with Visualization and Evaluation Metrics (ROUGE)**

In [None]:
import re
import numpy as np
import spacy
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from rouge_score import rouge_scorer


In [None]:
#Preprocessing
custom_stop_words = {"the", "in", "on", "of", "and", "a", "an"}

def selective_lowercase(text):
    """Lowercase ordinary words while leaving acronyms and numeric tokens as written.

    The text is split on whitespace. For each token, leading and trailing
    non-word characters are ignored when deciding whether to keep its case:
    a token is kept verbatim when its core is all-uppercase with more than one
    character (e.g. "WHO") or contains a digit (e.g. "1974."). Every other
    token is lowercased. Tokens are re-joined with single spaces.
    """
    def _keeps_case(word):
        # Judge the word without surrounding punctuation, e.g. "(UK)" -> "UK".
        stripped = re.sub(r'^\W+|\W+$', '', word)
        looks_like_acronym = len(stripped) > 1 and stripped.isupper()
        return bool(looks_like_acronym or re.search(r'\d', stripped))

    return " ".join(
        word if _keeps_case(word) else word.lower()
        for word in text.split()
    )

def selective_stopword_removal(text):
    """Remove tokens whose lowercase form is in ``custom_stop_words``.

    Matching is done on whole whitespace-separated tokens, so a stop word with
    punctuation attached (e.g. "the,") does not match and is kept. The surviving
    tokens are re-joined with single spaces.
    """
    kept_words = []
    for word in text.split():
        if word.lower() in custom_stop_words:
            continue
        kept_words.append(word)
    return " ".join(kept_words)

def preprocess_sentence(sentence):
    """Normalize a sentence: selective lowercasing first, then stop-word removal."""
    return selective_stopword_removal(selective_lowercase(sentence))


In [None]:
# --- Sample Sentences ---
sentences = [
    "The Expanded Programme on Immunisation was launched by the WHO in 1974.",
    "Smallpox was declared eradicated in 1980 after a successful global vaccination campaign.",
    "The climate crisis is spurring disease outbreaks in vulnerable communities.",
    "The UK is considering a significant cut to its support for global vaccine programs.",
    "Polio remains endemic in just a few countries, but progress is steady."
]



In [None]:
# Preprocess sentences
preprocessed_sentences = [preprocess_sentence(sent) for sent in sentences]

# Embedding Setup
nlp = spacy.load("en_core_web_md")

def get_sentence_embedding(sentence):
    """Embed a sentence as the mean vector of its non-stop-word tokens.

    The sentence is run through the module-level ``nlp`` pipeline. Tokens that
    are stop words or have no vector are skipped. If no token qualifies, the
    document-level ``doc.vector`` is returned instead.
    """
    parsed = nlp(sentence)

    content_vectors = []
    for tok in parsed:
        if tok.has_vector and not tok.is_stop:
            content_vectors.append(tok.vector)

    if not content_vectors:
        return parsed.vector
    return np.mean(content_vectors, axis=0)

# Get sentence embeddings
sentence_embeddings = np.array([get_sentence_embedding(sent) for sent in preprocessed_sentences])

In [None]:
# --- KMeans Summarization ---
def kmeans_summarization(original_sentences, embeddings, n_clusters=3):
    """Summarize by clustering sentence embeddings and keeping one sentence per cluster.

    The embeddings are clustered with KMeans. For every cluster, the sentence
    whose embedding lies closest (Euclidean distance) to the cluster center is
    selected. Selected sentences are emitted in their original order.

    Args:
        original_sentences: Sentences to quote in the summary, index-aligned
            with ``embeddings``.
        embeddings: One embedding vector per sentence, indexable by sentence
            position.
        n_clusters: Requested number of clusters, i.e. the maximum number of
            summary sentences. Values outside ``1..len(embeddings)`` are
            clamped into that range.

    Returns:
        A ``(summary, cluster_labels)`` tuple: the selected sentences joined by
        single spaces, and the per-sentence cluster labels from KMeans. For
        empty input the summary is ``""`` and the labels are an empty integer
        array.
    """
    n_sentences = len(embeddings)
    if n_sentences == 0:
        # Nothing to cluster; keep the (str, labels) return shape.
        return "", np.array([], dtype=int)

    # There cannot be more clusters than sentences, and at least one is needed.
    n_clusters = max(1, min(n_clusters, n_sentences))

    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    kmeans.fit(embeddings)
    cluster_centers = kmeans.cluster_centers_
    cluster_labels = kmeans.labels_

    selected_sentences = []
    for i in range(n_clusters):
        cluster_indices = [j for j, label in enumerate(cluster_labels) if label == i]
        if not cluster_indices:
            # Guard: min() below must never see an empty cluster.
            continue
        closest_idx = min(
            cluster_indices,
            key=lambda idx: np.linalg.norm(embeddings[idx] - cluster_centers[i]),
        )
        selected_sentences.append((closest_idx, original_sentences[closest_idx]))

    # Restore document order so the summary reads naturally.
    selected_sentences.sort(key=lambda x: x[0])
    return " ".join(sent for _, sent in selected_sentences), cluster_labels

# Generate summary and cluster labels
generated_summary, cluster_labels = kmeans_summarization(sentences, sentence_embeddings, n_clusters=3)



In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# --- t-SNE Visualization ---
def visualize_clusters(embeddings, cluster_labels, sentences):
    """Show a 2-D t-SNE scatter plot of the embeddings and print cluster assignments.

    Each point is colored by its cluster label and annotated with the 1-based
    cluster number. After the figure is shown, one line per sentence is printed
    with its 1-based cluster number.

    Args:
        embeddings: One embedding vector per sentence.
        cluster_labels: Cluster label for each embedding, index-aligned.
        sentences: Sentence texts, index-aligned with ``cluster_labels``.
    """
    sample_count = len(embeddings)
    # Cap perplexity so it stays below the number of samples on tiny inputs.
    tsne_perplexity = min(5, sample_count - 1)

    projector = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=42)
    points_2d = projector.fit_transform(embeddings)

    plt.figure(figsize=(10, 6))
    markers = plt.scatter(
        points_2d[:, 0],
        points_2d[:, 1],
        c=cluster_labels,
        cmap='viridis',
        s=150,               # large markers
        edgecolors='black',  # dark outline
    )
    plt.colorbar(markers, label="Cluster ID")

    # Tag every point with its 1-based cluster number, nudged off the marker.
    for row, (x_coord, y_coord) in enumerate(points_2d):
        plt.annotate(
            str(cluster_labels[row] + 1),
            (x_coord, y_coord),
            fontsize=14,
            weight="bold",
            color="black",
            backgroundcolor="white",
            xytext=(5, 5),
            textcoords="offset points",
        )

    plt.title("Sentence Clustering Visualization (t-SNE + KMeans)")
    plt.xlabel("t-SNE Dim 1")
    plt.ylabel("t-SNE Dim 2")
    plt.show()

    # Text listing of the same assignments.
    print("\nCluster Assignments:")
    for i, sentence in enumerate(sentences):
        print(f"Cluster {cluster_labels[i] + 1}: {sentence}")

# Show visualization
visualize_clusters(sentence_embeddings, cluster_labels, sentences)


In [None]:
# --- ROUGE Comparison ---
provided_summary = (
    "Global immunization initiatives have achieved notable milestones, including the WHO’s launch "
    "of the Expanded Programme on Immunisation in 1974 and the eradication of smallpox in 1980. "
    "However, emerging challenges—such as disease outbreaks fueled by the climate crisis and potential funding cuts, "
    "as seen in the UK—highlight ongoing vulnerabilities. Meanwhile, steady progress continues against polio, even though "
    "it remains endemic in a few countries."
)

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(provided_summary, generated_summary)

# --- Display Results ---
print("Generated Summary:")
print(generated_summary)
print("\nROUGE Scores:")
for key, value in scores.items():
    print(f"{key}: {value}")

**Multiple Document Summarization using a hybrid approach: LSA + TextRank (for extractive summary generation) + Evaluation Metrics (ROUGE)**

In [None]:
import numpy as np
import networkx as nx
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import sent_tokenize
from rouge_score import rouge_scorer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')

In [None]:
src_1 = [
    "The Expanded Programme on Immunisation was launched by the WHO in 1974.",
    "Smallpox was declared eradicated in 1980 after a successful global vaccination campaign.",
    "The climate crisis is spurring disease outbreaks in vulnerable communities.",
    "The UK is considering a significant cut to its support for global vaccine programs.",
    "Polio remains endemic in just a few countries, but progress is steady."
]

src_2 = [
    "Vaccination efforts have drastically reduced child mortality rates worldwide.",
    "WHO's immunization programs have saved millions of lives over the past decades.",
    "Emerging diseases demand faster vaccine development and equitable distribution.",
    "Public-private partnerships have played a crucial role in making vaccines accessible.",
    "Investment in vaccine infrastructure is key to preventing future pandemics."
]

src_3 = [
    "Low-income countries rely heavily on international vaccine initiatives for immunization.",
    "The cost of vaccine development has been offset by global funding strategies.",
    "Gavi has supported immunization programs in over 70 countries, improving access.",
    "Political instability affects vaccine distribution in conflict zones.",
    "Researchers are working on next-generation vaccines to combat evolving viruses."
]


In [None]:
# Manually written base summary
base_summary = """The WHO’s Expanded Programme on Immunisation, launched in 1974, has led to significant milestones,
including the eradication of smallpox in 1980. However, new challenges arise as the climate crisis accelerates disease
outbreaks and global vaccine funding faces potential cuts. Despite progress in reducing child mortality and combating
diseases like polio, continued efforts are crucial. Public-private partnerships and global initiatives like Gavi play
a key role in making vaccines accessible, especially in low-income nations. Meanwhile, political instability and evolving
viruses threaten distribution, emphasizing the need for next-generation vaccines and stronger infrastructure investments
to prevent future pandemics."""

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def lsa_summarization(sentences, num_sentences=3):
    """Build an extractive summary with Latent Semantic Analysis (LSA).

    Sentences are vectorized with TF-IDF and projected onto the leading
    component of a truncated SVD. The sentences with the largest projection
    are returned, highest-scoring first.

    Args:
        sentences: Sequence of sentence strings.
        num_sentences: Maximum number of sentences to include.

    Returns:
        The selected sentences joined by single spaces. For empty input the
        message ``"No summary: Input text is empty."`` is returned. A single
        input sentence is returned as-is.
    """
    if len(sentences) == 0:
        return "No summary: Input text is empty."

    top_n = min(num_sentences, len(sentences))

    # The SVD needs more samples than components, so one sentence cannot be
    # decomposed. It is already its own best summary.
    if len(sentences) == 1:
        return " ".join(list(sentences)[:top_n])

    tfidf = TfidfVectorizer().fit_transform(sentences)

    # Only the leading component is used for ranking, so one component is enough.
    # random_state makes the SVD (and therefore the summary) reproducible across
    # runs, matching the seeded KMeans/TSNE used elsewhere in this notebook.
    svd = TruncatedSVD(n_components=1, random_state=42)
    sentence_topic = svd.fit_transform(tfidf)

    # Sentence indices ordered by descending weight on the leading component.
    ranking = np.argsort(-sentence_topic[:, 0])

    return " ".join(sentences[i] for i in ranking[:top_n])


In [None]:
lsa_summary_1 = lsa_summarization(src_1)
lsa_summary_2 = lsa_summarization(src_2)
lsa_summary_3 = lsa_summarization(src_3)

print("LSA Summary for Source 1:\n", lsa_summary_1)
print("\nLSA Summary for Source 2:\n", lsa_summary_2)
print("\nLSA Summary for Source 3:\n", lsa_summary_3)


In [None]:
!pip install summa

In [None]:
from summa import summarizer

# Function to perform TextRank summarization
def textrank_summarization(sentences, num_sentences=3):
    """Summarize a list of sentences with summa's TextRank summarizer.

    Args:
        sentences: Sequence of sentence strings; they are joined into one
            paragraph before summarizing.
        num_sentences: Target number of sentences, converted to the ``ratio``
            that ``summarizer.summarize`` expects.

    Returns:
        The summary text produced by ``summarizer.summarize``, or
        ``"No summary: No input sentences."`` when ``sentences`` is empty.
    """
    if len(sentences) == 0:
        # Avoids the division by zero below.
        return "No summary: No input sentences."

    text = " ".join(sentences)  # Convert list of sentences to a paragraph
    # Cap at 1.0: asking for more sentences than exist means "the whole text".
    ratio = min(1.0, num_sentences / len(sentences))
    return summarizer.summarize(text, ratio=ratio)

# Generate TextRank summaries
textrank_summary_1 = textrank_summarization(src_1)
textrank_summary_2 = textrank_summarization(src_2)
textrank_summary_3 = textrank_summarization(src_3)

# Print the summaries
print("TextRank Summary for Source 1:\n", textrank_summary_1, "\n")
print("TextRank Summary for Source 2:\n", textrank_summary_2, "\n")
print("TextRank Summary for Source 3:\n", textrank_summary_3, "\n")


In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import networkx as nx

# LSA summarization, redefined here to return a list of sentences (this shadows the earlier string-returning version)
def lsa_summarization(sentences, num_sentences=3):
    """Rank sentences with LSA and return the top ones as a list.

    Sentences are vectorized with TF-IDF and projected onto the leading
    component of a truncated SVD. The result is meant to be passed on to
    ``textrank_summarization``, so every path returns a list of sentences,
    never a message string.

    Args:
        sentences: Sequence of sentence strings.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        A list with at most ``num_sentences`` sentences, highest-scoring first.
        Empty input yields ``[]``. A single sentence is returned unchanged,
        because there is nothing to rank.
    """
    sentences = list(sentences)
    top_n = min(num_sentences, len(sentences))

    # Zero or one sentence cannot be decomposed (SVD needs more samples than
    # components). Hand back what we have so downstream code still gets a list.
    if len(sentences) < 2:
        return sentences[:top_n]

    tfidf = TfidfVectorizer().fit_transform(sentences)

    # One component is enough: only the leading one is used for ranking.
    # random_state keeps the selection reproducible between runs.
    svd = TruncatedSVD(n_components=1, random_state=42)
    sentence_topic = svd.fit_transform(tfidf)

    # Sentence indices ordered by descending weight on the leading component.
    ranking = np.argsort(-sentence_topic[:, 0])

    return [sentences[i] for i in ranking[:top_n]]  # list for the next processing step

# TextRank Summarization applied on LSA-filtered sentences
def textrank_summarization(sentences, num_sentences=3):
    """Rank sentences with PageRank over a TF-IDF similarity graph.

    A graph is built from the pairwise TF-IDF product matrix of the sentences
    and scored with ``nx.pagerank``. The ``num_sentences`` highest-scoring
    sentences are joined with single spaces, best first.

    Returns ``"No summary: No input sentences."`` when ``sentences`` is empty.
    """
    if not sentences:
        return "No summary: No input sentences."

    tfidf = TfidfVectorizer().fit_transform(sentences)
    pairwise_similarity = (tfidf * tfidf.T).toarray()

    # Nodes are sentence positions; edge weights are the similarity values.
    sentence_graph = nx.from_numpy_array(pairwise_similarity)
    rank_by_node = nx.pagerank(sentence_graph)

    # Sort (score, sentence) pairs from best to worst.
    scored = [(rank_by_node[position], sentence) for position, sentence in enumerate(sentences)]
    scored.sort(reverse=True)

    return " ".join(sentence for _, sentence in scored[:num_sentences])

# Apply LSA summarization
lsa_filtered_1 = lsa_summarization(src_1)
lsa_filtered_2 = lsa_summarization(src_2)
lsa_filtered_3 = lsa_summarization(src_3)

# The output of lsa_summarization is now a list, so no need to split
lsa_sentences_1 = lsa_filtered_1
lsa_sentences_2 = lsa_filtered_2
lsa_sentences_3 = lsa_filtered_3

# Apply TextRank summarization
hybrid_summary_1 = textrank_summarization(lsa_sentences_1)
hybrid_summary_2 = textrank_summarization(lsa_sentences_2)
hybrid_summary_3 = textrank_summarization(lsa_sentences_3)

# Print final hybrid summaries
print("Hybrid Summary for Source 1:\n", hybrid_summary_1)
print("\nHybrid Summary for Source 2:\n", hybrid_summary_2)
print("\nHybrid Summary for Source 3:\n", hybrid_summary_3)






In [None]:
!pip install rouge


In [None]:
from rouge import Rouge

def evaluate_rouge(reference_summary, generated_summary):
    """Score ``generated_summary`` against ``reference_summary`` with ``Rouge``.

    Note the argument order handed to ``get_scores``: the generated text goes
    first and the reference second. The scores are returned exactly as
    ``get_scores`` produces them.
    """
    return Rouge().get_scores(generated_summary, reference_summary)

# Manually Written Base Summary (Reference)
base_summary = """The WHO’s Expanded Programme on Immunisation, launched in 1974, has led to significant milestones,
including the eradication of smallpox in 1980. However, new challenges arise as the climate crisis accelerates disease
outbreaks and global vaccine funding faces potential cuts. Despite progress in reducing child mortality and combating
diseases like polio, continued efforts are crucial. Public-private partnerships and global initiatives like Gavi play
a key role in making vaccines accessible, especially in low-income nations. Meanwhile, political instability and evolving
viruses threaten distribution, emphasizing the need for next-generation vaccines and stronger infrastructure investments
to prevent future pandemics."""

# Hybrid Summaries (Generated)
# Evaluate the summaries the hybrid LSA + TextRank cell actually produced, not a
# pasted copy of an earlier run, so the ROUGE scores below always describe the
# current pipeline output.
hybrid_summary = " ".join([hybrid_summary_1, hybrid_summary_2, hybrid_summary_3])

# Compute ROUGE Scores
rouge_scores = evaluate_rouge(base_summary, hybrid_summary)

# Print Results
print("ROUGE Scores:\n", rouge_scores)


**Multiple Types of Documents (here, an article and the image captions found in it): Text Summarization using the LSA + TextRank Approach**

In [None]:
import numpy as np
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import nltk

nltk.download('punkt')

In [None]:
# Input: Article + Image Captions
documents = [
    "Low-income countries rely heavily on international vaccine initiatives for immunization.",
    "The cost of vaccine development has been offset by global funding strategies.",
    "Gavi has supported immunization programs in over 70 countries, improving access.",
    "Political instability affects vaccine distribution in conflict zones.",
    "Researchers are working on next-generation vaccines to combat evolving viruses.",

    # Image Captions
    "A health worker administers a vaccine to a child in a rural clinic, illustrating Gavi's efforts to improve immunization access in low-income countries.",
    "A mother holds her child while receiving a vaccine in a conflict-affected region, highlighting the challenges of vaccine distribution in such areas.",
    "Researchers in a laboratory developing next-generation vaccines to combat evolving viruses."
]

In [None]:

sentences = nltk.sent_tokenize(" ".join(documents))

In [None]:
# Function to perform LSA + TextRank Summarization
def lsa_textrank_summarization(sentences, num_sentences=3):
    """Pick summary sentences with LSA, topping up with TextRank if LSA falls short.

    Sentences are vectorized with TF-IDF and projected onto the leading
    component of a truncated SVD. The sentences with the largest projection are
    selected. If that yields fewer than ``num_sentences`` sentences, the rest
    are taken from a PageRank ranking over the TF-IDF similarity graph.

    Args:
        sentences: List of sentence strings.
        num_sentences: Maximum number of sentences in the summary; capped at
            ``len(sentences)``.

    Returns:
        The selected sentences joined by single spaces, highest-ranked first.
        Empty input returns ``"No summary: Input text is empty."``. A
        non-positive ``num_sentences`` returns ``""``.
    """
    if not sentences:
        return "No summary: Input text is empty."

    num_sentences = min(num_sentences, len(sentences))  # Prevent IndexError
    if num_sentences <= 0:
        return ""

    # A single sentence leaves nothing to rank.
    if len(sentences) == 1:
        return sentences[0]

    # Convert sentences into TF-IDF features (rows = sentences, columns = terms)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)

    # random_state keeps the decomposition, and hence the summary, reproducible.
    svd = TruncatedSVD(n_components=min(2, X.shape[1]), n_iter=100, random_state=42)

    # Rank *sentences*: fit_transform returns one row per sentence, whereas
    # svd.components_ holds one column per term and so cannot index sentences.
    sentence_topic = svd.fit_transform(X)
    ranking = np.argsort(-sentence_topic[:, 0])
    top_sentence_indices = [int(i) for i in ranking[:num_sentences]]
    selected_sentences = [sentences[i] for i in top_sentence_indices]

    # Safety net: if LSA supplied fewer sentences than requested, fill the gap
    # with the best TextRank sentences that are not already selected.
    if len(selected_sentences) < num_sentences:
        similarity_matrix = (X * X.T).toarray()
        graph = nx.Graph()
        graph.add_nodes_from(range(len(sentences)))
        # Edges must be added explicitly: nx.set_edge_attributes only annotates
        # edges that already exist.
        graph.add_weighted_edges_from(
            (i, j, similarity_matrix[i, j])
            for i in range(len(sentences))
            for j in range(i + 1, len(sentences))
            if similarity_matrix[i, j] > 0
        )
        scores = nx.pagerank(graph)

        already_selected = set(top_sentence_indices)
        candidates = [
            i for i in sorted(scores, key=scores.get, reverse=True)
            if i not in already_selected
        ]
        missing = num_sentences - len(selected_sentences)
        selected_sentences.extend(sentences[i] for i in candidates[:missing])

    return " ".join(selected_sentences)


In [None]:
# Generate summary
summary = lsa_textrank_summarization(sentences, num_sentences=4)

# Print final summary
print("\nFinal Multi-Document Summary:\n", summary)

In [None]:
from rouge import Rouge

# Human-written reference summary (manually crafted)
reference_summary = """Low-income countries rely on international vaccine initiatives for immunization.
Gavi has played a major role in improving vaccine accessibility in over 70 countries.
Political instability challenges vaccine distribution in conflict zones.
Researchers are developing next-generation vaccines to combat evolving viruses."""

# Compute ROUGE Scores
def evaluate_summary(reference_summary, generated_summary):
    """Compute ROUGE scores for ``generated_summary`` against ``reference_summary``.

    The generated text is passed to ``Rouge.get_scores`` first and the
    reference second. The result of that call is returned unchanged.
    """
    return Rouge().get_scores(generated_summary, reference_summary)

# Evaluate the LSA + TextRank summary
rouge_scores = evaluate_summary(reference_summary, summary)

# Print Results
print("\nROUGE Scores:")
for metric, score in rouge_scores[0].items():
    print(f"{metric}: Precision={score['p']:.4f}, Recall={score['r']:.4f}, F1-score={score['f']:.4f}")


**Fine-Tuning Word and Sentence Embedding Models**


In [None]:
!pip install -q transformers sentence-transformers


In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import nltk
from nltk import sent_tokenize
from transformers import BartTokenizer, BartForConditionalGeneration
from rouge_score import rouge_scorer


In [None]:
nltk.download('punkt')

In [None]:
!pip install rouge_score

In [None]:
sample_article = '''
Parsing is the process of analyzing a sentence's grammatical structure to determine relationships between words. In NLP, parsing can be classified into syntactic parsing, which focuses on sentence structure, and semantic parsing, which extracts meaning. Constituency parsing breaks a sentence into hierarchical phrases, while dependency parsing maps direct word dependencies. Parsing is crucial in machine translation, question answering, and information extraction to ensure accurate text processing. One major challenge in parsing is handling ambiguous sentence structures, which require advanced models for disambiguation. Stemming is a process in NLP that removes prefixes and suffixes to reduce words to their root form. Common stemming algorithms, such as the Porter Stemmer and Snowball Stemmer, help standardize words like "running", "runner", and "runs" into "run". Stemming is widely used in search engines, text mining, and information retrieval to match word variations efficiently. A limitation of stemming is that it may produce incorrect root words, such as stemming "happiness" to "happi", leading to reduced accuracy. Despite its limitations, stemming is preferred in high-speed NLP applications where precision is less critical than performance. Lemmatization is a text normalization technique that reduces words to their base dictionary form while maintaining meaning. Unlike stemming, which may produce invalid words, lemmatization ensures that words like "better" are correctly mapped to "good". Lemmatization uses morphological analysis and part-of-speech tagging to determine the correct lemma for each word. This technique improves accuracy in search engines, text summarization, and document classification. Since lemmatization relies on linguistic rules, it is computationally more expensive than stemming but yields better results in context-aware NLP applications. 
Tokenization is the process of splitting text into smaller units called tokens, which can be words, subwords, or characters. Word-level tokenization treats each word separately, while subword tokenization (e.g., Byte Pair Encoding) helps handle unknown words effectively. Tokenization is an essential preprocessing step in language modeling, machine translation, and speech recognition. Challenges in tokenization include handling punctuation, contractions, and compound words, which can affect downstream NLP tasks. Advanced tokenization techniques, such as WordPiece and SentencePiece, improve performance in modern transformer-based models. Part-of-Speech (POS) tagging is the process of assigning grammatical categories like nouns, verbs, adjectives, and adverbs to words in a sentence. POS tagging helps in syntactic parsing, named entity recognition (NER), and sentiment analysis by identifying word roles in a sentence. Machine learning-based POS taggers outperform rule-based methods by leveraging context and large annotated datasets. Ambiguity is a challenge in POS tagging, as words like "lead" can be either a verb or a noun depending on the context. Modern POS tagging models, such as those using recurrent neural networks (RNNs) and transformers, achieve high accuracy across different languages.
'''

reference_summary = '''
Parsing in NLP involves analyzing grammatical structure to identify relationships between words, playing a vital role in tasks like translation and information extraction. Stemming reduces words to their root forms using algorithms like Porter and Snowball, improving performance in tasks such as search and text mining despite potential accuracy trade-offs. Lemmatization, unlike stemming, maps words to their dictionary forms based on linguistic rules, offering better context-aware normalization for applications like summarization and classification. Tokenization splits text into words or subwords, enabling preprocessing for models while handling challenges like punctuation and compound words. Part-of-Speech tagging assigns grammatical roles to words using advanced models like RNNs and transformers, aiding in tasks such as parsing and sentiment analysis.
'''

In [None]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
sentences = sent_tokenize(sample_article)

#  Sentence Embedding using SBERT
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
sentence_embeddings = sbert_model.encode(sentences, convert_to_tensor=True)

#  Document embedding and cosine similarity for relevance
doc_embedding = sentence_embeddings.mean(dim=0)
cosine_scores = util.pytorch_cos_sim(doc_embedding, sentence_embeddings)[0]

#  Select top-k relevant sentences
top_k = 7
top_indices = np.argsort(-cosine_scores.cpu())[:top_k]
selected_sentences = [sentences[idx] for idx in sorted(top_indices)]
selected_text = " ".join(selected_sentences)

print(" Selected Relevant Sentences:\n")
print(selected_text)

In [None]:
#  Load BART model and tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

#  Tokenize input
inputs = tokenizer(selected_text, return_tensors='pt', max_length=1024, truncation=True)

#  Generate summary
summary_ids = model.generate(
    inputs['input_ids'],
    num_beams=4,
    max_length=
    150,
    min_length=60,
    length_penalty=2.0,
    early_stopping=True
)

#  Decode output
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("\n Generated Abstractive Summary:\n")
print(generated_summary)


In [None]:
#  ROUGE evaluation
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference_summary, generated_summary)

print("\n ROUGE Scores:")
for key, value in scores.items():
    print(f"{key}: Precision={value.precision:.2f}, Recall={value.recall:.2f}, F1={value.fmeasure:.2f}")


In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
import torch

# Load pretrained BART
model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Device config
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Concatenate the selected relevant sentences
input_text = " ".join(selected_sentences)

# Tokenize input
inputs = tokenizer([input_text], max_length=1024, return_tensors='pt', truncation=True)
inputs = {k: v.to(device) for k, v in inputs.items()}

#  Beam Search + High Temperature Generation
# Sampling is stochastic: seed torch's random number generator first so that
# re-running this cell reproduces the same summary (and the same ROUGE scores
# in the evaluation cell that follows).
torch.manual_seed(42)
summary_ids = model.generate(
    inputs["input_ids"],
    num_beams=5,              # beam search width
    length_penalty=1.0,
    max_length=150,
    min_length=40,
    early_stopping=True,
    do_sample=True,          # sampling ON
    top_k=50,                # Top-k sampling
    temperature=1.6,         #  high temperature = more randomness
)

# Decode summary
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(" Generated Abstractive Summary:\n", generated_summary)


In [None]:
#  ROUGE evaluation
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference_summary, generated_summary)

print("\n ROUGE Scores:")
for key, value in scores.items():
    print(f"{key}: Precision={value.precision:.2f}, Recall={value.recall:.2f}, F1={value.fmeasure:.2f}")


**FINE TUNING WITH CUSTOM DATA**


In [None]:
# Install dependencies
!pip install transformers datasets rouge_score --quiet

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset
from rouge_score import rouge_scorer
import torch

In [None]:
# Sample Pair 1: source article covering parsing, stemming, lemmatization,
# tokenization and POS tagging.
# The triple-quoted literal begins and ends with a newline that is part of the string.
sample_article_1 = """
Parsing is the process of analyzing a sentence's grammatical structure to determine relationships between words. In NLP, parsing can be classified into syntactic parsing, which focuses on sentence structure, and semantic parsing, which extracts meaning. Constituency parsing breaks a sentence into hierarchical phrases, while dependency parsing maps direct word dependencies. Parsing is crucial in machine translation, question answering, and information extraction to ensure accurate text processing. One major challenge in parsing is handling ambiguous sentence structures, which require advanced models for disambiguation. Stemming is a process in NLP that removes prefixes and suffixes to reduce words to their root form. Common stemming algorithms, such as the Porter Stemmer and Snowball Stemmer, help standardize words like "running", "runner", and "runs" into "run". Stemming is widely used in search engines, text mining, and information retrieval to match word variations efficiently. A limitation of stemming is that it may produce incorrect root words, such as stemming "happiness" to "happi", leading to reduced accuracy. Despite its limitations, stemming is preferred in high-speed NLP applications where precision is less critical than performance. Lemmatization is a text normalization technique that reduces words to their base dictionary form while maintaining meaning. Unlike stemming, which may produce invalid words, lemmatization ensures that words like "better" are correctly mapped to "good". Lemmatization uses morphological analysis and part-of-speech tagging to determine the correct lemma for each word. This technique improves accuracy in search engines, text summarization, and document classification. Since lemmatization relies on linguistic rules, it is computationally more expensive than stemming but yields better results in context-aware NLP applications. 
Tokenization is the process of splitting text into smaller units called tokens, which can be words, subwords, or characters. Word-level tokenization treats each word separately, while subword tokenization (e.g., Byte Pair Encoding) helps handle unknown words effectively. Tokenization is an essential preprocessing step in language modeling, machine translation, and speech recognition. Challenges in tokenization include handling punctuation, contractions, and compound words, which can affect downstream NLP tasks. Advanced tokenization techniques, such as WordPiece and SentencePiece, improve performance in modern transformer-based models. Part-of-Speech (POS) tagging is the process of assigning grammatical categories like nouns, verbs, adjectives, and adverbs to words in a sentence. POS tagging helps in syntactic parsing, named entity recognition (NER), and sentiment analysis by identifying word roles in a sentence. Machine learning-based POS taggers outperform rule-based methods by leveraging context and large annotated datasets. Ambiguity is a challenge in POS tagging, as words like "lead" can be either a verb or a noun depending on the context. Modern POS tagging models, such as those using recurrent neural networks (RNNs) and transformers, achieve high accuracy across different languages.
"""

# Human-written reference summary paired with sample_article_1.
reference_summary_1 = """
Parsing in NLP involves analyzing grammatical structure to identify relationships between words, playing a vital role in tasks like machine translation, question answering, and information extraction. Syntactic and semantic parsing break down sentence structure and meaning, while stemming and lemmatization normalize word forms for efficient search and text mining. Tokenization splits text into manageable units, and POS tagging assigns grammatical roles to support tasks like NER and sentiment analysis.
"""

# Sample Pair 2: a reworded, single-paragraph article on the same NLP
# preprocessing topics (parsing, stemming, lemmatization, tokenization, POS tagging).
sample_article_2 = """
In Natural Language Processing (NLP), parsing is a fundamental technique that dissects a sentence’s grammatical structure to reveal the relationships among words. It can be divided into syntactic parsing, which primarily addresses sentence structure, and semantic parsing, which focuses on extracting meaning. Constituency parsing organizes a sentence into nested phrases, whereas dependency parsing connects words via direct dependencies. Accurate parsing is essential for applications such as machine translation, question answering, and information extraction, although ambiguous structures require advanced disambiguation models. Stemming, which removes affixes to derive word roots, is implemented through algorithms like the Porter Stemmer and Snowball Stemmer and is vital in search and text mining. However, stemming can sometimes yield imprecise roots, for instance reducing "happiness" to "happi". To counteract this, lemmatization converts words to their base dictionary forms more accurately, though at a higher computational cost. Tokenization divides text into tokens—be they words, subwords, or characters—and is a crucial preprocessing step in language modeling and translation. Effective tokenization must manage punctuation, contractions, and compound words, with modern techniques such as WordPiece and SentencePiece addressing these challenges. Additionally, Part-of-Speech tagging assigns grammatical categories to words, thereby clarifying their roles in a sentence and enhancing tasks like sentiment analysis and NER through advanced models based on RNNs and transformers.
"""

# Reference summary paired with sample_article_2.
reference_summary_2 = """
NLP parsing involves dissecting sentence structure to reveal word relationships, a critical component for translation, question answering, and information extraction. This process, which includes both syntactic and semantic parsing, is complemented by stemming and lemmatization to normalize word forms. Tokenization and POS tagging further prepare text for advanced NLP tasks by breaking it into tokens and assigning grammatical roles.
"""

# Sample Pair 3: another paraphrase of the parsing / normalization /
# tokenization / POS-tagging material, used as an additional training example.
sample_article_3 = """
Parsing is a key process in NLP that involves systematically analyzing a sentence’s grammatical structure to determine the relationships between words. It encompasses both syntactic parsing, which examines the structural organization of sentences, and semantic parsing, which aims to extract the underlying meaning. Techniques such as constituency parsing divide a sentence into hierarchical components, while dependency parsing identifies direct relationships between words. This process is vital for applications including machine translation, question answering, and information extraction, though ambiguous sentence constructions remain challenging. In addition to parsing, NLP utilizes stemming to remove prefixes and suffixes and reduce words to their roots using algorithms like the Porter and Snowball stemmers. However, stemming may sometimes yield imprecise outputs, prompting the use of lemmatization, which converts words to their canonical forms using morphological analysis and part-of-speech tagging. Tokenization, which splits text into tokens (words, subwords, or characters), is another essential preprocessing step that supports tasks such as language modeling and speech recognition. Modern tokenization methods like Byte Pair Encoding, WordPiece, and SentencePiece are designed to handle challenges like punctuation and compound words. Finally, Part-of-Speech tagging assigns grammatical categories to words, thereby aiding in the extraction of key information for downstream NLP applications.
"""

# Reference summary paired with sample_article_3.
reference_summary_3 = """
Parsing in NLP involves systematically analyzing sentence structure to uncover word relationships, a process critical for translation, question answering, and information extraction. Syntactic and semantic parsing work in tandem, while stemming and lemmatization normalize word forms. Tokenization breaks text into tokens, and POS tagging assigns grammatical roles, together enabling effective language processing.
"""

# Sample Pair 4: a shorter article on the same topics (parsing, stemming vs.
# lemmatization, tokenization, POS tagging).
sample_article_4 = """
In NLP, parsing plays a crucial role in understanding language by analyzing sentence structures. There are two main forms: syntactic parsing, which focuses on the grammatical structure, and semantic parsing, which seeks to extract meaning. Hierarchical techniques such as constituency parsing and dependency parsing help structure sentences for applications like machine translation and question answering. However, ambiguity in sentence construction poses significant challenges that require sophisticated disambiguation models. In addition, text normalization processes like stemming and lemmatization are employed to reduce word forms to their roots or canonical forms. While stemming is fast and useful for high-speed applications, it can sometimes produce errors; lemmatization, though more computationally intensive, offers higher accuracy. Moreover, tokenization—which splits text into words or subwords—is critical for modern language models, and Part-of-Speech tagging assigns roles to words, further aiding tasks such as sentiment analysis and information retrieval.
"""

# Reference summary paired with sample_article_4.
reference_summary_4 = """
Parsing in NLP involves analyzing sentence structure through both syntactic and semantic methods to extract meaning. Techniques like constituency and dependency parsing support translation and question answering, while normalization via stemming and lemmatization ensures accurate word forms. Tokenization and POS tagging further prepare text for advanced NLP applications.
"""

# Sample Pair 5: the shortest article of the set, again on parsing, word
# normalization, tokenization and POS tagging.
sample_article_5 = """
In the realm of Natural Language Processing, parsing is essential for understanding text. It involves breaking down sentences into grammatical components to reveal relationships between words. Syntactic parsing examines sentence structure, whereas semantic parsing focuses on the underlying meaning. This process is bolstered by techniques such as constituency parsing and dependency parsing, which organize words into phrases and establish direct relationships, respectively. To enhance processing, NLP systems employ stemming to remove inflectional endings and lemmatization to convert words to their dictionary form. Moreover, tokenization divides text into tokens, which is vital for tasks like speech recognition and machine translation, while POS tagging assigns each word a grammatical label, aiding further analysis such as sentiment evaluation and named entity recognition.
"""

# Reference summary paired with sample_article_5.
reference_summary_5 = """
Parsing in NLP is fundamental for analyzing grammatical structures and extracting meaning. It leverages both syntactic and semantic parsing, alongside normalization methods like stemming and lemmatization, as well as tokenization and POS tagging, to prepare text for advanced processing.
"""

In [None]:
# Two parallel columns: entry i of "summary" is the reference for entry i of "article".
data_dict = dict(
    article=[
        sample_article_1,
        sample_article_2,
        sample_article_3,
        sample_article_4,
        sample_article_5,
    ],
    summary=[
        reference_summary_1,
        reference_summary_2,
        reference_summary_3,
        reference_summary_4,
        reference_summary_5,
    ],
)

# Wrap the column dictionary in a Hugging Face Dataset.
dataset = Dataset.from_dict(data_dict)


In [None]:
def preprocess(batch):
    """Tokenize a batch of article/summary pairs for seq2seq fine-tuning.

    Args:
        batch: Mapping with an "article" list and a "summary" list of strings,
            as passed by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenized articles (padded/truncated to 1024 tokens) with an added
        "labels" entry: the summary token ids padded/truncated to 256 tokens,
        with every padding position replaced by -100.
    """
    model_inputs = tokenizer(batch["article"], max_length=1024, truncation=True, padding="max_length")
    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=batch["summary"], max_length=256, truncation=True, padding="max_length")
    # Labels equal to -100 are ignored by the loss. Without this masking the
    # model would also be trained to predict the padding that fills each summary
    # up to 256 tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the whole dataset; with batched=True, `preprocess` is called with
# dict-of-lists batches rather than single examples.
tokenized_dataset = dataset.map(preprocess, batched=True)

In [None]:
# Fine-tuning configuration. Evaluation and checkpoint saving are switched off
# for this demonstration, and no external experiment tracker is used.
training_args = TrainingArguments(
    output_dir="./bart-finetuned-custom",
    # --- optimisation ---
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=10,  # Increase epochs for better learning
    per_device_train_batch_size=1,
    # --- logging ---
    logging_dir="./logs",
    logging_steps=1,
    report_to="none",
    # --- evaluation / checkpoints ---
    eval_strategy="no",   # For demonstration, we disable evaluation here
    save_strategy="no",
)

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Run the fine-tuning loop.
trainer.train()


In [None]:
def post_process(summary_text):
    """Remove repeated sentences from a generated summary.

    The text is split with NLTK's sentence tokenizer. Only the first occurrence
    of each (exactly identical) sentence is kept, in original order, and the
    survivors are re-joined with single spaces.

    Args:
        summary_text: Raw summary string produced by the model.

    Returns:
        The summary with duplicate sentences removed; an empty string when the
        input contains no sentences.
    """
    import nltk
    from nltk.tokenize import sent_tokenize

    # Newer NLTK releases load the sentence splitter from the "punkt_tab"
    # resource, older ones from "punkt". Fetch both so `sent_tokenize` works
    # with either (download() is a no-op when the data is already present).
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)

    sentences = sent_tokenize(summary_text)
    # dict.fromkeys keeps the first occurrence of every sentence in order.
    return " ".join(dict.fromkeys(sentences))


In [None]:
def generate_summary(article_text, use_sampling=False):
    """Summarize `article_text` with the notebook's `model` and `tokenizer`.

    Args:
        article_text: Source text; it is truncated to at most 1024 tokens.
        use_sampling: When True, decode with top-k / nucleus sampling;
            otherwise decode with 3-beam search and a 3-gram repetition block.

    Returns:
        The decoded summary after `post_process` has been applied to it.
    """
    encoded = tokenizer(article_text, return_tensors="pt", max_length=1024, truncation=True)
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Only the strategy-specific options differ between the two decoding modes.
    if use_sampling:
        strategy_options = dict(do_sample=True, temperature=1.0, top_k=50, top_p=0.95)
    else:
        strategy_options = dict(num_beams=3, no_repeat_ngram_size=3)

    summary_ids = model.generate(
        encoded["input_ids"],
        max_length=256,
        length_penalty=1.6,
        early_stopping=True,
        **strategy_options,
    )

    decoded = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return post_process(decoded)


In [None]:
# Sampling-based decoding is stochastic: seed the RNG first so the summary, and
# the ROUGE / BERTScore values computed from it, are reproducible on every run.
torch.manual_seed(42)
generated_summary = generate_summary(sample_article_1, use_sampling=True)
print("Generated Summary:\n", generated_summary)

In [None]:
# Score the fine-tuned model's summary against the first reference summary
# with stemmed ROUGE-1 / ROUGE-2 / ROUGE-L.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = scorer.score(reference_summary_1, generated_summary)

print("\nROUGE Scores:")
for key, score in rouge_scores.items():
    print(f"{key}: Precision={score.precision:.2f}, Recall={score.recall:.2f}, F1={score.fmeasure:.2f}")


In [None]:
from bert_score import score as bert_score
P, R, F1 = bert_score([generated_summary], [reference_summary_1], lang="en", verbose=True)
print("\nBERTScore:")
print(f"Precision: {P.mean().item():.4f}, Recall: {R.mean().item():.4f}, F1: {F1.mean().item():.4f}")

In [None]:
!pip install bert_score

In [None]:
!pip install sentence-transformers


In [None]:
!pip install transformers datasets rouge_score matplotlib nltk



In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import nltk
# `sent_tokenize` was split across two lines ("sent_token" / "ize"), which made
# this import fail; it is restored to a single identifier here.
from nltk.tokenize import sent_tokenize
from transformers import BartTokenizer, BartForConditionalGeneration

# Download NLTK data. Newer NLTK releases read the sentence splitter from
# "punkt_tab", older ones from "punkt", so fetch both.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

In [None]:
# Source article used for the encoder / attention inspection cells below
# (parsing, stemming, lemmatization, tokenization and POS tagging).
# The triple-quoted literal begins and ends with a newline that is part of the string.
sample_article = """
Parsing is the process of analyzing a sentence's grammatical structure to determine relationships between words. In NLP, parsing can be classified into syntactic parsing, which focuses on sentence structure, and semantic parsing, which extracts meaning. Constituency parsing breaks a sentence into hierarchical phrases, while dependency parsing maps direct word dependencies. Parsing is crucial in machine translation, question answering, and information extraction to ensure accurate text processing. One major challenge in parsing is handling ambiguous sentence structures, which require advanced models for disambiguation. Stemming is a process in NLP that removes prefixes and suffixes to reduce words to their root form. Common stemming algorithms, such as the Porter Stemmer and Snowball Stemmer, help standardize words like "running", "runner", and "runs" into "run". Stemming is widely used in search engines, text mining, and information retrieval to match word variations efficiently. A limitation of stemming is that it may produce incorrect root words, such as stemming "happiness" to "happi", leading to reduced accuracy. Despite its limitations, stemming is preferred in high-speed NLP applications where precision is less critical than performance. Lemmatization is a text normalization technique that reduces words to their base dictionary form while maintaining meaning. Unlike stemming, which may produce invalid words, lemmatization ensures that words like "better" are correctly mapped to "good". Lemmatization uses morphological analysis and part-of-speech tagging to determine the correct lemma for each word. This technique improves accuracy in search engines, text summarization, and document classification. Since lemmatization relies on linguistic rules, it is computationally more expensive than stemming but yields better results in context-aware NLP applications. 
Tokenization is the process of splitting text into smaller units called tokens, which can be words, subwords, or characters. Word-level tokenization treats each word separately, while subword tokenization (e.g., Byte Pair Encoding) helps handle unknown words effectively. Tokenization is an essential preprocessing step in language modeling, machine translation, and speech recognition. Challenges in tokenization include handling punctuation, contractions, and compound words, which can affect downstream NLP tasks. Advanced tokenization techniques, such as WordPiece and SentencePiece, improve performance in modern transformer-based models. Part-of-Speech (POS) tagging is the process of assigning grammatical categories like nouns, verbs, adjectives, and adverbs to words in a sentence. POS tagging helps in syntactic parsing, named entity recognition (NER), and sentiment analysis by identifying word roles in a sentence. Machine learning-based POS taggers outperform rule-based methods by leveraging context and large annotated datasets. Ambiguity is a challenge in POS tagging, as words like "lead" can be either a verb or a noun depending on the context. Modern POS tagging models, such as those using recurrent neural networks (RNNs) and transformers, achieve high accuracy across different languages.
"""

# Longer, five-sentence reference summary for `sample_article`
# (one sentence per topic).
reference_summary = """
Parsing in NLP involves analyzing grammatical structure to identify relationships between words, playing a vital role in tasks like translation and information extraction. Stemming reduces words to their root forms using algorithms like Porter and Snowball, improving performance in tasks such as search and text mining despite potential accuracy trade-offs. Lemmatization, unlike stemming, maps words to their dictionary forms based on linguistic rules, offering better context-aware normalization for applications like summarization and classification. Tokenization splits text into words or subwords, enabling preprocessing for models while handling challenges like punctuation and compound words. Part-of-Speech tagging assigns grammatical roles to words using advanced models like RNNs and transformers, aiding in tasks such as parsing and sentiment analysis.
"""

In [None]:
# Fresh copy of the pretrained checkpoint and its tokenizer for the inspection cells.
model_name = 'facebook/bart-large-cnn'
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Ask the model to return hidden states and attention weights from its forward passes.
model.config.output_hidden_states = True
model.config.output_attentions = True

In [None]:
# Encoder Part
# Tokenize the sample article (truncated to at most 1024 tokens) and unpack the
# two tensors that the encoder is called with.
inputs = tokenizer(sample_article, return_tensors='pt', max_length=1024, truncation=True)
input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']

In [None]:

# Inference only: run the encoder without autograd so that no computation graph
# (and none of its intermediate activations) is kept alive by `encoder_outputs`.
with torch.no_grad():
    encoder_outputs = model.model.encoder(input_ids, attention_mask=attention_mask)
# encoder_outputs.attentions is a tuple: one element per encoder layer.
# For visualization, pick the attention matrix from the first encoder layer, head 0.
encoder_attentions = encoder_outputs.attentions[0]  # shape: (batch_size, num_heads, seq_len, seq_len)
enc_attn_matrix = encoder_attentions[0, 0].detach().cpu().numpy()  # For first sample, first head

In [None]:
# Show the token strings that correspond to the encoder input ids.
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print("Tokenized Input:", tokens, sep="\n")


In [None]:
# Copy the encoder's positional-embedding weight matrix into a NumPy array.
pos_embed = model.model.encoder.embed_positions.weight.detach().cpu().numpy()
print("\nShape of Positional Embeddings:", pos_embed.shape)
print("First 10 positional embeddings (first 5 dimensions):")
# Rows 0..9 of the table, first five components of each.
for i in range(10):
    leading_dims = pos_embed[i, :5]
    print(f"Position {i}: {leading_dims}")

In [None]:
!pip install bertviz

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the pretrained checkpoint and its tokenizer for the attention visualization.
model_name = 'facebook/bart-large-cnn'
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Have forward passes return hidden states and attention weights.
model.config.output_hidden_states = True
model.config.output_attentions = True


In [None]:
# Token ids of the source excerpt fed to the encoder.
encoder_input_ids = tokenizer("Parsing is the process of analyzing a sentence's grammatical structure to determine relationships between words. In NLP, parsing can be classified into syntactic parsing, which focuses on sentence structure, and semantic parsing, which extracts meaning. Constituency parsing breaks a sentence into hierarchical phrases, while dependency parsing maps direct word dependencies..", return_tensors="pt", add_special_tokens=True).input_ids
# Token ids of the target sentence fed to the decoder. `text_target=` is the
# supported replacement for the deprecated `tokenizer.as_target_tokenizer()`
# context manager.
decoder_input_ids = tokenizer(text_target="Parsing in NLP involves analyzing grammatical structure to identify relationships between words, playing a vital role in tasks like translation and information extraction..", return_tensors="pt", add_special_tokens=True).input_ids

# One forward pass over the pair; `outputs` is what the visualization cell reads.
outputs = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids)

# Token strings used to label the two axes of the attention views.
encoder_text = tokenizer.convert_ids_to_tokens(encoder_input_ids[0])
decoder_text = tokenizer.convert_ids_to_tokens(decoder_input_ids[0])

In [None]:
from bertviz import model_view

# Render encoder self-attention, decoder self-attention and cross-attention
# from the forward pass, labelled with the encoder and decoder token strings.
model_view(
    encoder_tokens=encoder_text,
    decoder_tokens=decoder_text,
    encoder_attention=outputs.encoder_attentions,
    decoder_attention=outputs.decoder_attentions,
    cross_attention=outputs.cross_attentions,
)

In [None]:
# `display` and `HTML` belong to the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Register the d3 path with require.js in the notebook front end.
display(HTML("<script>require.config({paths: {d3: 'https://d3js.org/d3.v5.min'}});</script>"))
