In [13]:
    !pip install trafilatura


Collecting trafilatura
  Downloading trafilatura-2.0.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 9.3 MB/s eta 0:00:01
[?25hCollecting justext>=3.0.1
  Downloading justext-3.0.2-py2.py3-none-any.whl (837 kB)
[K     |████████████████████████████████| 837 kB 34.0 MB/s eta 0:00:01
[?25hCollecting lxml>=5.3.0
  Downloading lxml-6.0.2-cp39-cp39-macosx_10_9_universal2.whl (8.6 MB)
[K     |████████████████████████████████| 8.6 MB 28.5 MB/s eta 0:00:01
[?25hCollecting courlan>=1.3.2
  Downloading courlan-1.3.2-py3-none-any.whl (33 kB)
Collecting htmldate>=1.9.2
  Downloading htmldate-1.9.3-py3-none-any.whl (31 kB)
Collecting tld>=0.13
  Downloading tld-0.13.1-py2.py3-none-any.whl (274 kB)
[K     |████████████████████████████████| 274 kB 49.2 MB/s eta 0:00:01
[?25hCollecting lxml>=5.3.0
  Downloading lxml-5.4.0-cp39-cp39-macosx_10_9_universal2.whl (8.1 MB)
[K     |████████████████████████████████| 8.1 MB 14.6 MB/s eta 0:00:01
Collecting dateparser>=1.1.2

In [2]:
import trafilatura
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [20]:
# Tokenizer models for sent_tokenize/word_tokenize plus the stop-word lists.
# Recent NLTK releases load the tokenizer from 'punkt_tab' while older ones use
# 'punkt', so both are requested to keep the notebook runnable on either.
# NOTE(review): download() signals failure through its return value rather than
# by raising, so a failed download would first surface as a LookupError when a
# later cell tokenizes -- confirm whether a hard failure is wanted here.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)
print("NLTK data ready")

NLTK data ready


In [21]:
# Article used as the input document for every summarisation method below.
url = (
    "https://www.usatoday.com/story/news/politics/2025/06/13/"
    "pete-hegseth-pentagon-invade-greenland-plan/84188458007/"
)

In [22]:
# Download the raw page. fetch_url returns None when the request fails, so stop
# here with a clear message instead of letting a later cell fail on None.
downloaded = trafilatura.fetch_url(url)
if downloaded is None:
    raise RuntimeError(f"Could not download article from {url}")

# Pull the main article text out of the page. extract returns None when no
# usable text is found; every later cell needs a non-empty string.
article = trafilatura.extract(downloaded)
if not article:
    raise RuntimeError(f"No article text could be extracted from {url}")

In [24]:
# Split the extracted text into sentences once; `sentences` is reused by the
# Lead-3 and compression cells further down.
sentences = sent_tokenize(article)

# Sanity check on what was fetched: overall size plus a short preview.
for report_line in (
    f"Article fetched: {len(article)} characters, {len(sentences)} sentences",
    f"\nFirst 300 characters:\n{article[:300]}...\n",
):
    print(report_line)

Article fetched: 1732 characters, 13 sentences

First 300 characters:
Hegseth says Pentagon has many 'contingencies' in Greenland - including invading it
Defense Secretary Pete Hegseth said the Pentagon has plans for multiple "contingencies" in Greenland – including an invasion of the island.
Asked by Republican Rep. Mike Turner at a June 12 House Armed Services Commi...



In [26]:
def textrank_summary(text, num_sentences=3):
    """Build an extractive summary by ranking sentences with PageRank.

    Each sentence is turned into a TF-IDF vector, the pairwise cosine
    similarities become the edge weights of a sentence graph, and the PageRank
    score of each node decides which sentences are kept.

    Args:
        text: The document to summarise.
        num_sentences: Number of sentences to keep (default 3).

    Returns:
        The selected sentences joined by single spaces, in the order in which
        they occur in ``text``. ``text`` is returned unchanged when it has
        fewer than ``num_sentences`` sentences, and an empty string is returned
        when ``num_sentences`` is zero or negative.
    """
    # Split the text into sentences
    sentences = sent_tokenize(text)

    if len(sentences) < num_sentences:
        return text
    if num_sentences <= 0:
        # Nothing requested; also avoids slicing with a negative bound below,
        # which would silently mean "everything except the last N".
        return ''

    # One TF-IDF vector per sentence
    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)

    # Similarity between every pair of sentences
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Weighted graph over sentence positions, scored with PageRank
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)

    # Rank sentence *positions* rather than sentence text: looking a sentence
    # up again by its text is quadratic and sends repeated sentences to the
    # position of their first occurrence. Ties on score fall back to the
    # sentence text, as before.
    ranked_positions = sorted(
        range(len(sentences)),
        key=lambda i: (scores[i], sentences[i]),
        reverse=True,
    )

    # Keep the best N, then put them back into document order
    chosen_positions = sorted(ranked_positions[:num_sentences])

    return ' '.join(sentences[i] for i in chosen_positions)

# Describe the method, then run TextRank on the article and report the result.
for heading in (
    "\nAlgorithm: Graph-based ranking (like Google PageRank)",
    "Method: Extracts most central sentences based on similarity",
    "\n--- TEXTRANK SUMMARY ---\n",
):
    print(heading)

# Keep the three highest-ranked sentences; the comparison table reads
# `textrank_result` later on.
textrank_result = textrank_summary(text=article, num_sentences=3)
print(textrank_result)
print(f"\n Extracted {len(sent_tokenize(textrank_result))} sentences")


Algorithm: Graph-based ranking (like Google PageRank)
Method: Extracts most central sentences based on similarity

--- TEXTRANK SUMMARY ---

Hegseth says Pentagon has many 'contingencies' in Greenland - including invading it
Defense Secretary Pete Hegseth said the Pentagon has plans for multiple "contingencies" in Greenland – including an invasion of the island. "It is not your testimony today that there are plans at the Pentagon for taking by force or invading Greenland, correct? During a March visit to Pituffik Space Base, the U.S. base on Greenland, Vice President JD Vance accused Denmark of "failing" to protect the Arctic island while downplaying Trump's threats to take it over by force.

 Extracted 3 sentences


In [29]:
# Section banner for the second method (frequency-based sentence scoring).
print("QUESTION 2: FREQUENCY-BASED SENTENCE SCORING")

def frequency_based_summary(text, num_sentences=3):
    """Build an extractive summary from word-frequency sentence scores.

    Words are lower-cased, and English stop words and non-alphanumeric tokens
    are dropped. A sentence's score is the mean document-wide frequency of its
    remaining words; sentences with no remaining words are not scored.
    Identical sentences are considered once, at their first position.

    Args:
        text: The document to summarise.
        num_sentences: Number of sentences to keep (default 3).

    Returns:
        The highest-scoring sentences joined by single spaces, in the order in
        which they occur in ``text``. ``text`` is returned unchanged when it
        has fewer than ``num_sentences`` sentences, and an empty string is
        returned when ``num_sentences`` is zero or negative.
    """
    # Split into sentences
    sentences = sent_tokenize(text)

    if len(sentences) < num_sentences:
        return text
    if num_sentences <= 0:
        # Nothing requested; also avoids slicing with a negative bound below,
        # which would silently mean "everything except the last N".
        return ''

    stop_words = set(stopwords.words('english'))

    def _content_words(fragment):
        # Lower-cased tokens with stop words and punctuation-like tokens removed.
        return [
            token
            for token in word_tokenize(fragment.lower())
            if token.isalnum() and token not in stop_words
        ]

    # Document-wide frequency of every content word
    word_freq = Counter(_content_words(text))

    # sentence text -> (first position in the document, mean word frequency).
    # Recording the position while scoring removes the need to search the
    # sentence list again for every selected sentence.
    scored = {}
    for position, sentence in enumerate(sentences):
        if sentence in scored:
            continue  # same text, same score: keep the first occurrence
        sentence_words = _content_words(sentence)
        if sentence_words:
            total_score = sum(word_freq[word] for word in sentence_words)
            scored[sentence] = (position, total_score / len(sentence_words))

    # Best scores first (ties keep document order), then cut to the top N
    top_sentences = sorted(
        scored.items(),
        key=lambda item: item[1][1],
        reverse=True,
    )[:num_sentences]

    # Back into document order for readability
    top_sentences.sort(key=lambda item: item[1][0])

    return ' '.join(sentence for sentence, _ in top_sentences)

# Describe the method, then run it on the article and report the result.
for heading in (
    "\nAlgorithm: Word frequency scoring",
    "Method: Extracts sentences with highest-frequency words",
    "\nFREQUENCY-BASED SUMMARY \n",
):
    print(heading)

# Keep the three best-scoring sentences; the comparison table reads
# `freq_result` later on.
freq_result = frequency_based_summary(text=article, num_sentences=3)
print(freq_result)
print(f"\n Extracted {len(sent_tokenize(freq_result))} sentences")


QUESTION 2: FREQUENCY-BASED SENTENCE SCORING

Algorithm: Word frequency scoring
Method: Extracts sentences with highest-frequency words

FREQUENCY-BASED SUMMARY 

Hegseth says Pentagon has many 'contingencies' in Greenland - including invading it
Defense Secretary Pete Hegseth said the Pentagon has plans for multiple "contingencies" in Greenland – including an invasion of the island. "It is not your testimony today that there are plans at the Pentagon for taking by force or invading Greenland, correct? "The U.S. shall not take over Greenland.

 Extracted 3 sentences


In [33]:
print("QUESTION 3: ABSTRACTIVE SUMMARY (BART TRANSFORMER)")

print("\nAlgorithm: Neural transformer model (BART)")
print("Method: Generates new text (not extraction)")
print("Model: facebook/bart-large-cnn (pre-trained on CNN/DailyMail)")

# Best-effort cell: a missing dependency or a model failure is reported and the
# rest of the notebook can still run, since no later cell reads `result`.
try:
    # Imported here so a missing `transformers` install is caught below.
    from transformers import pipeline

    print("\nLoading BART model (may take 1-2 minutes on first run)...")

    # Summarisation pipeline; device=-1 runs the model on the CPU.
    summarizer = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        device=-1
    )

    # Coarse word-level cap that keeps very long articles manageable.
    max_words = 1000
    article_words = article.split()
    input_text = article if len(article_words) < max_words else ' '.join(article_words[:max_words])

    print("Generating abstractive summary...\n")

    # The model's input limit is counted in tokens, not words, so the cap above
    # does not guarantee the input fits; truncation=True lets the tokenizer
    # clip over-long input instead of the model call failing.
    result = summarizer(
        input_text,
        max_length=150,
        min_length=50,
        do_sample=False,
        truncation=True
    )

    print("   BART ABSTRACTIVE SUMMARY \n")
    print(result[0]['summary_text'])
    print(f"\n Generated new text using neural model")

except ImportError as e:
    print(f"\n Could not import transformers: {e}")

except Exception as e:
    # Show the exception type and the full message; a truncated message hides
    # the part of the error that usually explains the failure.
    print(f"\n Error: {type(e).__name__}: {e}")

QUESTION 3: ABSTRACTIVE SUMMARY (BART TRANSFORMER)

Algorithm: Neural transformer model (BART)
Method: Generates new text (not extraction)
Model: facebook/bart-large-cnn (pre-trained on CNN/DailyMail)

Loading BART model (may take 1-2 minutes on first run)...


Device set to use cpu


Generating abstractive summary...

   BART ABSTRACTIVE SUMMARY 

Defense Secretary Pete Hegseth said the Pentagon has plans for multiple "contingencies" in Greenland – including an invasion of the island. President Donald Trump has declined to rule out force in his pledge to "get Greenland," although he has said it won't be necessary. He has insisted acquiring Greenland is necessary for national security.

 Generated new text using neural model


In [32]:
# Section banner and method description for the Lead-3 baseline.
for heading in (
    "QUESTION 4: LEAD-3 SUMMARY",
    "\nAlgorithm: Baseline method",
    "Method: Extract first 3 sentences (inverted pyramid structure)",
    "\n    LEAD-3 SUMMARY \n",
):
    print(heading)

# Baseline: the opening three sentences of the article, in their original
# order. Relies on `sentences` produced by the tokenization cell.
opening_sentences = sentences[:3]
lead3_result = ' '.join(opening_sentences)

print(lead3_result)
print(f"\n Extracted first 3 sentences")


QUESTION 4: LEAD-3 SUMMARY

Algorithm: Baseline method
Method: Extract first 3 sentences (inverted pyramid structure)

    LEAD-3 SUMMARY 

Hegseth says Pentagon has many 'contingencies' in Greenland - including invading it
Defense Secretary Pete Hegseth said the Pentagon has plans for multiple "contingencies" in Greenland – including an invasion of the island. Asked by Republican Rep. Mike Turner at a June 12 House Armed Services Committee hearing to confirm whether there are plans to invade Greenland, Hegseth said, "The Pentagon has plans for any number of contingencies." "It is not your testimony today that there are plans at the Pentagon for taking by force or invading Greenland, correct?

 Extracted first 3 sentences


In [35]:
print("QUESTION 5: MANUAL COMPRESSION SUMMARY (20%)")


# Sentence budget for a 20% summary: truncate toward zero, but never go below
# one sentence.
total_sentences = len(sentences)
num_sentences_20pct = int(total_sentences * 0.2)
if num_sentences_20pct < 1:
    num_sentences_20pct = 1

print(f"\nAlgorithm: Frequency-based compression")
print(f"Method: Extract top 20% of sentences by importance")
print(f"\nOriginal: {total_sentences} sentences")
print(f"Target (20%): {num_sentences_20pct} sentences")
# NOTE(review): the ratio divides by total_sentences, so an article with no
# sentences would raise ZeroDivisionError on this line.
print(f"Compression: {total_sentences} → {num_sentences_20pct} ({(num_sentences_20pct/total_sentences)*100:.1f}%)")

def compression_summary(text, percentage=0.2):
    """Summarise ``text`` down to a fraction of its sentences.

    The sentence budget is ``int(sentence_count * percentage)`` with a floor of
    one sentence; choosing which sentences to keep is delegated to
    ``frequency_based_summary``.

    Args:
        text: The document to summarise.
        percentage: Fraction of the sentences to keep (default 0.2).

    Returns:
        Whatever ``frequency_based_summary`` returns for that budget.
    """
    sentence_count = len(sent_tokenize(text))

    # Truncate toward zero, then enforce the one-sentence minimum.
    budget = int(sentence_count * percentage)
    if budget < 1:
        budget = 1

    return frequency_based_summary(text, num_sentences=budget)

print("\n    COMPRESSION SUMMARY (20%) \n")

# Run the 20% compression on the article; the comparison table reads
# `compression_result` later on.
compression_result = compression_summary(text=article, percentage=0.2)
print(compression_result)
print(f"\n Compressed to {len(sent_tokenize(compression_result))} sentences")

QUESTION 5: MANUAL COMPRESSION SUMMARY (20%)

Algorithm: Frequency-based compression
Method: Extract top 20% of sentences by importance

Original: 13 sentences
Target (20%): 2 sentences
Compression: 13 → 2 (15.4%)

    COMPRESSION SUMMARY (20%) 

"It is not your testimony today that there are plans at the Pentagon for taking by force or invading Greenland, correct? "The U.S. shall not take over Greenland.

 Compressed to 2 sentences


In [38]:
# Section banner and method description for the LLM comparison point.
print("QUESTION 6: LLM SUMMARY")

print("\nAlgorithm: Large Language Model")
print("Method: Advanced neural model with billions of parameters")
print("Note: This is a simulated LLM summary for demonstration")


# Hard-coded stand-in text: no model is called in this cell and the string is
# not derived from `article`. Its embedded line breaks are part of the value,
# so they count towards the character total shown in the comparison table.
# NOTE(review): the text mentions Panama, Signal messaging and Houthi strikes,
# none of which appear in the other summaries printed by this notebook --
# confirm it is meant to describe the same article.
llm_summary = """Defense Secretary Pete Hegseth faced scorching questioning during a congressional hearing 
in which he appeared to affirm Pentagon contingency plans for potential military action in Greenland and Panama. 
The hearing turned heated as Democratic legislators questioned Hegseth about his use of Signal messaging to discuss 
sensitive military activities, such as strikes against Houthi rebels in Yemen. Hegseth dodged straight answers repeatedly, citing
position at the "pleasure of the president" while claiming the Pentagon has plans
for "any contingency." The scandal is against the backdrop of President Trump's
repeated assertion that he would like to buy Greenland, which has been firmly rebuffed by its leaders."""

print("\n    LLM-STYLE SUMMARY \n")
print(llm_summary)
print(f"\n Generated coherent narrative summary")

# Show example code for real LLM API usage
# The snippet below is only printed, never executed. It sits in a plain
# (non-f) string, so `{article}` is shown literally rather than interpolated.
# NOTE(review): the printed example passes the API key as a string literal;
# consider showing it read from an environment variable instead, so readers
# are not encouraged to paste secrets into a notebook.
print("TO USE REAL LLM API (e.g., OpenAI GPT):")

print("""
# Install: pip install openai
import openai

client = openai.OpenAI(api_key='your-api-key-here')
response = client.chat.completions.create(
    model='gpt-4',
    messages=[
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': f'Summarize this article concisely: {article}'}
    ],
    max_tokens=200
)
print(response.choices[0].message.content)
""")



print("COMPARISON OF ALL METHODS")

print(f"\n{'Method':<30} {'Sentences':<12} {'Characters':<12} {'Type'}")


# Every row is (method, sentence count, character count, summary type).
# The article row reuses the sentence count computed for the compression cell.
comparison_data = [("Original Article", total_sentences, len(article), "Full text")]

# Methods whose summary text is held in a variable are measured directly.
# The BART row carries no text here (None) and keeps its placeholder cells.
for label, summary, kind in (
    ("1. TextRank", textrank_result, "Extractive"),
    ("2. Frequency-Based", freq_result, "Extractive"),
    ("3. BART (Abstractive)", None, "Abstractive"),
    ("4. Lead-3", lead3_result, "Extractive"),
    ("5. Compression (20%)", compression_result, "Extractive"),
    ("6. LLM", llm_summary, "Abstractive"),
):
    if summary is None:
        comparison_data.append((label, "Variable", "Variable", kind))
    else:
        comparison_data.append((label, len(sent_tokenize(summary)), len(summary), kind))

# One left-aligned line per method, using the same column widths as the header.
for row in comparison_data:
    method, sents, chars, type_method = row
    print(f"{method:<30} {str(sents):<12} {str(chars):<12} {type_method}")

QUESTION 6: LLM SUMMARY

Algorithm: Large Language Model
Method: Advanced neural model with billions of parameters
Note: This is a simulated LLM summary for demonstration

    LLM-STYLE SUMMARY 

Defense Secretary Pete Hegseth faced scorching questioning during a congressional hearing 
in which he appeared to affirm Pentagon contingency plans for potential military action in Greenland and Panama. 
The hearing turned heated as Democratic legislators questioned Hegseth about his use of Signal messaging to discuss 
sensitive military activities, such as strikes against Houthi rebels in Yemen. Hegseth dodged straight answers repeatedly, citing
position at the "pleasure of the president" while claiming the Pentagon has plans
for "any contingency." The scandal is against the backdrop of President Trump's
repeated assertion that he would like to buy Greenland, which has been firmly rebuffed by its leaders.

 Generated coherent narrative summary
TO USE REAL LLM API (e.g., OpenAI GPT):

# Insta