In [1]:
# main.py
# This script provides the core functionality for an Article Summarization and Link Analysis Platform.
# It uses pre-trained models for summarization and established libraries for web scraping and NLP tasks.

In [2]:
# !pip install rouge-score
# !pip install yake

In [3]:
# --- 1. Installation ---
# Before running, make sure you have all the necessary libraries installed.
# You can install them using pip:
# pip install torch transformers beautifulsoup4 requests scikit-learn rouge-score yake nltk

import torch
from transformers import BartTokenizer, BartForConditionalGeneration
from bs4 import BeautifulSoup
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
import yake
import nltk

# Fetch NLTK's 'punkt' sentence-tokenizer data.
# Per the cell output, an already-installed package is reported as up-to-date.
# NOTE(review): no visible cell calls an NLTK tokenizer — confirm this download is still needed.
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to C:\Users\Tarun
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Tarun
[nltk_data]     Singh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# --- 2. Model and Tokenizer Initialization ---
# We use a pre-trained BART model from Hugging Face, which is excellent for summarization.
# This avoids the need for training a model from scratch, which is very resource-intensive.
def initialize_model():
    """
    Build the tokenizer/model pair used for summarization.

    Both objects are loaded with ``from_pretrained`` from the
    'facebook/bart-large-cnn' checkpoint.

    Returns:
        tuple: ``(tokenizer, model)`` — a ``BartTokenizer`` and a
        ``BartForConditionalGeneration`` instance, in that order.
    """
    checkpoint = 'facebook/bart-large-cnn'
    print("Initializing the summarization model...")
    # Tokenizer first (text -> token ids), then the seq2seq model itself.
    bart_tokenizer = BartTokenizer.from_pretrained(checkpoint)
    bart_model = BartForConditionalGeneration.from_pretrained(checkpoint)
    print("Model initialized successfully.")
    return bart_tokenizer, bart_model

In [5]:
# --- 3. Article Scraping ---
def get_article_text(url, timeout=10):
    """
    Fetch a web page and return the text of all its <p> elements.

    Args:
        url: Address of the article to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the notebook cell.

    Returns:
        str | None: The paragraph texts joined with single spaces, or ``None``
        when the request fails (network error, timeout, 4xx/5xx status) or
        when the page yields no paragraph text.
    """
    print(f"Fetching article from: {url}")
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses alike.
        print(f"Error fetching the URL: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Simple heuristic: treat every <p> tag as article body text.
    # This might need adjustment for sites with a different structure.
    paragraphs = soup.find_all('p')
    article_text = ' '.join(p.get_text() for p in paragraphs)

    # Empty <p> tags join into a string of spaces, which is truthy; strip
    # before testing so such pages are reported as "no text" as well.
    if not article_text.strip():
        print("Warning: No text could be extracted from <p> tags. The page might be structured differently.")
        return None

    print("Article text extracted successfully.")
    return article_text

In [6]:
# --- 4. Text Summarization ---
def summarize_text(tokenizer, model, text, max_summary_length=150, min_summary_length=50):
    """
    Generate a summary of ``text`` with the supplied tokenizer and model.

    Args:
        tokenizer: Callable tokenizer that returns a mapping containing
            'input_ids' and 'attention_mask', and offers ``decode``.
        model: Object exposing ``generate`` (e.g. the BART model returned by
            ``initialize_model``).
        text: The article text to summarize.
        max_summary_length: Passed to ``generate`` as ``max_length``.
        min_summary_length: Passed to ``generate`` as ``min_length``.

    Returns:
        str: The decoded summary, or the message
        "No text provided for summarization." when ``text`` is empty,
        ``None`` or whitespace-only.
    """
    # Whitespace-only input carries nothing to summarize, so treat it like
    # an empty string instead of sending it through the model.
    if not text or not text.strip():
        return "No text provided for summarization."

    print("Generating summary...")
    # Tokenize as a batch of one; input longer than 1024 tokens is truncated.
    inputs = tokenizer([text], max_length=1024, return_tensors='pt', truncation=True)

    # Forward the attention mask explicitly so generation does not have to
    # infer which positions are real tokens.
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        num_beams=4,  # `num_beams` > 1 uses beam search for higher quality output
        max_length=max_summary_length,
        min_length=min_summary_length,
        early_stopping=True
    )

    # Only one sequence was submitted, so decode the first (and only) result.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print("Summary generated.")
    return summary

In [7]:
# --- 5. Keyword Extraction ---
def extract_keywords(text):
    """
    Return the key phrases YAKE ranks highest for ``text``.

    Args:
        text: The document to analyse.

    Returns:
        list: Up to ten keyword strings (phrases of at most two words),
        without their scores. An empty list when ``text`` is falsy.
    """
    if text:
        print("Extracting keywords...")
        # top=10 caps the number of results; n=2 allows phrases up to 2-grams.
        extractor = yake.KeywordExtractor(top=10, n=2)
        scored_phrases = extractor.extract_keywords(text)

        # Each result is a (phrase, score) pair; keep only the phrase.
        keyword_list = [phrase for phrase, _ in scored_phrases]
        print(f"Keywords found: {keyword_list}")
        return keyword_list

    return []

In [8]:
# --- 6. Finding Related Articles ---
def find_related_articles(target_article_text, articles_db):
    """
    Pick the entry of ``articles_db`` most similar to the target article.

    All texts are vectorized together with TF-IDF (English stop words
    removed) and the target is compared against every database entry with
    cosine similarity.

    Args:
        target_article_text: Text of the article to match.
        articles_db: Sequence of dicts, each with 'title' and 'text' keys.

    Returns:
        tuple: ``(article, score)`` where ``article`` is the best-matching
        dict from ``articles_db`` and ``score`` its cosine similarity.
        ``(None, None)`` when the target text is empty or the database has
        no entries.
    """
    if not target_article_text:
        return None, None

    # With nothing to compare against, the similarity step below would be
    # handed an empty matrix and raise; report "no match" instead.
    if not articles_db:
        print("No articles available to compare against.")
        return None, None

    print("Finding related articles...")
    # Create a list of all article texts, with the target article at the end.
    all_texts = [article['text'] for article in articles_db]
    all_texts.append(target_article_text)

    # Fit one shared vocabulary so target and database vectors are comparable.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(all_texts)

    # Last row is the target; every row before it is a database article.
    cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

    # Row order matches articles_db, so the argmax indexes straight into it.
    most_similar_article_index = cosine_similarities.argmax()
    highest_similarity_score = cosine_similarities[most_similar_article_index]
    related_article = articles_db[most_similar_article_index]

    print(f"Most related article found: '{related_article['title']}' with a similarity score of {highest_similarity_score:.2f}")
    return related_article, highest_similarity_score

In [9]:
# --- 7. ROUGE Score Evaluation ---
def evaluate_summary(generated_summary, reference_summary):
    """
    Print ROUGE-1, ROUGE-2 and ROUGE-L scores for a generated summary.

    Args:
        generated_summary: The summary to score.
        reference_summary: The reference summary it is compared against.

    Returns:
        None. Both summaries and the precision / recall / F-measure of each
        metric are written to stdout.
    """
    print("\n--- ROUGE Evaluation ---")
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    rouge = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    # The reference is passed first, the generated text second.
    results = rouge.score(reference_summary, generated_summary)

    print(f"Generated Summary: {generated_summary}")
    print(f"Reference Summary: {reference_summary}")
    print("\nScores:")
    for metric in results:
        result = results[metric]
        print(f"  {metric}: Precision={result.precision:.4f}, Recall={result.recall:.4f}, F-measure={result.fmeasure:.4f}")
    print("------------------------")

In [None]:
# --- Main Execution ---
if __name__ == "__main__":
    # Driver: load the model, obtain an article (pasted text or URL), then
    # summarize it, extract keywords, find the closest mock-database article
    # and print a ROUGE comparison against a fixed reference summary.

    # Initialize the model and tokenizer (this only happens once).
    tokenizer, model = initialize_model()

    # --- Mock Database of Articles for "Related Articles" functionality ---
    # In a real application, this would be a large database or search index.
    articles_database = [
        {
            "title": "Global Markets Rally on Tech Sector Growth",
            "text": "Stock markets around the world saw a significant surge today, largely driven by strong quarterly earnings reports from major technology companies. The tech-heavy NASDAQ composite index led the gains, closing up 3%. Investors are optimistic about the future of innovation and digital transformation."
        },
        {
            "title": "New Study Reveals Benefits of a Four-Day Work Week",
            "text": "A landmark study involving 50 companies has found that a four-day work week leads to increased productivity, higher employee satisfaction, and reduced burnout. Many participating companies have decided to make the policy permanent after seeing positive results in both performance and well-being."
        },
        {
            "title": "Breakthrough in AI-Powered Drug Discovery",
            "text": "Researchers have developed a new artificial intelligence system that can predict the structure of proteins with unprecedented accuracy. This breakthrough is expected to dramatically accelerate the process of drug discovery and development, potentially leading to new treatments for a wide range of diseases."
        }
    ]

    # --- Example Usage ---
    # The user can choose to paste article text or enter a URL.
    # The menu numbering must match both the prompt and the branches below:
    # "1" = pasted text, "2" = URL.
    print("Choose input method:")
    print("1. Paste article text manually")
    print("2. Enter a URL to fetch the article")
    choice = input("Enter \"1\" (for Text)\n or \"2\" (for URL): ").strip()

    article_text_to_process = None

    if choice == '1':
        article_text_to_process = input("Please paste your article text here and press Enter (or Shift+Enter):\n")
        # Nothing useful can be produced from blank input, so stop early.
        if not article_text_to_process.strip():
            print("No article text provided. Exiting.")
            raise SystemExit(1)

    elif choice == '2':
        url = input("Please enter the article URL: ").strip()
        article_text_to_process = get_article_text(url)
        if not article_text_to_process:
            print("Failed to fetch or extract article text from the URL.")
            # Raise SystemExit directly instead of calling the interactive
            # `exit()` helper; a non-zero code marks the run as failed.
            raise SystemExit(1)
    else:
        print("Invalid choice. Exiting.")
        raise SystemExit(1)

    print(f"\n--- Processing Article ---")
    # Preview only the first 500 characters to keep the output readable.
    print(f"Article Text: '{article_text_to_process[:500]}...'")

    # 1. Generate the summary
    generated_summary = summarize_text(tokenizer, model, article_text_to_process)

    # 2. Extract keywords
    keywords = extract_keywords(article_text_to_process)

    # 3. Find related articles from our database
    related_article, score = find_related_articles(article_text_to_process, articles_database)

    # --- Display Results ---
    print("\n\n=============================================")
    print("          ANALYSIS COMPLETE")
    print("=============================================")
    print("\n✅ GENERATED SUMMARY:")
    print(generated_summary)
    print("\n✅ EXTRACTED KEYWORDS:")
    print(", ".join(keywords))
    if related_article:
        print(f"\n✅ MOST RELATED ARTICLE (from our database):")
        print(f"   Title: {related_article['title']} (Similarity Score: {score:.2f})")
    print("=============================================\n")

    # --- Example of ROUGE Evaluation ---
    # To evaluate, you need a "gold standard" or reference summary.
    # NOTE: this reference is a fixed example text; it is not derived from the
    # article chosen above, so the scores are only meaningful when the input
    # article matches it.
    reference_summary_for_evaluation = "A new AI model can analyze medical images more accurately than human radiologists, helping doctors diagnose diseases earlier. This technology, trained on millions of images, is expected to improve patient care and accelerate drug development."
    evaluate_summary(generated_summary, reference_summary_for_evaluation)

Initializing the summarization model...




Model initialized successfully.
Choose input method:
1. Enter a URL to fetch the article
2. Paste article text manually
Fetching article from: https://www.bbc.com/news/articles/ckgj7jxkq58o
Fetching article from: https://www.bbc.com/news/articles/ckgj7jxkq58o
Article text extracted successfully.

--- Processing Article ---
Article Text: 'A US appeals court has ruled that most tariffs issued by US President Donald Trump are illegal, setting up a potential legal showdown that could upend his foreign policy agenda. The ruling affects Trump's so-called "reciprocal" tariffs, imposed on most countries around the world, as well as other tariffs slapped on China, Mexico and Canada. In a 7-4 decision, the US Court of Appeals for the Federal Circuit rejected Trump's argument that the tariffs were permitted under an emergency economic powe...'
Generating summary...
Article text extracted successfully.

--- Processing Article ---
Article Text: 'A US appeals court has ruled that most tariffs issued

In [14]:
# Save locally
# Writes the tokenizer and the model into the same directory via save_pretrained.
# Relies on `tokenizer` and `model` already existing in the kernel; they are
# assigned in the main-execution cell, so that cell must have been run first.
save_directory = "./saved_bart_model"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

print(f"✅ Model saved to {save_directory}")

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


✅ Model saved to ./saved_bart_model
