In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import requests
from bs4 import BeautifulSoup
import time
from transformers import pipeline, AutoTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from collections import Counter

# ------------------ BART Summarizer Setup ------------------ #
# The summarization pipeline and the tokenizer are both loaded from the same
# checkpoint name; `tokenizer` is used separately to count input tokens
# before calling `summarizer`.
model_name = "facebook/bart-large-cnn"
summarizer = pipeline("summarization", model=model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ------------------ NLTK Sentiment Analyzer Setup ------------------ #
# SentimentIntensityAnalyzer raises LookupError when the VADER lexicon has
# not been downloaded yet. Fetch it on demand (only when missing) instead of
# relying on a manual nltk.download('vader_lexicon') having been run before.
try:
    sid = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()

def summarize_with_bart(text):
    """Summarize *text* with the module-level BART ``summarizer`` pipeline.

    The input is tokenized with the module-level ``tokenizer`` to size the
    summary: inputs that encode to fewer than 10 tokens are returned as-is
    (stripped), otherwise the summary is capped at ``min(100, tokens - 1)``
    tokens with a floor of half that cap (never below 5).

    Args:
        text: The text to summarize.

    Returns:
        The generated summary, the stripped input when it is too short to
        summarize, or the string ``"Summary not available."`` when
        tokenization or summarization fails.
    """
    try:
        # Only the token count is needed, so no tensor is built.
        # truncation=True clips over-long inputs to the tokenizer's maximum
        # length instead of feeding the model more tokens than it accepts.
        input_len = len(tokenizer.encode(text, truncation=True))

        if input_len < 10:
            return text.strip()

        # Keep the summary strictly shorter than the input.
        max_len = min(100, input_len - 1)
        min_len = max(5, max_len // 2)

        summary = summarizer(
            text,
            max_length=max_len,
            min_length=min_len,
            do_sample=False,
            truncation=True,
        )
        return summary[0]['summary_text']
    except Exception as e:
        # Best-effort: one failed summary must not abort the whole run.
        print(f"⚠️ BART summarization failed: {e}")
        return "Summary not available."

# ------------------ Sentiment Function ------------------ #
def analyze_sentiment(text):
    """Classify *text* as 'Positive', 'Negative' or 'Neutral'.

    The label is derived from the ``compound`` value returned by the
    module-level analyzer ``sid``: at or above 0.05 is positive, at or below
    -0.05 is negative, anything in between is neutral.
    """
    compound = sid.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'

# ------------------ Selenium Setup ------------------ #
def setup_driver():
    """Create and return a headless Chrome WebDriver.

    The browser is started with the headless, no-sandbox and
    disable-dev-shm-usage flags and a generic ``Mozilla/5.0`` user agent.
    """
    chrome_options = Options()
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "user-agent=Mozilla/5.0",
    ):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(service=Service(), options=chrome_options)

# ------------------ Reuters Scraper ------------------ #
def scrape_reuters_bs(keyword):
    """Search Reuters for *keyword* and collect up to 50 result teasers.

    Args:
        keyword: Search term; it is passed as a query parameter so that
            spaces and special characters are URL-encoded by ``requests``.

    Returns:
        A list of dicts with the keys ``"title"``, ``"link"`` and
        ``"summary"``. The list is empty when the request fails, the server
        does not answer with HTTP 200, or no result items are found.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.reuters.com/"
    }
    try:
        response = requests.get(
            "https://www.reuters.com/site-search/",
            params={"query": keyword},
            headers=headers,
            timeout=15,  # without a timeout a stalled connection hangs forever
        )
    except requests.RequestException as e:
        # Treat network failures like an HTTP error: report and return nothing.
        print(f"❌ Reuters request failed: {e}")
        return []

    if response.status_code != 200:
        print(f"❌ Reuters HTTP error: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    results = []

    for article in soup.select("div.search-results__item")[:50]:
        try:
            title_tag = article.find("h3")
            summary_tag = article.find("p")
            link_tag = article.find("a", href=True)
            # An entry is only useful with a title, a teaser and a link.
            if not title_tag or not summary_tag or not link_tag:
                continue
            link = link_tag['href']
            if link.startswith("/"):
                # Site-relative link: make it absolute.
                link = "https://www.reuters.com" + link
            results.append({
                "title": title_tag.get_text(strip=True),
                "link": link,
                "summary": summary_tag.get_text(strip=True),
            })
        except Exception as e:
            # Best-effort: a malformed entry must not abort the whole scrape.
            print(f"⚠️ Skipped one Reuters article due to: {e}")
    return results

# ------------------ NPR Scraper with Pagination ------------------ #
def scrape_npr_selenium(keyword, max_articles):
    """Scrape NPR search results for *keyword* page by page with Selenium.

    Args:
        keyword: Search term; it is URL-encoded before being placed in the
            search URL.
        max_articles: Maximum number of articles to collect. Values of zero
            or less return an empty list without starting a browser.

    Returns:
        A list of at most ``max_articles`` dicts with the keys ``"title"``,
        ``"link"`` and ``"summary"``. Fewer are returned when the search
        runs out of usable results.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import quote_plus

    results = []
    if max_articles <= 0:
        return results

    query = quote_plus(keyword)
    driver = setup_driver()
    try:
        page = 1
        while len(results) < max_articles:
            driver.get(f"https://www.npr.org/search?query={query}&page={page}")
            time.sleep(3)  # give the page time to render its results
            articles = driver.find_elements(By.CSS_SELECTOR, "article.item")
            collected_before = len(results)

            for article in articles:
                if len(results) >= max_articles:
                    break
                try:
                    # find_elements returns an empty list instead of raising,
                    # so missing parts can be skipped without try/except.
                    title_elems = article.find_elements(By.CSS_SELECTOR, "h2, h3")
                    teaser_elems = article.find_elements(By.CLASS_NAME, "teaser")
                    if not title_elems or not teaser_elems:
                        continue
                    title = title_elems[0].text.strip()
                    summary = teaser_elems[0].text.strip()

                    links = article.find_elements(By.TAG_NAME, "a")
                    link = ""
                    for a in links:
                        href = a.get_attribute("href")
                        # Presumably "/202" matches the year segment of a
                        # full article URL (e.g. ".../2024/...").
                        if href and "/202" in href:
                            link = href
                            break
                    if not link:
                        # Fall back to the first anchor, or a placeholder.
                        link = links[0].get_attribute("href") if links else "#"

                    results.append({"title": title, "link": link, "summary": summary})
                except Exception as e:
                    # Best-effort: one broken entry must not abort the scrape.
                    print(f"NPR: Skipped one article due to: {e}")
                    continue

            if len(results) == collected_before:
                # This page contributed nothing (no results left, or none
                # usable); stop instead of requesting pages forever.
                break
            page += 1
    finally:
        # Always release the browser, even if navigation raised.
        driver.quit()
    return results

# ------------------ Main Execution ------------------ #
# Interactive entry point: scrape both sources for a keyword, summarize and
# score every teaser, print an overview, then optionally show one article.
if __name__ == "__main__":
    keyword = input("Enter a keyword to search: ").strip()

    # Re-prompt until the article count parses as an integer instead of
    # crashing with ValueError on a typo.
    while True:
        raw_count = input("How many total NPR articles would you like to scrape? ").strip()
        try:
            max_articles = int(raw_count)
        except ValueError:
            print("❌ Please enter a whole number.")
        else:
            break

    reuters_data = scrape_reuters_bs(keyword)
    npr_data = scrape_npr_selenium(keyword, max_articles)
    all_articles = reuters_data + npr_data

    print(f"\n🔎 Articles scraped: Reuters = {len(reuters_data)}, NPR = {len(npr_data)}")

    enhanced_articles = []
    for article in all_articles:
        try:
            bart_summary = summarize_with_bart(article['summary'])
            sentiment = analyze_sentiment(article['summary'])
            enhanced_articles.append({
                'Title': article['title'],
                'Summary': bart_summary,
                'Sentiment': sentiment,
                # Keep the original scraped teaser for the detail view; the
                # condensed BART version is already stored under 'Summary'.
                'Read More': article['summary'],
                'Link': article['link']
            })
        except Exception as e:
            # Best-effort: skip an article that cannot be enhanced.
            print(f"Failed to enhance article: {e}")
            continue

    print(f"\n✅ Successfully enhanced: {len(enhanced_articles)} articles")

    print("\n🔍 Bullet Point Summary:")
    for idx, article in enumerate(enhanced_articles, start=1):
        print(f"{idx}. {article['Summary']} ({article['Sentiment']})")

    print("\n📊 Sentiment Breakdown:")
    sentiment_counts = Counter(a['Sentiment'] for a in enhanced_articles)
    total = sum(sentiment_counts.values())
    # With no articles the Counter is empty, so the division never runs.
    for sentiment, count in sentiment_counts.items():
        print(f"- {sentiment}: {count} ({(count/total)*100:.1f}%)")

    view_more = input("\nEnter the number of the article you want to read more about (or press Enter to skip): ").strip()
    if view_more.isdigit():
        index = int(view_more) - 1  # the list shown to the user is 1-based
        if 0 <= index < len(enhanced_articles):
            selected = enhanced_articles[index]
            print(f"\n📝 Title: {selected['Title']}")
            print(f"📚 Full Summary: {selected['Read More']}")
            print(f"🔗 Link: {selected['Link']}")
        else:
            print("❌ Invalid article number.")
    else:
        print("⏭ Skipped reading more.")