In [None]:
import requests
import string
import re 
import spacy
import yake
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup
from newspaper import Article
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import  LsaSummarizer
from sumy.utils import get_stop_words
from sklearn.feature_extraction.text import  TfidfVectorizer
from gtts import gTTS
from googletrans import Translator
nltk.download('punkt')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\surya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\surya\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
def summarizer(paragraph, sentence_count=3):
    parser = PlaintextParser.from_string(paragraph,Tokenizer('english'))

    lsa_summarizer = LsaSummarizer(stemmer=Stemmer('english'))
    lsa_summarizer.stop_words = get_stop_words('english')

    summary = lsa_summarizer(parser.document,sentence_count)

    return ' '.join([str(sentence) for sentence in summary])

In [4]:
def search_bbc_direct(company_name):
    """Query BBC's search page and extract news article URLs with exact phrase matching."""
    
    # BBC Search URLs (UK & Global)
    search_urls = [
        f'https://www.bbc.co.uk/search?q="{company_name}"',
        f'https://www.bbc.com/search?q="{company_name}"'
    ]

    headers = {"User-Agent": "Mozilla/5.0"}
    articles = set()  # Use a set to store unique URLs

    for url in search_urls:
        response = requests.get(url, headers=headers)
        
        # Check if request was successful
        if response.status_code != 200:
            print(f"Failed to retrieve data from: {url}")
            continue  # Move to the next URL if this one fails

        soup = BeautifulSoup(response.text, "html.parser")

        for link in soup.find_all("a", href=True):
            href = link["href"]
            if not href.startswith("http"):
                href = "https://www.bbc.com" + href
            
            # Extract only BBC News URLs containing "articles" or "business"
            if href.split('/')[-1].isalnum() and  ("articles" in href or "business" in href):
                articles.add(href)


        if len(articles) >= 10:  # Stop if we already have 10 articles
            break

    return list(articles)  # Return up to 10 unique links

In [5]:
def scrape_article(url):
    """Extracts full text from a news article URL"""
    article = Article(url)
    article.download()
    article.parse()
    return {
        "title": article.title,
        "content": article.text,
        "summary": summarizer(article.text),
        'url':url,
        'sentiment': sentiment_analysis(article.text),
        'topics': extract_topics(article.text)
    }

In [6]:
def sentiment_analysis(text):
    """Analyzes the sentiment of a given text."""
    sid = SentimentIntensityAnalyzer()
    sentiment_scores = sid.polarity_scores(text)
    if sentiment_scores["compound"] >= 0.05:
        sentiment = "Positive"
    elif sentiment_scores["compound"] <= -0.05:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    return sentiment
    

In [7]:
def preprocess(text):
    """Preprocess text: remove HTML tags, emojis, stopwords, and lemmatize."""
    
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove emojis using regex
    text = re.sub(r'[^\w\s,]', '', text)  # Keeps only words, spaces, and commas
    
    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)


In [16]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

def extract_topics(text, top_n=5):
    """Extracts key topics from news content using Named Entity Recognition (NER), Keyword Extraction (YAKE), and TF-IDF."""
    
    # Extract Named Entities using spaCy
    preprocess_text = preprocess(text)
    doc = nlp(preprocess_text)
    named_entities = [ent.text for ent in doc.ents if ent.label_ in ["ORG", "PERSON", "GPE", "EVENT"]]

    # Extract Keywords using YAKE (Now with Multi-word Phrases)
    kw_extractor = yake.KeywordExtractor(n=2, top=5)  # Extract top 5 multi-word keywords
    yake_keywords = [kw[0] for kw in kw_extractor.extract_keywords(preprocess_text)]

    # Extract Keywords using TF-IDF (For Deeper Context)
    def extract_tfidf_keywords(text, num_keywords=5):
        vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2))  # Allow unigrams and bigrams
        tfidf_matrix = vectorizer.fit_transform([text])
        feature_names = vectorizer.get_feature_names_out()
        scores = tfidf_matrix.toarray()[0]
        tfidf_keywords = sorted(zip(feature_names, scores), key=lambda x: x[1], reverse=True)
        return [kw[0] for kw in tfidf_keywords[:num_keywords]]

    tfidf_keywords = extract_tfidf_keywords(preprocess_text)

    # Clean & Merge Topics
    all_topics = set(named_entities + yake_keywords + tfidf_keywords)
    all_topics = [topic for topic in all_topics if topic.lower() not in string.punctuation]  # Remove punctuation

    return all_topics[:top_n]  # Return top N topics



In [9]:
def compare_coverage(sentiment_analysis):
    """Finds key differences in sentiment distribution and topic overlap."""
    
    positive_topics = set()
    negative_topics = set()

    for article in sentiment_analysis:
        if article["sentiment"] == "Positive":
            positive_topics.update(article["topics"])
        elif article["sentiment"] == "Negative":
            negative_topics.update(article["topics"])

    # Find common & unique topics
    common_topics = positive_topics.intersection(negative_topics)
    unique_positive = positive_topics - negative_topics
    unique_negative = negative_topics - positive_topics

    return {
        "common_topics": list(common_topics),
        "unique_positive_topics": list(unique_positive),
        
        "unique_negative_topics": list(unique_negative)
    }


In [10]:
def translate_text(text):
    """Translates text to English using Google Translate."""
    translator = Translator()
    translation = translator.translate(text, dest="hi",src="en")
    return translation.text

In [11]:
def text_to_speech(summary, filename="output.mp3"):
    """Convert the summary to Hindi speech and save as an audio file."""

    text = translate_text(summary)
    tts = gTTS(text=text, lang='hi')
    tts.save(filename)
    return filename



In [None]:
def generate_report_summary(sentiment_counts, topic_comparison):
    """Generates a natural language summary of sentiment trends and coverage differences."""

    # Step 1: Determine Overall Sentiment Trend
    total_articles = sum(sentiment_counts.values())
    most_common_sentiment = max(sentiment_counts, key=sentiment_counts.get)
    
    if sentiment_counts["Positive"] > sentiment_counts["Negative"]:
        trend_summary = f"The majority of news articles are positive, highlighting good news for the company. "
    elif sentiment_counts["Negative"] > sentiment_counts["Positive"]:
        trend_summary = f"Most news articles have a negative tone, indicating challenges faced by the company. "
    else:
        trend_summary = f"The news coverage is evenly distributed between positive and negative sentiments. "

    # Step 2: Compare Topic Coverage
    positive_topics = topic_comparison.get("unique_positive_topics", [])
    negative_topics = topic_comparison.get("unique_negative_topics", [])
    common_topics = topic_comparison.get("common_topics", [])

    topic_summary = "The key themes in the news coverage include: "
    if common_topics:
        topic_summary += f"Common topics across all articles include {', '.join(common_topics)}. "
    if positive_topics:
        topic_summary += f"Positive articles focus on {', '.join(positive_topics)}. "
    if negative_topics:
        topic_summary += f"Meanwhile, negative articles highlight concerns related to {', '.join(negative_topics)}."

    # Step 3: Generate Final Analysis
    final_summary = f"{trend_summary}{topic_summary} In summary, the company's latest news is mostly {most_common_sentiment}."

    return final_summary

In [13]:
def generate_comparative_report(articles):
    """Generates a structured comparative sentiment analysis report."""

    sentiment_counts = {"Positive": 0, "Negative": 0, "Neutral": 0}
    for article in articles:
        sentiment_counts[article["sentiment"]] += 1
    
    topic_comparison = compare_coverage(articles)

    summary = generate_summary(sentiment_counts, topic_comparison)  # Generate a natural language summary

    report = {
        "Sentiment Distribution": sentiment_counts,
        "Coverage Differences": topic_comparison,
        "Final Sentiment Analysis": summary,
        "Audio": text_to_speech(summary)  # Convert summary to Hindi speech

    }

    return report


In [14]:
def main(company_name):
    """Main function to fetch news articles, analyze sentiment, and generate a comparative report."""
    
    # Step 1: Search for news articles
    article_urls = search_bbc_direct(company_name)
    articles = [scrape_article(url) for url in article_urls]

    # Step 2: Generate a comparative report
    report = generate_comparative_report(articles)

    final_report = {
        'articles':articles,
        'report':report
    }

    return final_report

In [24]:
news = main('nvidia')

In [25]:
news

{'articles': [{'title': 'Powerful quantum computers in years not decades, says Microsoft',
   'content': 'Powerful quantum computers in years not decades, says Microsoft\n\n19 February 2025 Share Save Chris Vallance Senior Technology Reporter Share Save\n\nMicrosoft\n\nMicrosoft has unveiled a new chip called Majorana 1 that it says will enable the creation of quantum computers able to solve "meaningful, industrial-scale problems in years, not decades". It is the latest development in quantum computing - tech which uses principles of particle physics to create a new type of computer able to solve problems ordinary computers cannot. Creating quantum computers powerful enough to solve important real-world problems is very challenging - and some experts believe them to be decades away. Microsoft says this timetable can now be sped up because of the "transformative" progress it has made in developing the new chip involving a "topological conductor", based on a new material it has produced.

In [20]:
news_2 = [scrape_article(url) for url in news]

ArticleException: Article `download()` failed with No connection adapters were found for ':/articles' on URL :/articles

In [76]:
news_2

[{'title': 'No apology from councillor in Facebook comment row',
  'content': 'A councillor has refused to make a public apology for her behaviour and comments on Facebook which prompted a council to hire an independent investigator.\n\nTorbay Council member Katya Maddison said "shame on you" to the council\'s chief executive after a heated meeting in May 2024 and later wrote on Facebook that the council was "a sick institution".\n\nMaddison was ordered to apologise to other councillors and staff at a council meeting on Thursday after she was found to have breached the council\'s code of conduct.\n\nThe meeting saw Torbay Council approve a council tax increase of 4.75%.',
  'summary': 'A councillor has refused to make a public apology for her behaviour and comments on Facebook which prompted a council to hire an independent investigator. Torbay Council member Katya Maddison said "shame on you" to the council\'s chief executive after a heated meeting in May 2024 and later wrote on Faceb

In [35]:
news_2

{'title': 'Google agrees to pay $28m in racial bias lawsuit',
 'content': 'Google agrees to pay $28m in racial bias lawsuit\n\nThe settlement with thousands of people who have worked at the tech giant has received preliminary approval\n\nThe settlement has been given preliminary approval by Judge Charles Adams of the Santa Clara County Superior Court in California.\n\nThe case filed in 2021 by former Google employee, Ana Cantu, said workers from Hispanic, Latino, Native American and other backgrounds started on lower salaries and job levels than their white and Asian counterparts.\n\nThe technology giant confirmed it had "reached a resolution" but rejected the allegations made against it.\n\nGoogle has agreed to pay $28m (£21.5m) to settle a lawsuit that claimed white and Asian employees were given better pay and career opportunities than workers from other ethnic backgrounds, a law firm representing claimants says.\n\nThe case brought by Ms Cantu against Google relied on a leaked inte

In [30]:
summarized = summarizer(paragraph=news_2['content'])

In [32]:
summarized

'The case filed in 2021 by former Google employee, Ana Cantu, said workers from Hispanic, Latino, Native American and other backgrounds started on lower salaries and job levels than their white and Asian counterparts. The case brought by Ms Cantu against Google relied on a leaked internal document, which allegedly showed that employees from some ethnic backgrounds reported lower compensation for similar work. "We reached a resolution, but continue to disagree with the allegations that we treated anyone differently, and remain committed to paying, hiring, and levelling all employees fairly," a Google spokesperson told the BBC.'

In [40]:
sentiment_analysis(news_2['content'])

'Negative'

In [19]:
extract_topics(news_2[4]['content'])

NameError: name 'news_2' is not defined

In [81]:
compare_coverage(news_2)

{'common_topics': ['ACEA',
  'Save Getty',
  'Burbank',
  'BBC News Share Save',
  'Trump',
  "Elon Musk's",
  'Quilter Investors',
  'Cadillac',
  'Heartfelt'],
 'unique_positive_topics': ['Tommy Robinson',
  'Jay Nagley',
  'ground-breaking Model',
  'Prof Wells',
  'Steve Jobs',
  'cutting edge',
  'Ben Kilbey',
  'EV',
  'Tom Espiner Business',
  'china',
  'sales',
  'BYD',
  'tariffs',
  'Texas',
  "Donald Trump's",
  'Beijing'],
 'unique_negative_topics': ['Daniel Clarke-Pounder',
  'call snap',
  'ago Share',
  'recall',
  'Sergeant Rooney',
  'terrorism',
  'dealership',
  'Reuters',
  'UBS',
  'Cox Automotive',
  '000 trucks',
  'banker',
  'NHTSA',
  'car trim',
  'said',
  'Elon Musk',
  '20',
  'incident',
  'Musk',
  'letter',
  'bondi said',
  'China',
  'Belfast Tesla',
  'battle',
  'Tesla dealership',
  'central banker',
  '000',
  'musk',
  'Tesla wrote',
  'Carney',
  'belfast',
  'trim falling']}

In [97]:
generate_comparative_report(news_2)

{'Sentiment Distribution': {'Positive': 6, 'Negative': 10, 'Neutral': 0},
 'Coverage Differences': {'common_topics': ['ACEA',
   'Save Getty',
   'Burbank',
   'BBC News Share Save',
   'Trump',
   "Elon Musk's",
   'Quilter Investors',
   'Cadillac',
   'Heartfelt'],
  'unique_positive_topics': ['Tommy Robinson',
   'Jay Nagley',
   'ground-breaking Model',
   'Prof Wells',
   'Steve Jobs',
   'cutting edge',
   'Ben Kilbey',
   'EV',
   'Tom Espiner Business',
   'china',
   'sales',
   'BYD',
   'tariffs',
   'Texas',
   "Donald Trump's",
   'Beijing'],
  'unique_negative_topics': ['Daniel Clarke-Pounder',
   'call snap',
   'ago Share',
   'recall',
   'Sergeant Rooney',
   'terrorism',
   'dealership',
   'Reuters',
   'UBS',
   'Cox Automotive',
   '000 trucks',
   'banker',
   'NHTSA',
   'car trim',
   'said',
   'Elon Musk',
   '20',
   'incident',
   'Musk',
   'letter',
   'bondi said',
   'China',
   'Belfast Tesla',
   'battle',
   'Tesla dealership',
   'central banker',
