<a href="https://colab.research.google.com/github/Sumant-crty/Python-Data-Scraping-Portfolio/blob/main/Newsbroadcast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install newspaper3k lxml_html_clean

# --- Import Libraries ---
import csv
import html
import time
from datetime import datetime # Import datetime for current date
from urllib.parse import urljoin, urlparse

import pandas as pd # Import pandas for DataFrame operations
import requests
from bs4 import BeautifulSoup
from IPython.display import HTML, display # Import HTML/display for rendering web page content
from newspaper import Article

# --- Configuration ---
# NOTE(review): BASE_URL is not referenced anywhere in the visible code; the
# per-site URLs in NEWSPAPER_CONFIGS are what the main block iterates over.
BASE_URL = "https://timesofindia.indiatimes.com/"
# NOTE(review): OUTPUT_FILENAME is not referenced in the visible code either;
# confirm nothing else depends on it before removing.
OUTPUT_FILENAME = "toi_headlines.csv"
# CSV file the main block writes the combined rows of all newspapers to.
OUTPUT_ALL_HEADLINES_FILENAME = "all_newspaper_headlines.csv"
# File the main block writes the generated headline list (HTML) to.
HTML_OUTPUT_FILENAME = "headlines.html" # New: HTML output filename

# HTTP headers sent by get_article_links() when it downloads a listing page.
# CRUCIAL (per the original author): a realistic browser User-Agent is necessary.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# One entry per newspaper, keyed by the display name that ends up in the
# 'Source' column. 'url' is the page whose links are collected (and is also
# passed as the base URL); 'language' is forwarded to newspaper's Article().
NEWSPAPER_CONFIGS = {
    'Times of India': {'url': 'https://timesofindia.indiatimes.com/', 'language': 'en'},
    'The Hindu': {'url': 'https://www.thehindu.com/', 'language': 'en'},
    'Dainik Bhaskar': {'url': 'https://www.bhaskar.com/', 'language': 'hi'}
}

def get_article_links(url, base_url):
    """Download a page and return the unique same-site links found on it.

    Only two kinds of href are considered: absolute links starting with
    ``http`` whose host is the host of ``base_url`` (or one of its
    subdomains), and root-relative links (``/path``), which are resolved
    against ``base_url``. Links containing any of the excluded keywords, and
    links that are not at least six characters longer than ``base_url``, are
    dropped.

    Args:
        url: Address of the page to download.
        base_url: Site root used to resolve root-relative hrefs and to decide
            whether an absolute link belongs to the same site.

    Returns:
        A set of absolute URL strings. The set is empty when the page could
        not be fetched (the error is printed, not raised).
    """
    excluded_keywords = (
        'javascript:', '#', '.css', '.js', '.jpg', '.png', '.gif', '.pdf',
        'videos', 'photos', 'gallery', 'e-paper', 'epaper', 'subscribe', 'newsletter',
        'login', 'signin', 'register', 'terms-of-use', 'privacy-policy', 'contact-us', 'about-us',
        'advertise', 'careers', 'sitemap', 'authors', 'topics', 'tags', 'archives',
    )

    # Keep the try body to the network call: only request errors are handled.
    try:
        response = requests.get(url, headers=HEADERS, timeout=15)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Could not fetch {url}. Reason: {e}")
        return set()

    soup = BeautifulSoup(response.content, 'html.parser')

    base_host = (urlparse(base_url).hostname or '').lower()
    # Anything this short is the home page itself or a trivially short path.
    min_length = len(base_url) + 5

    article_links = set()
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']

        if href.startswith('http'):
            # Compare parsed hostnames instead of searching the whole href, so
            # a foreign URL that merely mentions our host in its path or query
            # string is not mistaken for an on-site link.
            try:
                link_host = (urlparse(href).hostname or '').lower()
            except ValueError:
                continue  # Malformed URL (e.g. broken IPv6 literal); skip it.
            same_site = link_host == base_host or link_host.endswith('.' + base_host)
            if not same_site:
                continue
            full_link = href
        elif href.startswith('/') and not href.startswith('//'):
            # Root-relative link: urljoin resolves it against the site origin
            # even when base_url itself carries a path.
            full_link = urljoin(base_url, href)
        else:
            continue

        if any(keyword in full_link for keyword in excluded_keywords):
            continue
        if len(full_link) > min_length:
            article_links.add(full_link)

    return article_links

def scrape_article_data(links, language, limit=10):
    """Collect title, publish date and URL for up to ``limit`` article links.

    Each link is downloaded and parsed with newspaper's ``Article``. A link
    that fails is reported with a warning and skipped, so one bad article
    never aborts the run.

    Args:
        links: Iterable of article URLs. When a set is passed, which links
            fall within the first ``limit`` is arbitrary because sets are
            unordered.
        language: Language code forwarded to ``Article(..., language=...)``.
        limit: Maximum number of links to process.

    Returns:
        A list of dicts with the keys ``"Title"``, ``"Publish_Date"`` and
        ``"URL"``. ``"Publish_Date"`` is the string form of the detected
        date, or an empty string when no date was detected.
    """
    scraped_data = []

    for link in list(links)[:limit]:
        try:
            # Use newspaper3k for intelligent article content extraction
            article = Article(link, language=language)
            article.download()
            article.parse()
        except Exception as e:
            # Deliberately broad: scraping is best-effort per article.
            print(f"WARNING: Failed to process article {link}. Skipping. Error: {e}")
        else:
            publish_date = article.publish_date
            scraped_data.append({
                "Title": article.title,
                # Avoid writing the literal text "None" for undated articles.
                "Publish_Date": "" if publish_date is None else str(publish_date),
                "URL": link
            })
        finally:
            # Short, polite delay between requests, on success and failure alike.
            time.sleep(1)

    return scraped_data

def save_to_csv(data, filename):
    """Write a list of dictionaries to ``filename`` as UTF-8 CSV.

    The header is the union of the keys of all rows, in first-seen order, so
    rows are allowed to carry different keys; a key missing from a row is
    written as an empty cell.

    Args:
        data: List of dicts, one per CSV row.
        filename: Path of the CSV file to create or overwrite.

    Returns:
        None. When ``data`` is empty a message is printed and no file is
        written.
    """
    if not data:
        print("No data to save.")
        return

    # dict.fromkeys de-duplicates while preserving first-seen order. Taking
    # the header from the first row only would make DictWriter raise
    # ValueError on any later row that has an extra key.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, restval='')
        writer.writeheader()  # Write the header row
        writer.writerows(data)  # Write all the data rows




# --- Main Execution Block ---
if __name__ == "__main__":
    # Scrape every configured newspaper, save the combined rows to CSV, then
    # render the headlines as an HTML list that is displayed and saved.
    all_scraped_data = []

    for newspaper_name, config in NEWSPAPER_CONFIGS.items():
        newspaper_url = config['url']
        newspaper_language = config['language']

        # 1. Get article links for the current newspaper
        article_links = get_article_links(newspaper_url, newspaper_url)
        if not article_links:
            print(f"No article links found for {newspaper_name}.")
            continue

        # 2. Scrape article data for the current newspaper, with language and limit
        newspaper_data = scrape_article_data(article_links, newspaper_language, limit=10)

        # 3. Add 'Source' column to each article
        for article in newspaper_data:
            article['Source'] = newspaper_name

        # 4. Accumulate all scraped data
        all_scraped_data.extend(newspaper_data)

    # Save the combined results to a single CSV
    save_to_csv(all_scraped_data, OUTPUT_ALL_HEADLINES_FILENAME)

    # Build the DataFrame from the in-memory rows instead of re-reading the
    # CSV: save_to_csv writes nothing when there is no data, so reading the
    # file back would either fail or silently pick up a stale earlier run.
    df_all_headlines = pd.DataFrame(
        all_scraped_data, columns=["Title", "Publish_Date", "URL", "Source"]
    )

    # Display headlines in an HTML page with hyperlinks
    current_date = datetime.now().strftime('%d-%m-%Y')

    html_content_parts = [
        f"<h1>Latest News Headlines - {current_date}</h1>\n",
        "<ul>\n"
    ]

    for title, url in zip(df_all_headlines['Title'], df_all_headlines['URL']):
        # Titles and URLs come from scraped pages, so escape them before
        # embedding them in markup; fall back to the URL when no title exists.
        link_text = (str(title).strip() if title else "") or str(url)
        safe_url = html.escape(str(url), quote=True)
        safe_text = html.escape(link_text)
        html_content_parts.append(f"  <li><a href=\"{safe_url}\" target=\"_blank\">{safe_text}</a></li>\n")

    html_content_parts.append("</ul>")

    new_html_content = "".join(html_content_parts)
    display(HTML(new_html_content))

    # New: Save the HTML content to a file
    with open(HTML_OUTPUT_FILENAME, 'w', encoding='utf-8') as f:
        f.write(new_html_content)

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.3-py3-none-any.whl.metadata (2.3 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspaper3k)
  Downlo