In [1]:
!pip install requests bs4  pandas



In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

all_urls = []
url = 'https://www.cnn.com'

try:
    # Bound the wait and surface HTTP errors instead of parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
except requests.RequestException as e:
    print(f"Error fetching {url}: {e}")
else:
    soup = BeautifulSoup(response.text, 'html.parser')

    seen_urls = set()  # constant-time duplicate check; all_urls keeps first-seen order
    for a in soup.find_all('a', href=True):   # retrieving all the URLs on the CNN front page
        full_url = urljoin(url, a['href'])  # urljoin handles both relative and absolute URLs
        if full_url not in seen_urls:  # Avoid duplicate URLs
            seen_urls.add(full_url)
            all_urls.append(full_url)

print(all_urls)


In [7]:
#importing the required libraries
import csv
import uuid
from datetime import date
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from textblob import TextBlob

def url_is_article_cnn(url, current_year='2024'):   #function to check if the URL is a CNN article
    """Return True when *url* is a CNN article link for *current_year*.

    A URL qualifies when its host is cnn.com or a subdomain of it, its path
    starts with ``/<current_year>/`` and ``/gallery/`` does not occur in it.
    The host and path are checked on the parsed URL, so a link to another site
    that merely carries a CNN address in its query string is rejected.

    Args:
        url: Absolute URL to test; None or an empty string yields False.
        current_year: Year segment the article path must start with.

    Returns:
        bool: always a real boolean, never the falsy input itself.
    """
    if not url:
        return False
    try:
        parts = urlparse(url)
        host = (parts.hostname or '').lower()
    except ValueError:
        # urlparse rejects some malformed URLs; such a link is not an article.
        return False
    if host != 'cnn.com' and not host.endswith('.cnn.com'):
        return False
    return parts.path.startswith(f'/{current_year}/') and '/gallery/' not in url

def url_is_article_indiatoday(url):   #function to check if the URL is an India Today article
    """Return True when *url* is an India Today story link.

    The URL must start with ``https://www.indiatoday.in/`` and contain
    ``/story/``. Anchoring the site prefix at the start of the URL rejects
    links to other sites that only embed an India Today address.

    Args:
        url: Absolute URL to test; None or an empty string yields False.

    Returns:
        bool: always a real boolean, never the falsy input itself.
    """
    if not url:
        return False
    return url.startswith('https://www.indiatoday.in/') and '/story/' in url

def extract_article_urls(page_url, is_cnn=True, current_year='2024'): #this extracts article URLs from the webpage
    """Collect the unique article links found on a listing page.

    Args:
        page_url: URL of the page to download; also the base against which
            relative links are resolved.
        is_cnn: True keeps links accepted by url_is_article_cnn, False keeps
            links accepted by url_is_article_indiatoday.
        current_year: Forwarded to url_is_article_cnn; unused when is_cnn is False.

    Returns:
        list: absolute article URLs in the order they first appear on the
        page, without duplicates. Empty when the page cannot be fetched (the
        error is printed rather than raised).
    """
    try:
        response = requests.get(page_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {page_url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Pick the site-specific filter once instead of branching for every link.
    if is_cnn:
        def is_article(candidate):
            return url_is_article_cnn(candidate, current_year)
    else:
        is_article = url_is_article_indiatoday

    article_urls = []
    seen = set()  # constant-time duplicate check; the list preserves page order
    for link in soup.find_all('a', href=True):
        full_url = urljoin(page_url, link['href'])
        if full_url in seen:
            continue
        seen.add(full_url)
        if is_article(full_url):
            article_urls.append(full_url)

    return article_urls

def _fetch_soup(article_url):
    """Download *article_url* and parse it; print the error and return None if the request fails."""
    try:
        response = requests.get(article_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {article_url}: {e}")
        return None
    return BeautifulSoup(response.text, 'html.parser')


def _clean_text(tag):
    """Return the text inside *tag* with every run of whitespace collapsed to one space."""
    # get_text(strip=True) strips each text fragment separately and joins the
    # fragments with no separator, which fuses the words on either side of an
    # inline link or emphasis tag. Splitting on whitespace keeps them apart.
    return ' '.join(tag.get_text().split())


def _first_sentences(text, limit=2):
    """Return the first *limit* sentences of *text*, using '. ' as the sentence boundary."""
    parts = text.split('. ')
    if len(parts) <= limit:
        # Nothing is cut off, so the text already carries its own ending;
        # appending another period here would yield "A. B..".
        return text
    # Every kept piece was followed by '. ' in the source, so restore the final period.
    return '. '.join(parts[:limit]) + '.'


def _extract_summary(soup, container_class):
    """Build a short summary from the article body, falling back to the page's meta description."""
    body_text = ''
    container = soup.find('div', class_=container_class)
    if container is not None:
        paragraphs = (_clean_text(p) for p in container.find_all('p'))
        body_text = ' '.join(text for text in paragraphs if text)

    if not body_text:
        # Either the container is missing or it holds no paragraph text.
        summary_meta = (soup.find('meta', attrs={"name": "description"})
                        or soup.find('meta', attrs={"property": "og:description"}))
        body_text = summary_meta.get('content', 'No summary found') if summary_meta else 'No summary found'

    return _first_sentences(body_text)


def scrape_cnn_article(article_url):
    """Scrape one CNN article page.

    Args:
        article_url: Absolute URL of the article.

    Returns:
        dict: keys 'title', 'summary', 'publication_date', 'source' and 'url'.
        Missing pieces are replaced by the placeholders 'No title found',
        'No summary found' and 'No date found'. An empty dict is returned when
        the page cannot be downloaded (the error is printed).
    """
    soup = _fetch_soup(article_url)
    if soup is None:
        return {}

    title_tag = soup.find('h1')
    title = (_clean_text(title_tag) if title_tag else '') or 'No title found'

    summary = _extract_summary(soup, 'article__content-container')

    date_tag = soup.find('meta', {'property': 'article:published_time'})
    pub_date = date_tag.get('content', 'No date found') if date_tag else 'No date found'

    return {
        'title': title,
        'summary': summary,
        'publication_date': pub_date,
        'source': 'CNN',
        'url': article_url
    }


def scrape_india_today_article(article_url):
    """Scrape one India Today article page.

    Args:
        article_url: Absolute URL of the article.

    Returns:
        dict: same keys and placeholders as scrape_cnn_article, with 'source'
        set to 'India Today'. An empty dict is returned when the page cannot
        be downloaded (the error is printed).
    """
    soup = _fetch_soup(article_url)
    if soup is None:
        return {}

    title_tag = soup.find('h1')
    title = (_clean_text(title_tag) if title_tag else '') or 'No title found'

    # NOTE(review): this class string looks build-generated - confirm it still
    # matches the live markup; the meta description is used when it does not.
    summary = _extract_summary(soup, 'Story_description__fq_4S description paywall')

    # The 'jsx-...' prefix looks like a build hash, so fall back to the plain
    # 'strydate' class when the exact class string is not present.
    pub_date_tag = (soup.find('span', class_='jsx-ace90f4eca22afc7 strydate')
                    or soup.find('span', class_='strydate'))
    pub_date = (_clean_text(pub_date_tag) if pub_date_tag else '') or 'No date found'

    return {
        'title': title,
        'summary': summary,
        'publication_date': pub_date,
        'source': 'India Today',
        'url': article_url
    }

# Keyword table for categorize_article. Order matters: the first category with
# a matching keyword wins.
_CATEGORY_KEYWORDS = {
    'Politics': ['government', 'election', 'policy', 'parliament'],
    'Technology': ['technology', 'software', 'hardware', 'innovation'],
    'Sports': ['sports', 'tournament', 'player', 'match'],
    'Finance': ['economy', 'market', 'finance', 'stock'],
    'Health': ['health', 'medicine', 'disease', 'wellness'],
    'Science': ['science', 'research', 'discovery', 'experiment'],
    'Entertainment': ['entertainment', 'celebrity', 'movie', 'music'],
    'Environment': ['environment', 'climate', 'pollution', 'wildlife'],
    'Education': ['education', 'university', 'school', 'learning'],
    'Business': ['business', 'corporate', 'industry', 'startup'],
    'Travel': ['travel', 'tourism', 'destination', 'hotel'],
    'Food': ['food', 'cuisine', 'restaurant', 'recipe'],
    'Fashion': ['fashion', 'style', 'clothing', 'designer']
}


def categorize_article(text):
    """
    Categorizes an article based on its text content using keywords and TextBlob.

    The text is lower-cased and tokenized with TextBlob; the first category in
    _CATEGORY_KEYWORDS that has one of its keywords among the tokens is returned.
    Matching is on whole tokens, so 'market' does not match 'markets'.

    Args:
        text: Article text (the scraped summary); None or an empty string is allowed.

    Returns:
        str: the category name, or 'Other' when no keyword matches or there is
        no text to inspect.
    """
    if not text:
        return 'Other'

    # Tokenize once into a set so each keyword lookup is constant-time.
    words = {str(word) for word in TextBlob(text.lower()).words}

    for category, keywords in _CATEGORY_KEYWORDS.items():
        if not words.isdisjoint(keywords):
            return category

    return 'Other'  # none of the categories listed above matched

def scrape_and_save_articles(page_urls, output_csv, current_year='2024'):
    """Scrape the articles linked from *page_urls* and write their details to a CSV file.

    Each article URL is downloaded at most once per run, even when several
    pages link to it or an earlier attempt failed. Articles that cannot be
    fetched, or that have no publication date, are reported and left out.

    Args:
        page_urls: Listing pages to crawl. A page whose URL contains 'cnn.com'
            is filtered with the CNN rules, any other page with the India
            Today rules.
        output_csv: Path of the CSV file to create; an existing file is overwritten.
        current_year: Year passed to extract_article_urls for the CNN link filter.

    Returns:
        None. The rows go to *output_csv* with the columns id, title, summary,
        publication_date, source, url and category; id counts saved rows from 1.
    """
    seen_urls = set()
    id_counter = 1  # Initializing ID counter

    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        fieldnames = ['id', 'title', 'summary', 'publication_date', 'source', 'url', 'category']
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        for page_url in page_urls:
            is_cnn = 'cnn.com' in page_url
            article_urls = extract_article_urls(page_url, is_cnn, current_year)
            if not article_urls:
                print(f"No articles found or an error occurred while processing {page_url}.")
                continue

            for url in article_urls:
                if url in seen_urls:
                    continue  # Skip duplicates

                if 'cnn.com' in url:
                    scrape = scrape_cnn_article
                elif 'indiatoday.in' in url:
                    scrape = scrape_india_today_article
                else:
                    continue  # Skip URLs that are not from CNN or India Today

                # Record the URL before fetching so that a failed or undated
                # article is not downloaded again when another page links to it.
                seen_urls.add(url)

                article_info = scrape(url)
                if not article_info:
                    print(f"Skipped: {url}")
                    continue

                # Check if publication date is valid
                if article_info['publication_date'] == 'No date found':
                    print(f"Skipped due to missing date: {url}")
                    continue

                row = dict(
                    article_info,
                    id=id_counter,  # Assign ID
                    category=categorize_article(article_info['summary']),
                )
                writer.writerow(row)
                print(f"Saved: {url}")

                id_counter += 1  # Increment ID counter


if __name__ == "__main__":
    # Front pages to crawl for article links.
    page_urls = [
        'https://www.cnn.com/',
        'https://www.indiatoday.in/'
    ]
    output_csv = 'news_articles.csv'
    # The CNN link filter keeps only URLs whose path starts with the given
    # year, and its default is the literal '2024'. Passing today's year keeps
    # the run finding front-page articles after the calendar year changes.
    scrape_and_save_articles(page_urls, output_csv, current_year=str(date.today().year))
    print(f"Finished! Articles saved to {output_csv}")

Saved: https://www.cnn.com/2024/09/03/europe/ukraine-poltava-russia-attack-intl/index.html
Saved: https://www.cnn.com/2024/09/03/middleeast/hamas-israel-hostage-deal-raise-stakes-intl/index.html
Saved: https://www.cnn.com/2024/09/03/style/china-detains-artist-gao-brothers-mao-intl-hnk/index.html
Saved: https://www.cnn.com/2024/09/03/food/honey-deuce-us-open-cocktail-popularity/index.html
Saved: https://www.cnn.com/2024/09/03/health/chair-yoga-flexibility-stress-relief-wellness/index.html
Saved: https://www.cnn.com/2024/09/03/sport/morteza-mehrzadselakjani-sleep-paralympics-spt-intl/index.html
Saved: https://www.cnn.com/2024/09/03/asia/bangladesh-house-mirrors-detention-center-intl-hnk-dst/index.html
Saved: https://www.cnn.com/2024/09/03/world/video/bangladesh-hasina-regime-coren-live-09031aseg2-cnni-world-fast
Saved: https://www.cnn.com/2024/08/12/asia/bangladesh-quota-protests-student-martyr-intl-hnk-dst/index.html
Saved: https://www.cnn.com/2024/08/07/asia/muhammad-yunus-bangladesh-g