In [6]:

# beautifulsoup4 provides the `bs4` module that the scraping cell imports.
%pip install requests pandas beautifulsoup4


Note: you may need to restart the kernel to use updated packages.


In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_marketwatch_news() -> list:
    """Scrape article records from the MarketWatch latest-news page.

    The page is fetched with a browser-like User-Agent, parsed with
    BeautifulSoup's ``html.parser``, and one record is built for every
    ``<div class="article__content">`` element found.

    Returns:
        list: One dict per article with the keys ``'title'``,
        ``'description'``, ``'url'`` and ``'publishedAt'``. Pieces that are
        missing from the markup fall back to ``"No Title"``,
        ``"No Description"``, ``None`` and ``"No Date"`` respectively.
        An empty list is returned when the server answers with a
        non-success HTTP status or when no article containers are present.

    Note:
        Exceptions raised by ``requests.get`` itself (including the timeout)
        are not caught here and propagate to the caller.
    """
    page_url = "https://www.marketwatch.com/latest-news"
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(page_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    if not response.ok:
        # Report blocked/failed requests instead of silently parsing an error page.
        print(f"MarketWatch request failed with HTTP status {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    news_list = []
    for article in soup.find_all('div', {'class': 'article__content'}):
        try:
            # Look each element up once and reuse the result.
            title_tag = article.find('a')
            summary_tag = article.find('p')
            time_tag = article.find('time')

            news_list.append({
                'title': title_tag.text.strip() if title_tag else "No Title",
                'description': summary_tag.text.strip() if summary_tag else "No Description",
                # .get() keeps the article when the attribute is absent,
                # where item access would raise KeyError and drop it.
                'url': title_tag.get('href') if title_tag else None,
                'publishedAt': time_tag.get('datetime', "No Date") if time_tag else "No Date",
            })
        except (AttributeError, KeyError, TypeError) as e:
            # Best effort: one malformed article must not abort the whole scrape.
            print(f"Error processing article: {e}")

    return news_list

def scrape_cnbc_news() -> list:
    """Scrape article records from the CNBC world-news page.

    The page is fetched with a browser-like User-Agent, parsed with
    BeautifulSoup's ``html.parser``, and one record is built for every
    ``<div class="Card-standardBreakerCard">`` element found.

    Returns:
        list: One dict per article with the keys ``'title'``,
        ``'description'``, ``'url'`` and ``'publishedAt'``. Pieces that are
        missing from the markup fall back to ``"No Title"``,
        ``"No Description"``, ``None`` and ``"No Date"`` respectively.
        An empty list is returned when the server answers with a
        non-success HTTP status or when no card containers are present.

    Note:
        Exceptions raised by ``requests.get`` itself (including the timeout)
        are not caught here and propagate to the caller.
    """
    page_url = "https://www.cnbc.com/world/?region=world"
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(page_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    if not response.ok:
        # Report blocked/failed requests instead of silently parsing an error page.
        print(f"CNBC request failed with HTTP status {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    news_list = []
    for article in soup.find_all('div', {'class': 'Card-standardBreakerCard'}):
        try:
            # Look each element up once and reuse the result.
            title_tag = article.find('a', {'class': 'Card-title'})
            # NOTE(review): the recorded run of this notebook shows every row
            # using the "No Description"/"No Date" placeholders, so these two
            # lookups may not match CNBC's current card markup - confirm.
            description_tag = article.find('div', {'class': 'Card-description'})
            time_tag = article.find('time')

            news_list.append({
                'title': title_tag.text.strip() if title_tag else "No Title",
                'description': description_tag.text.strip() if description_tag else "No Description",
                # .get() keeps the article when the attribute is absent,
                # where item access would raise KeyError and drop it.
                'url': title_tag.get('href') if title_tag else None,
                'publishedAt': time_tag.get('datetime', "No Date") if time_tag else "No Date",
            })
        except (AttributeError, KeyError, TypeError) as e:
            # Best effort: one malformed card must not abort the whole scrape.
            print(f"Error processing article: {e}")

    return news_list

# Scrape news from multiple sources
OUTPUT_CSV = 'multiple_sources_market_news.csv'
NEWS_SOURCES = [
    ("MarketWatch", scrape_marketwatch_news),
    ("CNBC", scrape_cnbc_news),
]

all_news = []
for position, (source_name, scrape_source) in enumerate(NEWS_SOURCES):
    if position:
        # Pause between consecutive requests only; a sleep after the final
        # source would just delay the cell.
        time.sleep(2)
    print(f"Scraping {source_name} news...")
    all_news.extend(scrape_source())

# Convert to DataFrame and save to CSV
if all_news:
    news_df = pd.DataFrame(all_news)
    news_df.to_csv(OUTPUT_CSV, index=False)
    print(f"Scraping completed and data saved to '{OUTPUT_CSV}'.")
    # Print the first few rows to verify, reusing the frame built above
    # instead of constructing a second DataFrame.
    print(news_df.head())
else:
    print("No news articles found.")


Scraping MarketWatch news...
Scraping CNBC news...
Scraping completed and data saved to 'multiple_sources_market_news.csv'.
                                               title     description  \
0  Earnings playbook: Your guide to the busiest w...  No Description   
1  China just expanded its efforts to boost consu...  No Description   
2  Apple and Microsoft are among the 'Magnificent...  No Description   
3  Stocks soar, Dow closes 650 points higher buoy...  No Description   
4  'Rate cut winners': Barclays names global stoc...  No Description   

                                                 url publishedAt  
0  https://www.cnbc.com/2024/07/28/earnings-playb...     No Date  
1  https://www.cnbc.com/2024/07/28/china-just-exp...     No Date  
2  https://www.cnbc.com/2024/07/26/most-magnifice...     No Date  
3  https://www.cnbc.com/2024/07/25/stock-market-t...     No Date  
4  https://www.cnbc.com/2024/07/29/rate-cut-winne...     No Date  
