# MSADS509 Final Project M4 UE Wang

## Importing Libraries

In [1]:
import datetime
import random
import time
from collections import defaultdict
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Scraping Political Data from CNN and Fox News

In [2]:

current_year = datetime.datetime.now().year

source = {'cnn': "https://www.cnn.com/politics",
          'foxnews': "https://www.foxnews.com/politics"}

# Seconds to wait on a server before giving up; without a timeout a stalled
# connection would block the cell indefinitely.
REQUEST_TIMEOUT = 30


def fetch_html(url):
    """Return the raw response body for `url`, or None if the request fails.

    Connection errors, timeouts and HTTP error statuses are reported and
    converted to None so that one bad page does not abort the whole scrape
    and error pages are never stored as article text.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {url}: {exc}")
        return None
    return response.content


def is_article_url(source_name, url):
    """Return True when `url` passes the article filter for `source_name`.

    All sources require "/politics/" in the URL and reject "/gallery/" links.
    CNN links must additionally contain 'cnn.com/<current_year>/'; Fox News
    links must not contain "/category/".
    """
    if "/politics/" not in url or "/gallery/" in url:
        return False
    if source_name == 'cnn':
        return f"cnn.com/{current_year}/" in url
    if source_name == 'foxnews':
        return "/category/" not in url
    return False


news_pages = defaultdict(list)  # source name -> list of {'url', 'content'} dicts

for source_name, source_page in source.items():

    # Request the section page, then pause 5-15 seconds before continuing
    index_html = fetch_html(source_page)

    time.sleep(5 + 10 * random.random())

    if index_html is None:
        continue

    soup = BeautifulSoup(index_html, 'html.parser')

    # Convert relative hrefs to absolute URLs. dict.fromkeys keeps first-seen
    # order while dropping repeated links, so each article is downloaded once
    # instead of being fetched repeatedly and de-duplicated afterwards.
    candidate_urls = dict.fromkeys(
        urljoin(source_page, link['href'])
        for link in soup.find_all('a', href=True)
    )

    for full_url in candidate_urls:

        if not is_article_url(source_name, full_url):
            continue

        # Fetch the news content; failed downloads are skipped
        article_html = fetch_html(full_url)
        if article_html is None:
            continue

        content_soup = BeautifulSoup(article_html, 'html.parser')
        content = content_soup.get_text(separator=' ', strip=True)

        news_pages[source_name].append({'url': full_url, 'content': content})

# Create a DataFrame with one row per scraped article

df = pd.DataFrame(
    [(source_name, item['url'], item['content'])
     for source_name, items in news_pages.items()
     for item in items],
    columns=['source', 'url', 'content'],
).drop_duplicates()

df.head()


Unnamed: 0,source,url,content
0,cnn,https://www.cnn.com/2024/02/04/politics/us-dam...,US destroyed or damaged 84 of 85 targets in Ir...
2,cnn,https://www.cnn.com/2024/02/04/politics/nikki-...,Nikki Haley walks back comment that Texas can ...
3,cnn,https://www.cnn.com/2024/02/04/politics/china-...,Trump suggests he would consider a tariff upwa...
4,cnn,https://www.cnn.com/2024/02/04/politics/us-ret...,Sullivan vows ‘further action’ after US carrie...
5,cnn,https://www.cnn.com/2024/02/03/politics/strike...,"US, UK carry out series of strikes against Hou..."


## News Counts for CNN and Fox News

In [5]:
# Number of rows in df for each value of the 'source' column
source_counts = df['source'].value_counts()

# Report each outlet's row count, falling back to 0 when a source is absent
for label, key in (("CNN rows:", 'cnn'), ("Fox News rows:", 'foxnews')):
    print(label, source_counts.get(key, 0))

CNN rows: 47
Fox News rows: 21


## Saving Results to Local Storage

In [4]:
# Build the destination from the current user's home directory instead of a
# hard-coded '/Users/UE' prefix, so the cell also runs on other machines.
output_path = Path.home() / 'Desktop' / 'MSADS509_news_project.csv'

# Create the target folder if it does not exist yet; to_csv would otherwise fail.
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame index out of the file
df.to_csv(output_path, index=False)