# Selenium-Based Media Article Downloader

## Package installation and imports

In [None]:
# !pip install selenium
# !pip install requests
# !pip install beautifulsoup4

In [87]:
import os
import time
from os.path import exists

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
  ElementNotInteractableException,
  NoSuchElementException,
  WebDriverException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service

## Initializing the Firefox-based WebDriver

In [None]:
# Start a Firefox session through the geckodriver binary under ./webdrivers.
# `driver` is the shared browser instance that the later scraping cells use.
# NOTE(review): the '.exe' path looks Windows-specific — confirm on other platforms.
service = Service(executable_path='./webdrivers/geckodriver.exe')
driver = webdriver.Firefox(service=service)
# Load a page right away as a quick check that the browser is responding.
driver.get('https://google.com')

In [128]:
# Root folder for saved articles; each outlet gets its own subdirectory.
DOWNLOAD_DIRECTORY = './downloaded_pages'
NYT_SUBDIR = 'nyt'
FOX_SUBDIR = 'fox'

# Outlet identifiers as they appear in the 'media_name' column of the URL table.
NYT_MEDIA_NAME = 'nytimes.com'
FOX_MEDIA_NAME = 'foxnews.com'


# Dispatch table: media name -> scraping workflow, called as workflow(driver, mc_id).
# The workflow functions are defined in a later cell, so each entry is a thin
# lambda that looks the function up at call time. Referencing the functions
# directly here raises NameError when this cell runs on a fresh kernel.
# The cell defining the workflows must still be run before any entry is called.
media_to_workflow_fn = {
  NYT_MEDIA_NAME: lambda driver, mc_id: nyt_workflow(driver, mc_id),
  FOX_MEDIA_NAME: lambda driver, mc_id: fox_workflow(driver, mc_id),
}
# Media name -> subdirectory of DOWNLOAD_DIRECTORY where its articles are stored.
media_to_subdir = {
  NYT_MEDIA_NAME: NYT_SUBDIR,
  FOX_MEDIA_NAME: FOX_SUBDIR,
}

## Loading the list of article URLs to download

In [97]:
# Load the CSV listing the articles to download. Later cells read its
# 'media_name', 'url' and 'id' columns.
url_df = pd.read_csv('urls_to_download/mc-onlinenews-mediacloud-20250401214806-content.csv')
url_df

Unnamed: 0,id,indexed_date,language,media_name,media_url,publish_date,title,url
0,fadfc071c868ad01c365cb7de573db69d3f5d2f32e9d6f...,2025-04-01 16:24:01.274075+00:00,en,usatoday.com,usatoday.com,2025-03-31,Trump travel ban delayed as U.S. continues to ...,https://www.usatoday.com/story/news/world/2025...
1,4d283d47a0d750ac78005d03ad66f95815f1e911d3f703...,2025-04-01 11:28:36.696584+00:00,en,foxnews.com,foxnews.com,2025-03-31,Tufts University student arrest generates 10x ...,https://www.foxnews.com/media/tufts-university...
2,a297fb195e1a425f725cc5497bfb2eda707698066c1baf...,2025-04-01 09:31:02.093599+00:00,en,foxnews.com,foxnews.com,2025-03-31,Colleges in ICE's deportation crosshairs shell...,https://www.foxnews.com/us/colleges-ices-depor...
3,ada6aa55580cb8baa8e053a4a7ecd3d070e3a5295bbae6...,2025-03-31 19:26:54.691426+00:00,en,cbsnews.com,cbsnews.com,2025-03-31,Trump's crackdown on students with visas and g...,https://www.cbsnews.com/miami/news/student-pro...
4,ade288fd5cf6d4cd92321b925feca26780096311964552...,2025-03-31 19:19:47.456609+00:00,en,cbsnews.com,cbsnews.com,2025-03-31,Trump's crackdown on students with visas and g...,https://www.cbsnews.com/sanfrancisco/news/stud...
...,...,...,...,...,...,...,...,...
125,9b096f649487b786ab5a71acb57f37a14a511787513d9a...,2025-03-26 18:27:57.896481+00:00,en,cbsnews.com,cbsnews.com,2025-03-26,Tufts University graduate student taken into c...,https://www.cbsnews.com/news/tufts-university-...
126,2d13d836eb0b1c348baa49c4d0abd2aeedd723e2dc3804...,2025-03-26 18:23:21.822574+00:00,en,independent.co.uk,independent.co.uk,2025-03-26,Trump news today live: Press secretary claims ...,https://www.independent.co.uk/news/world/ameri...
127,658033065e264846f5801e6fc2bdfb7f6efb3755599233...,2025-03-26 16:23:58.391051+00:00,en,independent.co.uk,independent.co.uk,2025-03-26,ICE arrests Tufts University doctoral student ...,https://www.independent.co.uk/news/world/ameri...
128,27af17de144d52f0817b9e15cfb6c74cf1499b39f72581...,2025-03-26 16:21:34.960777+00:00,en,nbcnews.com,nbcnews.com,2025-03-26,Federal immigration authorities detain interna...,https://www.nbcnews.com/news/us-news/federal-i...


In [121]:
def get_url_rows_for_media_name(df, media_name):
  '''
  Select the rows of the URL table that belong to one supported media outlet.

  :param df: DataFrame with at least 'media_name' and 'url' columns.
  :param media_name: Outlet to select (NYT_MEDIA_NAME or FOX_MEDIA_NAME).
  :return: DataFrame of matching rows. For an unsupported outlet an empty
    DataFrame with the same columns is returned, so the result type is always
    the same and len(result) == 0 still signals "nothing to do".
  '''
  if media_name == NYT_MEDIA_NAME:
    # Video pages are excluded for this outlet. na=False treats a missing URL
    # as "not a video" so the mask stays boolean and can be inverted;
    # regex=False because '/video/' is a literal substring.
    is_video = df['url'].str.contains('/video/', regex=False, na=False)
    return df[(df['media_name'] == NYT_MEDIA_NAME) & ~is_video]
  if media_name == FOX_MEDIA_NAME:
    return df[df['media_name'] == FOX_MEDIA_NAME]
  # Unsupported outlet: empty slice keeps the DataFrame type and columns.
  return df.iloc[0:0]

def workflow_for_urls(urls, media_name, delay_seconds=5):
  '''
  Visit every article in `urls` with the shared browser and run the outlet's
  scraping workflow on it, skipping articles that were already saved.

  Uses the module-level `driver` created in the initialization cell.

  :param urls: DataFrame of articles with 'url' and 'id' columns.
  :param media_name: Outlet key into media_to_subdir / media_to_workflow_fn;
    an unknown key raises KeyError.
  :param delay_seconds: Pause after each page visit, in seconds (default 5,
    the previously hard-coded value).
  '''
  subdir = media_to_subdir[media_name]
  workflow = media_to_workflow_fn[media_name]
  for _, data in urls.iterrows():
    url = data['url']
    idee = data['id']

    # Do not download and read duplicates
    if exists(f'{DOWNLOAD_DIRECTORY}/{subdir}/{idee}.txt'):
      print(f'article id {idee} already exists')
      continue

    # A single page that fails to load or scrape should not abort the whole
    # batch: report it and move on. Nothing is saved for it, so a later run
    # will try it again.
    try:
      driver.get(url)
      workflow(driver, idee)
    except WebDriverException as err:
      print(f'failed to download article id {idee} ({url}): {err.msg}')

    # Throttle requests between page visits.
    time.sleep(delay_seconds)

In [129]:
# Download every Fox News article listed in the URL table; nothing happens
# when the table has no rows for that outlet.
url_rows = get_url_rows_for_media_name(url_df, FOX_MEDIA_NAME)
if len(url_rows) != 0:
  workflow_for_urls(url_rows, FOX_MEDIA_NAME)


In [127]:
def _collect_paragraph_html(driver, xpath):
  '''Return the innerHTML of every element matching `xpath` on the current page, in document order.'''
  # get_attribute can return None; normalise to '' so the result is all strings.
  return [el.get_attribute('innerHTML') or '' for el in driver.find_elements(By.XPATH, xpath)]

def _save_article(paragraphs, subdir, mc_id):
  '''
  Write `paragraphs` to DOWNLOAD_DIRECTORY/<subdir>/<mc_id>.txt, one per line.

  Nothing is written when `paragraphs` is empty. An empty file would make the
  "already exists" check treat a failed scrape as a finished download and
  never retry it.

  :return: True if a file was written, False otherwise.
  '''
  if not paragraphs:
    print(f'No article text found for article id {mc_id}; nothing written')
    return False

  target_dir = os.path.join(DOWNLOAD_DIRECTORY, subdir)
  # Create the output folder on first use instead of failing with FileNotFoundError.
  os.makedirs(target_dir, exist_ok=True)
  # Explicit UTF-8 so the result does not depend on the platform's default encoding.
  with open(os.path.join(target_dir, f'{mc_id}.txt'), 'w', encoding='utf-8') as f:
    # Newline-separated so paragraph boundaries survive in the saved file.
    f.write('\n'.join(paragraphs))
  return True

def nyt_workflow(driver, mc_id):
  '''
  Save the body of the New York Times article currently loaded in `driver`.

  :param driver: Selenium WebDriver already pointed at the article page.
  :param mc_id: Article id, used as the output file name.
  '''
  # Click terms and conditions update, if the dialogue is shown
  try:
    updated_terms_continue_btn = driver.find_element(By.XPATH, '//div[@data-campaign="MX_COMPLY_TERMS_OF_SERVICE_237"]//button')
    updated_terms_continue_btn.click()
  except NoSuchElementException:
    print('Could not find terms agreement dialogue')
  except ElementNotInteractableException:
    print('Terms agreement dialogue found but its button is not interactable')

  # Collect article text and write it to file
  paragraphs = _collect_paragraph_html(driver, '//section[@name="articleBody"]//p')
  _save_article(paragraphs, NYT_SUBDIR, mc_id)

def fox_workflow(driver, mc_id):
  '''
  Save the body of the Fox News article currently loaded in `driver`.

  :param driver: Selenium WebDriver already pointed at the article page.
  :param mc_id: Article id, used as the output file name.
  '''
  paragraphs = _collect_paragraph_html(driver, '//div[@class="paywall"]//p')
  _save_article(paragraphs, FOX_SUBDIR, mc_id)

In [108]:
# Manual check of the Fox workflow against whatever page the driver currently
# has loaded; the result is saved under the article id 'test'.
# Uncomment the next line to load a sample article first.
# driver.get('https://www.foxnews.com/media/tufts-university-student-arrest-generates-10x-more-coverage-from-legacy-news-than-ms-13-gang-leader-study')
fox_workflow(driver, 'test')

<strong>FIRST ON FOX</strong> – Legacy media outlets offered 10 times more coverage of a Turkish student who was arrested for allegedly <a href="https://www.foxnews.com/category/world/terrorism" target="_blank" rel="noopener">supporting terrorism</a> than the capture of an alleged MS-13 gang leader, according to a Media Research Center (MRC) study.&nbsp;
The FBI announced on Thursday that U.S. authorities captured the MS-13 top leader for the U.S. East Coast, 24-year-old Salvadoran national Henrry Josue Villatoro Santos. The news of the MS-13 boss being arrested came one day after Immigration and Customs Enforcement (ICE) detained Tufts University graduate student <a href="https://www.foxnews.com/us/video-shows-arrest-tufts-university-student-allegedly-supporting-hamas" target="_blank" rel="noopener">Rumeysa Ozturk</a> for allegedly supporting the Hamas terror group.&nbsp;
The conservative MRC tracked coverage of the arrests from March 26-27 and found that ABC, CBS, NBC, CNN and MSNBC 

In [None]:
# Function definitions

def scrape_body(url, timeout=30):
  '''
  Given a web url, issue an http GET request to get the entire HTML body of the
  page for later parsing.

  The HTTP status is not checked, so error or bot-challenge pages are returned
  as-is; callers should inspect the text if that matters.

  :param url: The url to fetch data from.
  :param timeout: Seconds to wait for the server before giving up. Without a
    timeout a stalled connection would block the notebook indefinitely.
  :return: The response body decoded as text.
  '''
  headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    # 'br' is deliberately not advertised: requests only decodes Brotli when an
    # optional brotli package is installed, otherwise the body comes back as
    # undecoded bytes.
    'Accept-Encoding': 'gzip, deflate',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
  }
  body = requests.get(url, headers=headers, timeout=timeout)
  return body.text

def parseNYTText(body, paragraph_class='css-axufdj'):
  '''
  Use the BeautifulSoup library (a markup parser) to pull out the main story text
  from a New York Times article.

  :param body: Raw HTML text to feed into BeautifulSoup for further parsing.
  :param paragraph_class: CSS class that marks the story paragraphs. Exposed as
    a parameter so it can be changed without editing this function.
  :return: The text of every matching element, each followed by a newline
    ('' when nothing matches).
  '''
  # Name the parser explicitly: letting BeautifulSoup guess emits a warning and
  # can give different trees depending on which parsers are installed.
  soup = BeautifulSoup(body, 'html.parser')
  return ''.join(p.text + '\n' for p in soup.find_all(attrs={'class': paragraph_class}))


In [37]:
# Fetch one article over plain HTTP and run the NYT parser on it, printing both
# the raw HTML and the extracted text for comparison.
# NOTE(review): test_url is not defined in any cell visible here — define it
# before running this cell.
test_body = scrape_body(test_url)
parsed_text = parseNYTText(test_body)

print(test_body)

print(parsed_text)

<html lang="en"><head><title>nytimes.com</title><style>#cmsg{animation: A 1.5s;}@keyframes A{0%{opacity:0;}99%{opacity:0;}100%{opacity:1;}}</style></head><body style="margin:0"><p id="cmsg">Please enable JS and disable any ad blocker</p><script data-cfasync="false">var dd={'rt':'c','cid':'AHrlqAAAAAMAxRVu0J8Wbg8ALVzlyA==','hsh':'499AE34129FA4E4FABC31582C3075D','t':'fe','qp':'','s':17439,'e':'2b35d6c9042fdbd0a811b427897978ef8bff827f3ecde7f3fd5d251c18bd2b48','host':'geo.captcha-delivery.com','cookie':'~~Gy6l1hezJ2xp2km~2hA6CYuiAom1t64Av_KsAWKDhvDVLbxehXS_wbjc_Cw4VsLfoMrJG2c0xxZ695dyrOB3rban3ZXeYWgPlqZE2oPhtuNyx1WXkUZ342y4wHnMdA'}</script><script data-cfasync="false" src="https://ct.captcha-delivery.com/c.js"></script></body></html>

