In [1]:
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from urllib.request import urlopen
import sys, time
import pandas as pd

In [2]:
def extract_news_from_jacobinmag(domain_name):
    """Scrape "media censorship" search results from Jacobin into a DataFrame.

    Walks the first three search-result pages, collects links whose path
    contains ``/2019/`` or ``/2020/``, then downloads every collected article
    and extracts its headline and paragraph text.

    Pages or articles that cannot be downloaded, or whose markup does not
    contain the expected elements, are reported with ``print`` and skipped
    instead of aborting the whole run.

    Parameters
    ----------
    domain_name : str
        Host to scrape, without scheme (``main`` passes ``"jacobinmag.com"``).

    Returns
    -------
    pandas.DataFrame
        One row per parsed article with the columns ``Article_Heading``,
        ``Article_Content`` and ``Source``. The frame is empty (but still has
        these columns) when nothing could be scraped.
    """
    columns = ["Article_Heading", "Article_Content", "Source"]

    def fetch_html(url):
        # Returns the raw response body, or None when the request fails.
        # `except Exception` (not a bare except) so that interrupting the
        # kernel with KeyboardInterrupt still stops the scrape.
        try:
            with urlopen(url) as response:
                return response.read()
        except Exception:
            print("Cannot connect to " + domain_name)
            return None

    # Step 1: collect article URLs from the search-result pages.
    article_urls = []
    for page_number in range(1, 4):
        search_url = "https://" + domain_name + "/search?query=media%20censorship&page=" + str(page_number)
        html = fetch_html(search_url)
        if html is None:
            # Skip this page; never parse missing or stale HTML.
            continue
        listing = soup(html, "html.parser").find("div", {"class": "ar-mn__articles"})
        if listing is None:
            print("No article list found at " + search_url)
            continue
        dated_links = []
        for anchor in listing.find_all("a"):
            href = anchor.get("href") or ""  # anchors without href are ignored
            if "/2020/" in href or "/2019/" in href:
                dated_links.append("https://" + domain_name + href)
        # Keep only the odd-indexed links. This relies on every article being
        # linked twice in a row on the results page -- TODO confirm this still
        # matches the site's markup.
        article_urls.extend(dated_links[1::2])

    # Step 2: download each article and extract heading + body text.
    records = []
    for url in article_urls:
        html = fetch_html(url)
        if html is None:
            continue
        page_soup = soup(html, "html.parser")
        title = page_soup.find("h1", {"class": "po-hr-cn__title"})
        content = page_soup.find("div", {"id": "post-content"})
        if title is None or content is None:
            print("Skipping article with unexpected layout: " + url)
            continue
        # Paragraphs are concatenated without a separator, as before.
        article = "".join(paragraph.text for paragraph in content.find_all("p"))
        # One record per article keeps heading, text and URL aligned even
        # when some articles are skipped.
        records.append({
            "Article_Heading": title.text,
            "Article_Content": article,
            "Source": url,
        })

    # Explicit columns so an empty result still has the expected schema.
    df = pd.DataFrame(records, columns=columns)
    print("Extracted articles from Jacobinmag.com")
    return df

In [3]:
def extract_news_from_nytimes(domain_name):
    """Scrape "media censorship" search results from the New York Times.

    Opens the search page in a Selenium-driven Chrome, clicks the
    "show more" button up to three times to load additional results, collects
    result links whose path contains ``/2019/`` or ``/2020/``, and then
    downloads each article with ``urlopen`` to extract headline and text.

    The browser is always closed, even when an error occurs. Articles that
    cannot be downloaded or parsed are reported with ``print`` and skipped.

    Parameters
    ----------
    domain_name : str
        Host to scrape, without scheme (``main`` passes ``"nytimes.com"``).

    Returns
    -------
    pandas.DataFrame
        One row per parsed article with the columns ``Article_Heading``,
        ``Article_Content`` and ``Source``. Empty (with these columns) when
        the search page cannot be loaded or no results are found.
    """
    columns = ["Article_Heading", "Article_Content", "Source"]
    search_url = "https://" + domain_name + "/search?query=media+censorship"
    show_more_xpath = "//button[@data-testid='search-show-more-button']"

    driver = webdriver.Chrome()
    try:
        try:
            driver.get(search_url)
            time.sleep(2)
        except Exception:
            print("Cannot connect to " + domain_name)
            return pd.DataFrame(columns=columns)

        # Click "show more" three times, waiting 5, 5 and 10 seconds for the
        # new results to render. The button is looked up again before every
        # click so a re-rendered page cannot leave us with a stale handle, and
        # the loop stops early when the button is no longer present.
        # find_elements("xpath", ...) is used because the
        # find_element_by_xpath helper was removed in Selenium 4.3.
        for pause_seconds in (5, 5, 10):
            buttons = driver.find_elements("xpath", show_more_xpath)
            if not buttons:
                break
            buttons[0].click()
            time.sleep(pause_seconds)

        page_source = driver.page_source
    finally:
        # Always release the browser process.
        driver.quit()

    results = soup(page_source, "html.parser").find("div", {"class": "css-46b038"})
    if results is None:
        print("No search results found on " + domain_name)
        return pd.DataFrame(columns=columns)

    article_urls = []
    for anchor in results.find_all("a"):
        href = anchor.get("href") or ""  # anchors without href are ignored
        if "/2020/" in href or "/2019/" in href:
            article_urls.append("https://" + domain_name + href)

    records = []
    for url in article_urls:
        try:
            with urlopen(url) as response:
                html = response.read()
        except Exception:
            print("Cannot connect to " + domain_name)
            # Skip; never re-parse the previous article's HTML.
            continue

        page_soup = soup(html, "html.parser")
        title = page_soup.find("h1", {"itemprop": "headline"})
        body = page_soup.find("section", {"class": "meteredContent css-1r7ky0e"})
        if title is None or body is None:
            print("Skipping article with unexpected layout: " + url)
            continue
        # Paragraphs are concatenated without a separator, as before.
        article = "".join(paragraph.text for paragraph in body.find_all("p"))
        # One record per article keeps heading, text and URL aligned even
        # when some articles are skipped.
        records.append({
            "Article_Heading": title.text,
            "Article_Content": article,
            "Source": url,
        })

    df = pd.DataFrame(records, columns=columns)
    print("Extracted articles from nytimes.com")
    return df

In [4]:
def extract_news_from_washington_post(domain_name):
    """Scrape "media censorship" search results from the Washington Post.

    Loads two search-result pages in a Selenium-driven Chrome, collects
    links containing ``/2020/``, and then downloads each article with
    ``urlopen`` to extract headline and paragraph text.

    Each browser instance is closed once its page has been read. Pages or
    articles that cannot be loaded or parsed are reported with ``print`` and
    skipped.

    Parameters
    ----------
    domain_name : str
        Host to scrape, without scheme (``main`` passes
        ``"washingtonpost.com"``).

    Returns
    -------
    pandas.DataFrame
        One row per parsed article with the columns ``Article_Heading``,
        ``Article_Content`` and ``Source``. Empty (with these columns) when
        nothing could be scraped.
    """
    columns = ["Article_Heading", "Article_Content", "Source"]
    search_base = "https://" + domain_name + "/newssearch/?query=media%20censorship&sort=Relevance&datefilter=All%20Since%202005"
    url_3 = [search_base, search_base + "&spellcheck&startat=20#top"]

    article_urls = []
    for search_url in url_3:
        try:
            driver = webdriver.Chrome()
        except Exception:
            print("Cannot connect to " + domain_name)
            continue
        try:
            try:
                driver.get(search_url)
                time.sleep(2)
            except Exception:
                print("Cannot connect to " + domain_name)
                continue  # the finally below still closes the browser

            page_urls = []
            # find_elements("xpath", ...) is used because the
            # find_elements_by_xpath helper was removed in Selenium 4.3.
            for index, elem in enumerate(driver.find_elements("xpath", "//a[@href]")):
                href = elem.get_attribute("href") or ""
                # The parity test runs on the anchor's position among ALL
                # links on the page, presumably to drop duplicated links --
                # TODO confirm against the current page markup.
                if "/2020/" in href and index % 2 != 0:
                    page_urls.append(href)
        finally:
            # Always release the browser process for this page.
            driver.quit()

        # Drop the last two matches of this page. Trimming per page (rather
        # than on the accumulated list) keeps a page with fewer than two
        # matches from removing links collected from an earlier page.
        article_urls.extend(page_urls[:-2])

    records = []
    for url in article_urls:
        try:
            with urlopen(url) as response:
                html = response.read()
        except Exception:
            print("Cannot connect to " + domain_name)
            # Skip; never re-parse the previous article's HTML.
            continue

        page_soup = soup(html, "html.parser")
        title = page_soup.find("h1")
        if title is None:
            print("Skipping article with unexpected layout: " + url)
            continue
        paragraphs = page_soup.find_all("p", {"class": "font--body font-copy gray-darkest ma-0 pb-md"})
        # Paragraphs are concatenated without a separator, as before.
        article = "".join(paragraph.text for paragraph in paragraphs)
        # One record per article keeps heading, text and URL aligned even
        # when some articles are skipped.
        records.append({
            "Article_Heading": title.text,
            "Article_Content": article,
            "Source": url,
        })

    df = pd.DataFrame(records, columns=columns)
    print("Extracted articles from washingtonpost.com")
    return df

In [5]:
def main():
    """Run the three site scrapers in order and export the combined result.

    The Jacobin, New York Times and Washington Post frames are concatenated
    with a fresh index and written to ``news_articles.xlsx`` in the current
    working directory.
    """
    frames = []
    frames.append(extract_news_from_jacobinmag("jacobinmag.com"))
    frames.append(extract_news_from_nytimes("nytimes.com"))
    frames.append(extract_news_from_washington_post("washingtonpost.com"))

    combined = pd.concat(frames, ignore_index=True)
    combined.to_excel('news_articles.xlsx')

In [6]:
# Entry point: run the full scrape-and-export pipeline, then print a
# completion marker once main() has returned.
if __name__ == "__main__":
    main()
    print("DONE!")

Extracted articles from Jacobinmag.com
DONE!
