# Prep 

In [1]:
import re
import os
import nltk
import time
import random
import numpy as np
import pandas as pd
from os import system
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from nltk.util import ngrams
from nltk import word_tokenize
from collections import Counter
from nltk.tokenize import RegexpTokenizer

In [2]:
# Show up to 100 columns and up to 100 characters per cell when a frame is displayed,
# so long URLs and headlines are readable in the output.
pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 100

In [3]:
# Reduced inputs for a short trial run: one subdomain topic and one listing URL.
test1 = [
    'entertainment',
]
test2 = [
    'https://www.theonion.com/c/news-in-brief',
]

In [4]:
# Topics that scrape_links_same_format() turns into 'https://<topic>.theonion.com/'.
onion_same_format_topics = [
    'politics',
    'sports',
    'local',
    'entertainment',
]
# Full listing URLs handed to scrape_links_diff_format().
onion_diff_format_topic_urls = [
    'https://www.theonion.com/c/news-in-brief',
    'https://www.theonion.com/tag/opinion',
]

In [5]:
# Keywords matched against article tokens by find_categories(); the topic step that
# would use them is currently commented out in scrape_onion_articles().
categories = [
    'news',
    'political',
    'sport',
    'entertainment',
]

## DRIVER HERE

In [6]:
# Start the Chrome session that the scraping functions below drive through the
# module-level name `driver`. The chromedriver location is hard-coded for this machine.
# NOTE(review): passing the driver path positionally is the older Selenium calling
# convention -- confirm it still works with the installed Selenium version.
driver = webdriver.Chrome('/usr/local/bin/chromedriver')

# Functions

## Formatting functions

In [8]:
def tokenize_text(text):
    """Reduce `text` to its word tokens and return them joined by single spaces.

    A token starts at an ASCII letter and continues with one or more word
    characters, so one-character words ("a", "I"), punctuation and leading
    digits are dropped. Case is preserved.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The tokens separated by single spaces ("" when nothing matches).
    """
    # Raw string: '\w' inside a normal string literal is an invalid escape sequence.
    # A findall with this group-free pattern yields the same token list as the
    # RegexpTokenizer(pattern).tokenize(text) call it replaces.
    all_tokens = re.findall(r'[A-Za-z]\w+', text)
    return " ".join(all_tokens)

In [9]:
def tokenize_body(words):
    """Return the set of lower-cased unigrams, bigrams and trigrams found in `words`.

    Tokens start at an ASCII letter followed by one or more word characters
    (so one-character words are dropped). Bigrams and trigrams are consecutive
    tokens joined by a single space.

    Parameters
    ----------
    words : str
        Text to tokenize.

    Returns
    -------
    set of str
        Every distinct 1-, 2- and 3-gram; empty when no token matches.
    """
    # Raw string: '\w' inside a normal string literal is an invalid escape sequence.
    tokens = [token.lower() for token in re.findall(r'[A-Za-z]\w+', words)]

    grams = set(tokens)
    for size in (2, 3):
        # Sliding window over the token list; the range is empty when there are
        # fewer than `size` tokens, so short inputs simply contribute nothing.
        grams.update(
            ' '.join(tokens[start:start + size])
            for start in range(len(tokens) - size + 1)
        )
    return grams

In [10]:
def find_categories(text_to_tokenize, categories):
    """Map the categories mentioned in a text onto numbered slots.

    Parameters
    ----------
    text_to_tokenize : str
        Text passed to ``tokenize_body`` to obtain its 1- to 3-gram set.
    categories : iterable of str
        Candidate category names to look for among those grams.

    Returns
    -------
    dict
        Keys 1..5, each 'null' unless filled. Matching categories occupy the
        slots from 1 upwards in alphabetical order (more than five matches add
        keys 6, 7, ...). When nothing matches, slot 1 is 'general_satire'.
    """
    categories_dict = {1:'null',2:'null',3:'null',4:'null',5:'null'}
    words = tokenize_body(text_to_tokenize)
    # Sort the matches: iterating a set of strings has no stable order between
    # runs, which would make the slot assignment non-reproducible.
    matches = sorted(words.intersection(categories))
    if matches:
        for slot, category in enumerate(matches, start=1):
            categories_dict[slot] = category
    else:
        categories_dict[1] = 'general_satire'
    return categories_dict

## Initial Onion link scraping

In [7]:
def ignore_links_without_content(url, listy):
    """Append an anchor's href to `listy` unless it points at a page to skip.

    Skipped links are the horoscope pages on the entertainment subdomain and
    the "5 things to know about" pages. Anchors with no href are skipped as
    well instead of raising.

    Parameters
    ----------
    url : element exposing ``get_attribute('href')``
        The anchor element to inspect (despite the name, not a URL string).
    listy : list
        Collected hrefs; mutated in place.

    Returns
    -------
    None
    """
    # Read the attribute once rather than once per comparison.
    href = url.get_attribute('href')
    if not href:
        return
    if href.startswith('https://entertainment.theonion.com/your-horoscopes'):
        return
    if 'theonion.com/5-things-to-know-about' in href:
        return
    listy.append(href)

In [11]:
def scrape_links_same_format(topics):
    """Collect article links from the first three listing pages of each topic subdomain.

    For every topic the module-level ``driver`` opens
    'https://<topic>.theonion.com/', then follows the 'More stories' link twice.
    On each page every ``h1`` is expected to contain an anchor; that anchor is
    passed through ``ignore_links_without_content``.

    Parameters
    ----------
    topics : iterable of str
        Subdomain names, e.g. 'politics'.

    Returns
    -------
    list of str
        De-duplicated hrefs in no particular order.

    Notes
    -----
    Uses ``find_element(By...)`` / ``find_elements(By...)`` rather than the
    ``find_element(s)_by_*`` helpers, which newer Selenium releases no longer
    provide. Selenium errors (e.g. a missing 'More stories' link or an ``h1``
    without an anchor) are not caught here.
    """
    # Seconds to wait after loading page 1 and after each 'More stories' click.
    page_delays = (1.48, .43, 1.43)
    link_list = []

    for topic in topics:
        for page_index, delay in enumerate(page_delays):
            if page_index == 0:
                driver.get('https://' + topic + '.theonion.com/')
                time.sleep(delay)
                driver.execute_script('window.scrollTo(0, 900)')
            else:
                driver.find_element(By.LINK_TEXT, 'More stories').click()
                time.sleep(delay)

            headlines = driver.find_elements(By.CSS_SELECTOR, 'h1')
            if page_index == 0:
                print('Getting', len(headlines), topic, 'article links')
            else:
                print('Getting', len(headlines), 'more', topic, 'article links')

            for headline in headlines:
                anchor = headline.find_element(By.CSS_SELECTOR, 'a')
                ignore_links_without_content(anchor, link_list)

    results = list(set(link_list))
    print('Total of', len(results), 'article links scraped from first format of topics')
    print()
    return results

In [12]:
def scrape_links_diff_format(urls):
    """Collect article links from the first three pages of each listing URL.

    For every URL the module-level ``driver`` opens the page, then follows the
    'More stories' link twice. On each page the anchor inside every element of
    class ``headline`` is passed through ``ignore_links_without_content``.

    Parameters
    ----------
    urls : iterable of str
        Listing URLs, e.g. 'https://www.theonion.com/c/news-in-brief'.

    Returns
    -------
    list of str
        De-duplicated hrefs in no particular order.

    Notes
    -----
    The topic shown in the progress messages is the last path segment of the
    URL. It was previously set only for two hard-coded URLs, so any other URL
    hit an unbound (or stale) ``topic``. Selenium errors are not caught here.
    """
    # Seconds to wait after loading page 1 and after each 'More stories' click.
    page_delays = (.48, 1.78, .78)
    link_list = []

    for url in urls:
        # 'https://www.theonion.com/c/news-in-brief' -> 'news-in-brief'
        topic = url.rstrip('/').rsplit('/', 1)[-1]

        for page_index, delay in enumerate(page_delays):
            if page_index == 0:
                driver.get(url)
                time.sleep(delay)
                driver.execute_script("window.scrollTo(0, 500)")
                # Second pause after scrolling, before the headlines are read.
                time.sleep(delay)
            else:
                driver.find_element(By.LINK_TEXT, 'More stories').click()
                time.sleep(delay)

            headlines = driver.find_elements(By.CLASS_NAME, "headline")
            if page_index == 0:
                print('Getting', len(headlines), 'article links')
            else:
                print('Getting', len(headlines), 'more', topic, 'article links')

            for headline in headlines:
                anchor = headline.find_element(By.CSS_SELECTOR, 'a')
                ignore_links_without_content(anchor, link_list)

    results = list(set(link_list))
    print('Total of', len(results), 'article links scraped from second format of topics')
    print()
    return results

In [13]:
def scrape_onion_links(topics, urls):
    """Gather article links from both listing layouts and merge them.

    Parameters
    ----------
    topics : list of str
        Forwarded to ``scrape_links_same_format``.
    urls : list of str
        Forwarded to ``scrape_links_diff_format``.

    Returns
    -------
    list of str
        The unique links from both scrapers, sorted ascending. The list and
        its length are also printed.
    """
    subdomain_links = scrape_links_same_format(topics)
    listing_links = scrape_links_diff_format(urls)
    # The two scrapers can return the same article, so de-duplicate before sorting.
    all_onion_links = sorted(set(subdomain_links + listing_links))
    print(all_onion_links)
    print()
    print(len(all_onion_links),'total ONION article links scrapes')
    return all_onion_links

---

## Scraping Onion link content

In [15]:
def scrape_onion_articles(urls):
    """Scrape each article URL and return one DataFrame row per usable article.

    The module-level ``driver`` loads every URL. An article is kept when its
    ``entry-content`` text reaches a length score of at least 0.5, where the
    score is ``round(len(body) / 5 / 250, 1)``.
    NOTE(review): presumably ~5 characters per word and 250 words per unit --
    confirm the intended unit.

    Parameters
    ----------
    urls : sequence of str
        Article URLs (``len(urls)`` is used in the progress messages).

    Returns
    -------
    pandas.DataFrame
        Columns: body_content, url, date ('YYYY-MM-DD'), title, length, source,
        source_id, satire_or_not, label. Duplicate rows are dropped and the
        index is renumbered from 0.

    Notes
    -----
    Scraping is best-effort: any exception raised while loading or parsing a
    page is printed and that URL is skipped. A URL that was already stored is
    skipped before the page is requested again. Elements are located with
    ``find_element(By...)`` because the ``find_element_by_*`` helpers are not
    available in newer Selenium releases.
    """
    source = 'Author not specified'
    source_id = 'The Onion'
    stored_urls = set()   # URLs already turned into a row
    records = []          # (body, url, date, title, length) tuples

    for ind, url in enumerate(urls, start=1):
        print('Working on #' + str(ind) + ' of '+ str(len(urls)) +' links')
        print()

        if url in stored_urls:
            print("Duplicate link not added", ind)
            continue

        try:
            driver.get(url)
            time.sleep(1.1)
            body = driver.find_element(By.CLASS_NAME, 'entry-content').text
            length = round(len(body) /5/ 250, 1)
            if length < .5:
                print('Not worthy of scraping', ind)
                continue
            # The publication date is read from the title attribute of the
            # anchor inside the page's <time> element.
            date = (driver.find_element(By.CSS_SELECTOR, 'time')
                          .find_element(By.CSS_SELECTOR, 'a')
                          .get_attribute('title'))
            date = pd.to_datetime(date).date().strftime('%Y-%m-%d')
            title = driver.find_element(By.CLASS_NAME, 'entry-title').text
        except Exception as e:
            print('Nothing to scrape for link #', str(ind) , e)
            continue

        stored_urls.add(url)
        records.append((body, url, date, title, length))

    df = pd.DataFrame(records, columns=['body_content', 'url', 'date', 'title', 'length'])
    df['source'] = source
    df['source_id'] = source_id
    df['satire_or_not'] = 'satire'
    df['label'] = 1

    return df.drop_duplicates().reset_index(drop=True)

## Final scraping function

In [16]:
def scrape_the_onion():
    """Run the whole Onion scrape and return the resulting article DataFrame.

    Links are collected for the notebook-level ``onion_same_format_topics`` and
    ``onion_diff_format_topic_urls``; substitute ``test1`` / ``test2`` in the
    call below for a short trial run.
    """
    article_links = scrape_onion_links(
        onion_same_format_topics,
        onion_diff_format_topic_urls,
    )
    return scrape_onion_articles(article_links)

## Current output:

In [17]:
# Full run: collect links for every configured topic/URL, then scrape each article.
dfonion_test = scrape_the_onion()

Getting 20 politics article links
Getting 20 more politics article links
Getting 20 more politics article links
Getting 20 sports article links
Getting 20 more sports article links
Getting 20 more sports article links
Getting 20 local article links
Getting 20 more local article links
Getting 20 more local article links
Getting 20 entertainment article links
Getting 20 more entertainment article links
Getting 20 more entertainment article links
Total of 219 article links scraped from first format of topics

Getting 20 article links
Getting 20 more news-in-brief article links
Getting 20 more news-in-brief article links
Getting 20 article links
Getting 20 more opinion article links
Getting 20 more opinion article links
Total of 120 article links scraped from second format of topics

['https://entertainment.theonion.com/abc-camera-immediately-cuts-away-after-showing-harvey-w-1832855837', 'https://entertainment.theonion.com/anthem-developers-assure-players-whiteboard-with-word-1832762063', 

Not worthy of scraping 1
Working on #2 of 304 links

Working on #3 of 304 links

Not worthy of scraping 3
Working on #4 of 304 links

Working on #5 of 304 links

Working on #6 of 304 links

Working on #7 of 304 links

Not worthy of scraping 7
Working on #8 of 304 links

Working on #9 of 304 links

Working on #10 of 304 links

Working on #11 of 304 links

Working on #12 of 304 links

Working on #13 of 304 links

Not worthy of scraping 13
Working on #14 of 304 links

Working on #15 of 304 links

Working on #16 of 304 links

Not worthy of scraping 16
Working on #17 of 304 links

Not worthy of scraping 17
Working on #18 of 304 links

Working on #19 of 304 links

Not worthy of scraping 19
Working on #20 of 304 links

Working on #21 of 304 links

Working on #22 of 304 links

Not worthy of scraping 22
Working on #23 of 304 links

Not worthy of scraping 23
Working on #24 of 304 links

Working on #25 of 304 links

Working on #26 of 304 links

Working on #27 of 304 links

Working on #28 of 304 l

Working on #184 of 304 links

Not worthy of scraping 184
Working on #185 of 304 links

Working on #186 of 304 links

Not worthy of scraping 186
Working on #187 of 304 links

Working on #188 of 304 links

Not worthy of scraping 188
Working on #189 of 304 links

Working on #190 of 304 links

Not worthy of scraping 190
Working on #191 of 304 links

Working on #192 of 304 links

Not worthy of scraping 192
Working on #193 of 304 links

Working on #194 of 304 links

Working on #195 of 304 links

Not worthy of scraping 195
Working on #196 of 304 links

Working on #197 of 304 links

Working on #198 of 304 links

Working on #199 of 304 links

Nothing to scrape for link # 199 Message: no such element: Unable to locate element: {"method":"class name","selector":"entry-content"}
  (Session info: chrome=72.0.3626.119)
  (Driver info: chromedriver=2.45.615355 (d5698f682d8b2742017df6c81e0bd8e6a3063189),platform=Mac OS X 10.13.6 x86_64)

Working on #200 of 304 links

Nothing to scrape for link # 200 M

In [21]:
# Number of articles kept by the scrape, then a preview of the first rows.
print(len(dfonion_test))
dfonion_test.head()

210


Unnamed: 0,body_content,url,date,title,length,source,source_id,satire_or_not,label
0,"EDMONTON, ALBERTA—Promising that they were tirelessly working to resolve issues with the long-an...",https://entertainment.theonion.com/anthem-developers-assure-players-whiteboard-with-word-1832762063,2019-02-20,‘Anthem’ Developers Assure Players Whiteboard With Words ‘Jetpack+Guns?’ Will Be Playable Game B...,1.1,Author not specified,The Onion,satire,1
1,"The Academy Awards inspire debate every year about whether certain films, directors, and actors ...",https://entertainment.theonion.com/biggest-snubs-in-oscars-history-1832755613,2019-02-20,Biggest Snubs In Oscars History,1.5,Author not specified,The Onion,satire,1
2,"BURBANK, CA—Emphasizing that losing even just a few comic book fans would be a “fate worse than ...",https://entertainment.theonion.com/bob-iger-at-disney-we-live-every-day-in-terror-that-y-1830987620,2018-12-10,"Bob Iger: At Disney, We Live Every Day In Terror That You’ll Turn On Superhero Movies",1.0,Author not specified,The Onion,satire,1
3,"FRAMINGHAM, MA—Calling the product the must-have item for true music connoisseurs, Bose official...",https://entertainment.theonion.com/bose-releases-new-headphones-specifically-optimized-for-18312...,2018-12-21,Bose Releases New Headphones Specifically Optimized For Listening To Whitney Houston’s ‘How Will...,0.9,Author not specified,The Onion,satire,1
4,"EUGENE, OR—Citing the potential threat to the safety of their children, a coalition of concerned...",https://entertainment.theonion.com/coalition-of-concerned-parents-condemns-video-games-fa-183065...,2018-11-26,Coalition Of Concerned Parents Condemns Video Games’ False Depiction Of How Easy It Is To Smash ...,1.5,Author not specified,The Onion,satire,1


---

----------

## For adding more to Onion portion of database (future updates)

In [13]:
def scrape_more_links_same_format(topics):
    """Collect article links from the first two listing pages of each topic subdomain.

    Shorter variant of the three-page scrape, intended for later top-ups: the
    module-level ``driver`` opens 'https://<topic>.theonion.com/' and follows
    the 'More stories' link once. The anchor inside every ``h1`` is passed
    through ``ignore_links_without_content``.

    Parameters
    ----------
    topics : iterable of str
        Subdomain names, e.g. 'politics'.

    Returns
    -------
    list of str
        De-duplicated hrefs in no particular order.

    Notes
    -----
    Uses ``find_element(By...)`` / ``find_elements(By...)`` rather than the
    ``find_element(s)_by_*`` helpers, which newer Selenium releases no longer
    provide. Selenium errors are not caught here.
    """
    # Seconds to wait after loading page 1 and after the 'More stories' click.
    page_delays = (.48, .43)
    link_list = []

    for topic in topics:
        for page_index, delay in enumerate(page_delays):
            if page_index == 0:
                driver.get('https://' + topic + '.theonion.com/')
                time.sleep(delay)
                driver.execute_script('window.scrollTo(0, 900)')
            else:
                driver.find_element(By.LINK_TEXT, 'More stories').click()
                time.sleep(delay)

            headlines = driver.find_elements(By.CSS_SELECTOR, 'h1')
            if page_index == 0:
                print('Getting', len(headlines), topic, 'article links')
            else:
                print('Getting', len(headlines), 'more article links')

            for headline in headlines:
                anchor = headline.find_element(By.CSS_SELECTOR, 'a')
                ignore_links_without_content(anchor, link_list)

    results = list(set(link_list))
    print('Total of', len(results), 'article links scraped from first list')
    print()
    return results

In [14]:
def scrape_more_links_diff_format(urls):
    """Collect article links from the first two pages of each listing URL.

    Shorter variant of the three-page scrape, intended for later top-ups: the
    module-level ``driver`` opens each URL and follows the 'More stories' link
    once. The anchor inside every element of class ``headline`` is passed
    through ``ignore_links_without_content``.

    Parameters
    ----------
    urls : iterable of str
        Listing URLs, e.g. 'https://www.theonion.com/tag/opinion'.

    Returns
    -------
    list of str
        De-duplicated hrefs in no particular order.

    Notes
    -----
    * The topic shown in the progress message is the last path segment of the
      URL. It was previously set only for two hard-coded URLs, so any other
      URL hit an unbound (or stale) ``topic``.
    * Links now go through ``ignore_links_without_content`` like the other
      link scrapers in this notebook; hrefs were previously appended unfiltered.
    * The summary line says 'second list'; it used to repeat 'first list'.
    * Selenium errors are not caught here.
    """
    # Seconds to wait after loading page 1 and after the 'More stories' click.
    page_delays = (.48, .78)
    link_list = []

    for url in urls:
        # 'https://www.theonion.com/tag/opinion' -> 'opinion'
        topic = url.rstrip('/').rsplit('/', 1)[-1]

        for page_index, delay in enumerate(page_delays):
            if page_index == 0:
                driver.get(url)
                time.sleep(delay)
                driver.execute_script("window.scrollTo(0, 500)")
            else:
                driver.find_element(By.LINK_TEXT, 'More stories').click()
                time.sleep(delay)

            headlines = driver.find_elements(By.CLASS_NAME, "headline")
            if page_index == 0:
                print('Getting', len(headlines), 'article links')
            else:
                print('Getting', len(headlines), 'more', topic, 'article links')

            for headline in headlines:
                anchor = headline.find_element(By.CSS_SELECTOR, 'a')
                ignore_links_without_content(anchor, link_list)

    results = list(set(link_list))
    print('Total of', len(results), 'article links scraped from second list')
    print()
    return results

In [15]:
def scrape_more_onion_links(topics, urls):
    """Gather article links from both two-page scrapers and merge them.

    Parameters
    ----------
    topics : list of str
        Forwarded to ``scrape_more_links_same_format``.
    urls : list of str
        Forwarded to ``scrape_more_links_diff_format``.

    Returns
    -------
    list of str
        The unique links from both scrapers, sorted ascending. Their count is
        also printed.
    """
    # The call used to name 'scrape_more_links_same_format_topics', which is not
    # defined anywhere in this notebook and raised NameError.
    list1 = scrape_more_links_same_format(topics)
    list2 = scrape_more_links_diff_format(urls)
    all_onion_links = sorted(set(list1 + list2))
    print()
    print(len(all_onion_links),'total onion article links scrapes')
    return all_onion_links

# Tests

In [14]:
def scrape_onion_articles(urls):
    """Scrape each article URL and return one DataFrame row per usable article.

    NOTE: this notebook defines ``scrape_onion_articles`` twice; when cells are
    run top to bottom this later definition replaces the earlier one.

    The module-level ``driver`` loads every URL and reads body, date and title.
    Each article gets a length score ``round(len(body) / 5 / 250, 1)``; scores
    below 0.5 are stored as 0 and those rows are removed before returning.
    NOTE(review): presumably ~5 characters per word and 250 words per unit --
    confirm the intended unit.

    Parameters
    ----------
    urls : sequence of str
        Article URLs (``len(urls)`` is used in the progress messages).

    Returns
    -------
    pandas.DataFrame
        Columns: body_content, url, date ('YYYY-MM-DD'), title, length, source,
        source_id, satire_or_not, label. Duplicate rows and zero-length rows
        are dropped and the index is renumbered from 0.

    Notes
    -----
    Scraping is best-effort: any exception raised while loading or parsing a
    page is printed and that URL is skipped. A URL that was already stored is
    skipped before the page is requested again. Elements are located with
    ``find_element(By...)`` because the ``find_element_by_*`` helpers are not
    available in newer Selenium releases.
    """
    source = 'Author not specified'
    source_id = 'The Onion'
    stored_urls = set()   # URLs already turned into a row
    records = []          # (body, url, date, title, length) tuples

    for ind, url in enumerate(urls, start=1):
        if url in stored_urls:
            print("Duplicate link not added")
            continue

        try:
            driver.get(url)
            time.sleep(1.1)
            body = driver.find_element(By.CLASS_NAME, 'entry-content').text
            # The publication date is read from the title attribute of the
            # anchor inside the page's <time> element.
            date = (driver.find_element(By.CSS_SELECTOR, 'time')
                          .find_element(By.CSS_SELECTOR, 'a')
                          .get_attribute('title'))
            date = pd.to_datetime(date).date().strftime('%Y-%m-%d')
            title = driver.find_element(By.CLASS_NAME, 'entry-title').text
        except Exception as e:
            print('Nothing to scrape for link #', str(ind) , e)
            continue

        print('Scraping #' + str(ind) + ' of '+ str(len(urls)) +' articles')
        print()

        length = round(len(body) /5/ 250, 1)
        if length < .5:
            # Marker value; rows carrying it are filtered out below.
            length = 0

        stored_urls.add(url)
        records.append((body, url, date, title, length))

    df = pd.DataFrame(records, columns=['body_content', 'url', 'date', 'title', 'length'])
    df['source'] = source
    df['source_id'] = source_id
    df['satire_or_not'] = 'satire'
    df['label'] = 1

    df = df.drop_duplicates()
    df = df[df['length'] != 0]
    return df.reset_index(drop=True)

In [84]:
# Opens another Chrome session and rebinds the module-level `driver` to it.
# NOTE(review): a session created by an earlier cell is not quit here -- confirm
# it is closed elsewhere, otherwise its browser window stays open.
driver = webdriver.Chrome('/usr/local/bin/chromedriver')

In [82]:
# title = driver.find_element_by_class_name('entry-title').text
# title

In [None]:
def find_categories(text_to_tokenize, categories):
    """Map the categories mentioned in a text onto numbered slots.

    Parameters
    ----------
    text_to_tokenize : str
        Text passed to ``tokenize_body`` to obtain its 1- to 3-gram set.
    categories : iterable of str
        Candidate category names to look for among those grams.

    Returns
    -------
    dict
        Keys 1..5, each 'null' unless filled. Matching categories occupy the
        slots from 1 upwards in alphabetical order (more than five matches add
        keys 6, 7, ...). When nothing matches, slot 1 is 'general_satire'.
    """
    categories_dict = {1:'null',2:'null',3:'null',4:'null',5:'null'}
    words = tokenize_body(text_to_tokenize)
    # Sort the matches: iterating a set of strings has no stable order between
    # runs, which would make the slot assignment non-reproducible.
    matches = sorted(words.intersection(categories))
    if matches:
        for slot, category in enumerate(matches, start=1):
            categories_dict[slot] = category
    else:
        categories_dict[1] = 'general_satire'
    return categories_dict

In [108]:
# Trial link collection on the reduced inputs (test1 / test2).
onion_urls = scrape_onion_links(test1, test2)
# len(onion_urls)

Getting 20 entertainment article links
Getting 20 more entertainment article links
Getting 20 more entertainment article links
Total of 43 article links scraped from first format of topics

Getting 20 article links
Getting 20 more news-in-brief article links
Getting 20 more news-in-brief article links
Total of 60 article links scraped from second format of topics

['https://entertainment.theonion.com/anthem-developers-assure-players-whiteboard-with-word-1832762063', 'https://entertainment.theonion.com/back-to-back-broadcasts-of-big-happening-on-tbs-appar-1831183203', 'https://entertainment.theonion.com/biggest-snubs-in-oscars-history-1832755613', 'https://entertainment.theonion.com/bob-iger-at-disney-we-live-every-day-in-terror-that-y-1830987620', 'https://entertainment.theonion.com/bose-releases-new-headphones-specifically-optimized-for-1831258410', 'https://entertainment.theonion.com/bryan-singer-celebrates-bohemian-rhapsody-oscar-nomin-1831988697', 'https://entertainment.theonion.co

In [85]:
# Scrape the articles behind `onion_urls`; the returned frame is shown as the cell
# output and is not assigned to a name.
scrape_onion_articles(onion_urls)

Scraping #1 of 1 articles

Scraping #2 of 2 articles

Scraping #3 of 3 articles

Scraping #4 of 4 articles

Scraping #5 of 5 articles

Scraping #6 of 6 articles

Scraping #7 of 7 articles

Scraping #8 of 8 articles

Scraping #9 of 9 articles

Scraping #10 of 10 articles

Scraping #11 of 11 articles

Scraping #12 of 12 articles

Scraping #13 of 13 articles

Scraping #14 of 14 articles

Scraping #15 of 15 articles

Scraping #16 of 16 articles

Scraping #17 of 17 articles

Scraping #18 of 18 articles

Scraping #19 of 19 articles

Scraping #20 of 20 articles

Scraping #21 of 21 articles

Scraping #22 of 22 articles

Scraping #23 of 23 articles

Scraping #24 of 24 articles

Scraping #25 of 25 articles

Scraping #26 of 26 articles

Scraping #27 of 27 articles

Scraping #28 of 28 articles

Scraping #29 of 29 articles

Scraping #30 of 30 articles

Scraping #31 of 31 articles

Scraping #32 of 32 articles

Scraping #33 of 33 articles

Scraping #34 of 34 articles

Scraping #35 of 35 articles

Scr

Unnamed: 0,body_content,url,date,title,length,source,source_id,satire_or_not,label
0,"EDMONTON, ALBERTA—Promising that they were tir...",https://entertainment.theonion.com/anthem-deve...,2019-02-20,‘Anthem’ Developers Assure Players Whiteboard ...,4,Author not specified,The Onion,satire,1
1,Recent Video from The Onion\nVIEW MORE >\n5 Th...,https://entertainment.theonion.com/back-to-bac...,2018-12-18,Back-To-Back Broadcasts Of ‘Big’ Happening On ...,4,Author not specified,The Onion,satire,1
2,The Academy Awards inspire debate every year a...,https://entertainment.theonion.com/biggest-snu...,2019-02-20,Biggest Snubs In Oscars History,4,Author not specified,The Onion,satire,1
3,"BURBANK, CA—Emphasizing that losing even just ...",https://entertainment.theonion.com/bob-iger-at...,2018-12-10,"Bob Iger: At Disney, We Live Every Day In Terr...",4,Author not specified,The Onion,satire,1
4,"FRAMINGHAM, MA—Calling the product the must-ha...",https://entertainment.theonion.com/bose-releas...,2018-12-21,Bose Releases New Headphones Specifically Opti...,4,Author not specified,The Onion,satire,1
5,Recent Video from The Onion\nVIEW MORE >\n5 Th...,https://entertainment.theonion.com/bryan-singe...,2019-01-23,Bryan Singer Celebrates ‘Bohemian Rhapsody’ Os...,4,Author not specified,The Onion,satire,1
6,"EUGENE, OR—Citing the potential threat to the ...",https://entertainment.theonion.com/coalition-o...,2018-11-26,Coalition Of Concerned Parents Condemns Video ...,4,Author not specified,The Onion,satire,1
7,CHICAGO—Expressing their sincere regret for re...,https://entertainment.theonion.com/dave-matthe...,2018-11-28,Dave Matthews Band Apologizes After Tour Bus D...,4,Author not specified,The Onion,satire,1
8,"BURBANK, CA—Upon the release of the much-await...",https://entertainment.theonion.com/disney-anno...,2019-01-29,Disney Announces ‘Kingdom Hearts III’ Will Fea...,4,Author not specified,The Onion,satire,1
9,LONDON—Shedding light on her creative process ...,https://entertainment.theonion.com/e-l-james-a...,2019-01-24,E.L. James Admits New Erotic Novel Originally ...,4,Author not specified,The Onion,satire,1


In [132]:
# Second full run of the scrape, kept under its own name.
dfonion1 = scrape_the_onion()

Getting 20 politics article links
Getting 20 more politics article links
Getting 20 more politics article links
Getting 20 sports article links
Getting 20 more sports article links
Getting 20 more sports article links
Getting 20 local article links
Getting 20 more local article links
Getting 20 more local article links
Getting 20 entertainment article links
Getting 20 more entertainment article links
Getting 20 more entertainment article links
Total of 218 article links scraped from first format of topics

Getting 20 article links
Getting 20 more news-in-brief article links
Getting 20 more news-in-brief article links
Getting 20 article links
Getting 20 more opinion article links
Getting 20 more opinion article links
Total of 120 article links scraped from second format of topics

['https://entertainment.theonion.com/abc-camera-immediately-cuts-away-after-showing-harvey-w-1832855837', 'https://entertainment.theonion.com/anthem-developers-assure-players-whiteboard-with-word-1832762063', 

Not worthy of scraping 1
Working on #2 of 304 links

Working on #3 of 304 links

Not worthy of scraping 3
Working on #4 of 304 links

Working on #5 of 304 links

Working on #6 of 304 links

Working on #7 of 304 links

Not worthy of scraping 7
Working on #8 of 304 links

Working on #9 of 304 links

Working on #10 of 304 links

Working on #11 of 304 links

Working on #12 of 304 links

Working on #13 of 304 links

Working on #14 of 304 links

Not worthy of scraping 14
Working on #15 of 304 links

Working on #16 of 304 links

Working on #17 of 304 links

Not worthy of scraping 17
Working on #18 of 304 links

Not worthy of scraping 18
Working on #19 of 304 links

Working on #20 of 304 links

Not worthy of scraping 20
Working on #21 of 304 links

Working on #22 of 304 links

Working on #23 of 304 links

Not worthy of scraping 23
Working on #24 of 304 links

Not worthy of scraping 24
Working on #25 of 304 links

Working on #26 of 304 links

Working on #27 of 304 links

Working on #28 of 304 l

Working on #174 of 304 links

Working on #175 of 304 links

Working on #176 of 304 links

Working on #177 of 304 links

Not worthy of scraping 177
Working on #178 of 304 links

Working on #179 of 304 links

Not worthy of scraping 179
Working on #180 of 304 links

Working on #181 of 304 links

Working on #182 of 304 links

Working on #183 of 304 links

Not worthy of scraping 183
Working on #184 of 304 links

Working on #185 of 304 links

Not worthy of scraping 185
Working on #186 of 304 links

Working on #187 of 304 links

Not worthy of scraping 187
Working on #188 of 304 links

Working on #189 of 304 links

Not worthy of scraping 189
Working on #190 of 304 links

Working on #191 of 304 links

Working on #192 of 304 links

Working on #193 of 304 links

Not worthy of scraping 193
Working on #194 of 304 links

Working on #195 of 304 links

Working on #196 of 304 links

Working on #197 of 304 links

Nothing to scrape for link # 197 Message: no such element: Unable to locate element: {"meth

Working on #304 of 304 links

Not worthy of scraping 304


In [133]:
# Display the frame returned by the run above.
dfonion1

Unnamed: 0,body_content,url,date,title,length,source,source_id,satire_or_not,label
0,"EDMONTON, ALBERTA—Promising that they were tirelessly working to resolve issues with the long-an...",https://entertainment.theonion.com/anthem-developers-assure-players-whiteboard-with-word-1832762063,2019-02-20,‘Anthem’ Developers Assure Players Whiteboard With Words ‘Jetpack+Guns?’ Will Be Playable Game B...,1.0,Author not specified,The Onion,satire,1
1,"The Academy Awards inspire debate every year about whether certain films, directors, and actors ...",https://entertainment.theonion.com/biggest-snubs-in-oscars-history-1832755613,2019-02-20,Biggest Snubs In Oscars History,1.5,Author not specified,The Onion,satire,1
2,"BURBANK, CA—Emphasizing that losing even just a few comic book fans would be a “fate worse than ...",https://entertainment.theonion.com/bob-iger-at-disney-we-live-every-day-in-terror-that-y-1830987620,2018-12-10,"Bob Iger: At Disney, We Live Every Day In Terror That You’ll Turn On Superhero Movies",0.9,Author not specified,The Onion,satire,1
3,"FRAMINGHAM, MA—Calling the product the must-have item for true music connoisseurs, Bose official...",https://entertainment.theonion.com/bose-releases-new-headphones-specifically-optimized-for-18312...,2018-12-21,Bose Releases New Headphones Specifically Optimized For Listening To Whitney Houston’s ‘How Will...,0.9,Author not specified,The Onion,satire,1
4,"EUGENE, OR—Citing the potential threat to the safety of their children, a coalition of concerned...",https://entertainment.theonion.com/coalition-of-concerned-parents-condemns-video-games-fa-183065...,2018-11-26,Coalition Of Concerned Parents Condemns Video Games’ False Depiction Of How Easy It Is To Smash ...,1.4,Author not specified,The Onion,satire,1
5,"CHICAGO—Expressing their sincere regret for repeating the 2004 incident, representatives for Dav...",https://entertainment.theonion.com/dave-matthews-band-apologizes-after-tour-bus-dumps-anot-18307...,2018-11-28,Dave Matthews Band Apologizes After Tour Bus Dumps Another 800 Pounds Of Human Shit Onto Same Bo...,1.1,Author not specified,The Onion,satire,1
6,"BURBANK, CA—Upon the release of the much-awaited third installment of the popular video game fra...",https://entertainment.theonion.com/disney-announces-kingdom-hearts-iii-will-feature-erne-1832156301,2019-01-29,"Disney Announces ‘Kingdom Hearts III’ Will Feature Ernest, Turner, Hooch, And All The Rest Of Yo...",1.1,Author not specified,The Onion,satire,1
7,"LONDON—Shedding light on her creative process amid announcements of her forthcoming book, Fifty ...",https://entertainment.theonion.com/e-l-james-admits-new-erotic-novel-originally-tiny-too-1832029592,2019-01-24,E.L. James Admits New Erotic Novel Originally ‘Tiny Toons’ Fan Fiction,1.0,Author not specified,The Onion,satire,1
8,"BROOKLYN, NY—Shocked, disillusioned, and even somewhat betrayed by the unlikely pairing, fans of...",https://entertainment.theonion.com/fans-shocked-after-marie-kondo-reveals-she-has-been-dat-18318...,2019-01-18,Fans Shocked After Marie Kondo Reveals She Has Been Dating Untidy Cupboard For Past 6 Months,1.2,Author not specified,The Onion,satire,1
9,"LOS ANGELES—In a concentrated effort to ensure the movie set felt like a safe, supportive place ...",https://entertainment.theonion.com/female-director-asked-if-she-feels-comfortable-filming-183043...,2018-11-14,Female Director Asked If She Feels Comfortable Filming Scene While Nude,0.7,Author not specified,The Onion,satire,1


In [124]:
# dfonion.index = range(len(dfonion.index))
# dfonion.length.max()
# NOTE(review): `dfonion` is not assigned in any cell visible in this notebook, so
# this cell depends on leftover kernel state and fails with NameError on a fresh kernel.
dfonion

Unnamed: 0,body_content,url,date,title,length,source,source_id,satire_or_not,label
0,"EDMONTON, ALBERTA—Promising that they were tirelessly working to resolve issues with the long-an...",https://entertainment.theonion.com/anthem-developers-assure-players-whiteboard-with-word-1832762063,2019-02-20,‘Anthem’ Developers Assure Players Whiteboard With Words ‘Jetpack+Guns?’ Will Be Playable Game B...,1.1,Author not specified,The Onion,satire,1
1,"The Academy Awards inspire debate every year about whether certain films, directors, and actors ...",https://entertainment.theonion.com/biggest-snubs-in-oscars-history-1832755613,2019-02-20,Biggest Snubs In Oscars History,1.5,Author not specified,The Onion,satire,1
2,"BURBANK, CA—Emphasizing that losing even just a few comic book fans would be a “fate worse than ...",https://entertainment.theonion.com/bob-iger-at-disney-we-live-every-day-in-terror-that-y-1830987620,2018-12-10,"Bob Iger: At Disney, We Live Every Day In Terror That You’ll Turn On Superhero Movies",1.0,Author not specified,The Onion,satire,1
3,"FRAMINGHAM, MA—Calling the product the must-have item for true music connoisseurs, Bose official...",https://entertainment.theonion.com/bose-releases-new-headphones-specifically-optimized-for-18312...,2018-12-21,Bose Releases New Headphones Specifically Optimized For Listening To Whitney Houston’s ‘How Will...,1.0,Author not specified,The Onion,satire,1
4,"EUGENE, OR—Citing the potential threat to the safety of their children, a coalition of concerned...",https://entertainment.theonion.com/coalition-of-concerned-parents-condemns-video-games-fa-183065...,2018-11-26,Coalition Of Concerned Parents Condemns Video Games’ False Depiction Of How Easy It Is To Smash ...,1.5,Author not specified,The Onion,satire,1
5,"CHICAGO—Expressing their sincere regret for repeating the 2004 incident, representatives for Dav...",https://entertainment.theonion.com/dave-matthews-band-apologizes-after-tour-bus-dumps-anot-18307...,2018-11-28,Dave Matthews Band Apologizes After Tour Bus Dumps Another 800 Pounds Of Human Shit Onto Same Bo...,1.1,Author not specified,The Onion,satire,1
6,"BURBANK, CA—Upon the release of the much-awaited third installment of the popular video game fra...",https://entertainment.theonion.com/disney-announces-kingdom-hearts-iii-will-feature-erne-1832156301,2019-01-29,"Disney Announces ‘Kingdom Hearts III’ Will Feature Ernest, Turner, Hooch, And All The Rest Of Yo...",1.2,Author not specified,The Onion,satire,1
7,"LONDON—Shedding light on her creative process amid announcements of her forthcoming book, Fifty ...",https://entertainment.theonion.com/e-l-james-admits-new-erotic-novel-originally-tiny-too-1832029592,2019-01-24,E.L. James Admits New Erotic Novel Originally ‘Tiny Toons’ Fan Fiction,1.1,Author not specified,The Onion,satire,1
8,"BROOKLYN, NY—Shocked, disillusioned, and even somewhat betrayed by the unlikely pairing, fans of...",https://entertainment.theonion.com/fans-shocked-after-marie-kondo-reveals-she-has-been-dat-18318...,2019-01-18,Fans Shocked After Marie Kondo Reveals She Has Been Dating Untidy Cupboard For Past 6 Months,1.3,Author not specified,The Onion,satire,1
9,"LOS ANGELES—In a concentrated effort to ensure the movie set felt like a safe, supportive place ...",https://entertainment.theonion.com/female-director-asked-if-she-feels-comfortable-filming-183043...,2018-11-14,Female Director Asked If She Feels Comfortable Filming Scene While Nude,0.8,Author not specified,The Onion,satire,1


In [126]:
# Scrape a hard-coded batch of Onion article URLs (entertainment, local, politics,
# sports and www.theonion.com pages) by passing them as one list to
# scrape_onion_articles, which is defined outside this cell. The call is the cell's
# last expression, so whatever it returns is rendered as the cell output.
# NOTE(review): the recorded run log reports "no such element ... entry-content" for
# some links (e.g. # 200-202) -- presumably pages with a different layout; confirm.
# TODO(review): consider binding this list to a named variable instead of inlining it.
scrape_onion_articles(['https://entertainment.theonion.com/anthem-developers-assure-players-whiteboard-with-word-1832762063', 'https://entertainment.theonion.com/back-to-back-broadcasts-of-big-happening-on-tbs-appar-1831183203', 'https://entertainment.theonion.com/biggest-snubs-in-oscars-history-1832755613', 'https://entertainment.theonion.com/bob-iger-at-disney-we-live-every-day-in-terror-that-y-1830987620', 'https://entertainment.theonion.com/bose-releases-new-headphones-specifically-optimized-for-1831258410', 'https://entertainment.theonion.com/bryan-singer-celebrates-bohemian-rhapsody-oscar-nomin-1831988697', 'https://entertainment.theonion.com/coalition-of-concerned-parents-condemns-video-games-fa-1830656761', 'https://entertainment.theonion.com/dave-matthews-band-apologizes-after-tour-bus-dumps-anot-1830712805', 'https://entertainment.theonion.com/disney-announces-kingdom-hearts-iii-will-feature-erne-1832156301', 'https://entertainment.theonion.com/e-l-james-admits-new-erotic-novel-originally-tiny-too-1832029592', 'https://entertainment.theonion.com/fans-shocked-after-marie-kondo-reveals-she-has-been-dat-1831872362', 'https://entertainment.theonion.com/female-director-asked-if-she-feels-comfortable-filming-1830439638', 'https://entertainment.theonion.com/fox-searchlight-purchases-two-hours-of-super-bowl-air-t-1832306608', 'https://entertainment.theonion.com/game-of-thrones-fans-excited-to-hear-series-will-fina-1831742277', 'https://entertainment.theonion.com/game-of-thrones-fans-now-just-hoping-george-r-r-mart-1830983834', 'https://entertainment.theonion.com/jason-momoa-clearly-came-to-oscars-straight-from-work-1832855685', 'https://entertainment.theonion.com/jussie-smollett-arrives-in-court-wearing-full-body-cast-1832800613', 'https://entertainment.theonion.com/kanye-west-named-new-face-of-yeezy-1832335816', 'https://entertainment.theonion.com/leaked-george-lucas-sex-tape-includes-digitally-inserte-1832816253', 
'https://entertainment.theonion.com/les-moonves-doesn-t-know-how-he-going-to-tell-wife-he-d-1831181205', 'https://entertainment.theonion.com/man-worried-the-6th-transformers-movie-will-just-be-s-1830940714', 'https://entertainment.theonion.com/matt-lauer-spending-more-time-with-friends-family-afte-1830753295', 'https://entertainment.theonion.com/millions-of-americans-shocked-to-discover-favorite-movi-1831182722', 'https://entertainment.theonion.com/most-anticipated-tv-shows-of-2019-1831994027', 'https://entertainment.theonion.com/most-memorable-oscars-speeches-of-all-time-1832857253', 'https://entertainment.theonion.com/new-avengers-fan-theory-suggests-key-to-beating-thano-1832362299', 'https://entertainment.theonion.com/new-evidence-in-murder-case-leads-authorities-to-narrow-1832130653', 'https://entertainment.theonion.com/new-game-of-thrones-trailer-confirms-season-8-will-re-1832195359', 'https://entertainment.theonion.com/new-game-of-thrones-trailer-reveals-final-season-will-1830420524', 'https://entertainment.theonion.com/oliver-stone-thriller-individual-1-already-written-f-1830778812', 'https://entertainment.theonion.com/oscars-gift-bag-includes-3-ipads-streaming-telecast-in-1832855780', 'https://entertainment.theonion.com/paul-mccartney-saddened-after-learning-about-death-of-l-1830656527', 'https://entertainment.theonion.com/producer-tells-actress-non-disclosure-agreement-pretty-1830879809', 'https://entertainment.theonion.com/r-b-singer-guesses-she-ll-just-keep-moaning-into-mic-un-1831735495', 'https://entertainment.theonion.com/radio-station-playing-controversial-little-drummer-boy-1831177978', 'https://entertainment.theonion.com/rock-roll-hall-of-fame-rescinds-nomination-after-disc-1831078351', 'https://entertainment.theonion.com/study-no-two-people-have-listened-to-same-band-since-2-1832012299', 'https://entertainment.theonion.com/sweating-exhausted-christian-bale-stumbles-past-13-mil-1832855729', 
'https://entertainment.theonion.com/the-conners-scores-big-ratings-by-killing-off-rest-of-1832030980', 'https://entertainment.theonion.com/the-onion-s-2019-grammy-predictions-1832392374', 'https://entertainment.theonion.com/top-tv-shows-of-2018-1831298037', 'https://entertainment.theonion.com/walking-dead-fans-split-on-recent-harlem-globetrotter-1830775810', 'https://entertainment.theonion.com/winner-of-the-voice-excited-to-use-50-chili-s-gift-c-1831211901', 'https://entertainment.theonion.com/you-know-i-directed-it-too-bradley-cooper-says-out-1832855758', 'https://local.theonion.com/7-total-randos-found-dead-1832330983', 'https://local.theonion.com/allergy-sufferer-dies-after-being-stung-by-dog-1832020319', 'https://local.theonion.com/aunt-scores-big-with-nephews-by-dropping-bombshell-stor-1832702723', 'https://local.theonion.com/boss-makes-lipstick-prints-on-paychecks-for-valentine-s-1832626985', 'https://local.theonion.com/bouncer-who-s-not-that-big-must-be-fucking-crazy-1832782672', 'https://local.theonion.com/character-witness-told-he-doesn-t-have-what-it-takes-to-1832560501', 'https://local.theonion.com/chinese-man-just-glad-fuckin-4716-over-1832363781', 'https://local.theonion.com/cop-just-in-it-for-the-frisking-1832227599', 'https://local.theonion.com/couple-duetting-suddenly-seymour-at-karaoke-bar-proba-1832461278', 'https://local.theonion.com/coworkers-agog-as-employee-introduces-new-shirt-into-ro-1832702552', 'https://local.theonion.com/dad-doesn-t-trust-the-fish-here-1832012365', 'https://local.theonion.com/dad-heartbreakingly-thinks-his-connections-can-help-son-1832329368', 'https://local.theonion.com/disconcerted-woman-has-no-memory-of-telling-dressing-ro-1832396377', 'https://local.theonion.com/doctor-weirded-out-by-patient-she-just-met-providing-ev-1831914381', 'https://local.theonion.com/doctors-assure-recovering-patient-he-has-many-more-year-1831993642', 
'https://local.theonion.com/dog-can-t-believe-owner-left-on-fucking-msnbc-to-keep-i-1832791796', 'https://local.theonion.com/family-unsure-why-grandmother-s-caregiver-seems-like-he-1832528613', 'https://local.theonion.com/furloughed-government-employee-using-time-off-to-visit-1831802595', 'https://local.theonion.com/grandma-s-metoo-stories-fucking-horrifying-1832121994', 'https://local.theonion.com/grocery-store-not-fooling-anybody-by-marketing-cantalou-1832303658', 'https://local.theonion.com/homicide-detective-wishes-he-could-go-one-case-without-1832432518', 'https://local.theonion.com/lazy-poor-person-has-never-earned-passive-income-from-s-1832537497', 'https://local.theonion.com/local-brother-in-law-heard-you-can-make-shitload-of-mon-1832125865', 'https://local.theonion.com/lonely-elderly-man-visits-pond-to-pelt-ducks-with-rocks-1832394962', 'https://local.theonion.com/man-always-makes-sure-to-put-phone-on-silent-before-mis-1832702681', 'https://local.theonion.com/man-beginning-to-worry-that-best-meals-already-behind-h-1831914306', 'https://local.theonion.com/man-competitive-about-how-depressed-he-is-1832759116', 'https://local.theonion.com/man-crouched-inside-of-robotic-welding-arm-terrified-ro-1831766013', 'https://local.theonion.com/man-hoping-girlfriend-doesn-t-notice-valentine-s-day-gi-1832611790', 'https://local.theonion.com/man-losing-respect-for-incompetent-boss-who-won-t-fire-1832155974', 'https://local.theonion.com/man-nervous-about-telling-date-he-has-her-kids-1831836965', 'https://local.theonion.com/man-regrets-wasting-money-on-college-after-failing-to-s-1831951654', 'https://local.theonion.com/man-s-wife-dies-of-cancer-just-like-in-the-movies-1832193130', 'https://local.theonion.com/man-worried-experiences-of-cancun-trip-far-too-complex-1832625856', 'https://local.theonion.com/masked-vigilante-takes-terrorizing-black-community-into-1832019922', 'https://local.theonion.com/minnesota-resident-thinking-of-finally-packing-it-all-u-1832169119', 
'https://local.theonion.com/nature-preserve-sets-up-unrealistic-expectations-with-v-1831983590', 'https://local.theonion.com/new-mom-self-conscious-about-scar-where-baby-punched-it-1832262271', 'https://local.theonion.com/older-cafeteria-monitor-not-a-teacher-or-parent-or-anyt-1832163120', 'https://local.theonion.com/parents-visit-injects-66-into-local-apartment-economy-1832233302', 'https://local.theonion.com/passenger-glued-to-airplane-window-like-it-fucking-1956-1832728158', 'https://local.theonion.com/pedestrian-crossing-street-makes-sure-to-look-at-approa-1832430875', 'https://local.theonion.com/pretentious-woman-refers-to-slam-piece-as-partner-1831803116', 'https://local.theonion.com/report-papa-will-be-so-very-cross-you-ve-lost-grandfat-1832395994', 'https://local.theonion.com/report-there-no-way-of-knowing-whether-the-vague-award-1831816021', 'https://local.theonion.com/self-conscious-puppet-has-no-idea-what-to-do-with-hands-1831983895', 'https://local.theonion.com/sensei-s-assistant-really-getting-his-ass-whipped-1832590633', 'https://local.theonion.com/teen-on-verge-of-either-joining-isis-or-getting-super-i-1832824482', 'https://local.theonion.com/teen-weirded-out-after-running-over-english-teacher-out-1832025329', 'https://local.theonion.com/that-first-date-is-going-terribly-think-diners-watch-1832727666', 'https://local.theonion.com/this-actually-good-news-contractor-reveals-because-no-1832612071', 'https://local.theonion.com/this-the-fuck-harness-sex-shop-worker-has-at-home-1831840790', 'https://local.theonion.com/ultrasound-technician-asks-pregnant-woman-if-she-d-like-1832561842', 'https://local.theonion.com/unclear-why-stagehand-wrote-heartfelt-little-notes-to-e-1832796319', 'https://local.theonion.com/weak-willed-intellectual-infant-checks-to-see-how-many-1831980847', 'https://local.theonion.com/weird-kid-opts-to-sit-perfectly-still-let-universe-dec-1831960660', 
'https://local.theonion.com/what-a-crew-comments-man-on-instagram-photo-of-fucki-1832428344', 'https://local.theonion.com/woman-didn-t-know-progress-on-toxic-masculinity-would-t-1831869468', 'https://local.theonion.com/woman-rushes-to-hide-fragile-objects-cover-up-sharp-co-1831812626', 'https://local.theonion.com/woman-wakes-husband-up-on-valentine-s-day-with-hot-surp-1832612036', 'https://politics.theonion.com/alexandria-ocasio-cortez-criticized-for-preventing-25-0-1832652435', 'https://politics.theonion.com/amy-klobuchar-pledges-to-fight-everyday-americans-1832539124', 'https://politics.theonion.com/angry-ranting-twitter-user-really-needs-to-move-out-of-1832570014', 'https://politics.theonion.com/ann-coulter-attacks-trump-for-cowardly-backing-down-fro-1832656634', 'https://politics.theonion.com/chuck-grassley-reveals-he-voted-against-mlk-day-due-to-1831958615', 'https://politics.theonion.com/cold-panic-grips-stacey-abrams-as-trump-begins-deliveri-1832374238', 'https://politics.theonion.com/congress-agrees-to-1-3-billion-for-protective-border-f-1832570683', 'https://politics.theonion.com/cory-booker-apologizes-to-wall-street-bankers-for-the-m-1832268385', 'https://politics.theonion.com/defiant-pelosi-begins-swimming-to-afghanistan-after-tru-1831878963', 'https://politics.theonion.com/democratic-presidential-candidates-endorse-new-medicar-1832243277', 'https://politics.theonion.com/dick-cheney-finally-hunts-down-kills-man-he-shot-in-fa-1832167962', 'https://politics.theonion.com/don-t-make-me-regret-this-mueller-tells-rick-gates-b-1831849285', 'https://politics.theonion.com/elliott-abrams-defends-war-crimes-as-happening-back-in-1832632902', 'https://politics.theonion.com/emaciated-peter-alexander-burns-podium-for-warmth-after-1832026399', 'https://politics.theonion.com/follow-your-instructions-this-is-all-part-of-the-plan-1832160706', 'https://politics.theonion.com/giuliani-clarifies-he-doesn-t-want-gravestone-to-say-h-1831964695', 
'https://politics.theonion.com/giuliani-let-s-just-start-everything-over-1831966306', 'https://politics.theonion.com/guests-chairs-tilt-spray-water-at-them-during-first-e-1832374161', 'https://politics.theonion.com/guilt-ridden-stacey-abrams-wondering-when-she-should-te-1832367801', 'https://politics.theonion.com/history-of-independent-presidential-candidates-1832362206', 'https://politics.theonion.com/howard-schultz-considering-independent-presidential-run-1832126653', 'https://politics.theonion.com/if-this-report-is-true-to-be-repeated-5-7-billion-tim-1831880246', 'https://politics.theonion.com/ilhan-omar-thankful-for-colleagues-educating-her-on-pai-1832540739', 'https://politics.theonion.com/jared-kushner-assures-reporters-he-never-revealed-state-1832056658', 'https://politics.theonion.com/jimmy-carter-concerned-desire-for-fresh-faces-in-democr-1832200632', 'https://politics.theonion.com/john-bolton-insists-iran-likely-harboring-dangerous-ter-1831852419', 'https://politics.theonion.com/judge-restricts-roger-stone-s-travel-between-fox-news-1832057059', 'https://politics.theonion.com/kamala-harris-assembles-campaign-staff-of-unpaid-califo-1831958905', 'https://politics.theonion.com/karen-pence-returns-to-work-as-part-time-nude-art-model-1831845562', 'https://politics.theonion.com/kavanaugh-offers-elena-kagan-pull-of-vodka-from-aquafin-1832370913', 'https://politics.theonion.com/mueller-admits-a-smarter-president-would-ve-totally-fou-1832790637', 'https://politics.theonion.com/mueller-annoyed-by-chipper-overeager-adam-schiff-const-1832468027', 'https://politics.theonion.com/nancy-pelosi-signals-support-for-environmental-causes-b-1832437461', 'https://politics.theonion.com/nation-horrified-to-discover-cory-booker-already-a-sena-1832568259', 'https://politics.theonion.com/parasitic-space-worm-controlling-mark-kelly-s-body-anno-1832564499', 'https://politics.theonion.com/ralph-northam-admits-he-once-engaged-in-pedophilia-as-p-1832405275', 
'https://politics.theonion.com/report-mueller-investigation-nearly-done-with-first-da-1832163270', 'https://politics.theonion.com/report-west-virginia-feeling-pretty-smug-right-about-n-1832402532', 'https://politics.theonion.com/report-white-house-overruled-intelligence-officials-fo-1832054895', 'https://politics.theonion.com/someone-in-this-room-tonight-will-be-murdered-by-an-il-1832370781', 'https://politics.theonion.com/trump-admits-he-assumed-roger-stone-was-already-in-pris-1832057622', 'https://politics.theonion.com/trump-agrees-to-wear-wire-to-take-down-roger-stone-1832759793', 'https://politics.theonion.com/trump-base-celebrates-president-for-standing-up-to-cons-1832659448', 'https://politics.theonion.com/trump-claims-greatest-threat-facing-nation-toys-coming-1832370525', 'https://politics.theonion.com/trump-confirms-all-violent-options-on-the-table-in-vene-1832729737', 'https://politics.theonion.com/trump-covered-in-own-shit-after-furloughed-white-house-1831966461', 'https://politics.theonion.com/trump-demands-william-barr-prove-loyalty-by-putting-gun-1832816704', 'https://politics.theonion.com/trump-dismisses-trump-as-a-distraction-1831881924', 'https://politics.theonion.com/trump-invites-supporter-bbc-cameraman-to-finish-alterc-1832572550', 'https://politics.theonion.com/trump-offers-clear-historical-precedent-for-deploying-1832659857', 'https://politics.theonion.com/trump-postpones-grand-opening-of-trump-tower-moscow-unt-1831880688', 'https://politics.theonion.com/we-will-not-repeat-the-mistakes-of-the-2016-election-1832763775', 'https://politics.theonion.com/winded-trump-forced-to-lay-down-for-last-half-of-speech-1832379201', 'https://sports.theonion.com/adam-silver-sends-league-wide-memo-just-asking-players-1832465757', 'https://sports.theonion.com/bryce-harper-asks-if-phillies-willing-to-move-to-anothe-1832462398', 'https://sports.theonion.com/climatologists-find-pitchers-and-catchers-reporting-fur-1832655368', 
'https://sports.theonion.com/could-this-be-the-last-season-we-see-rob-gronkowski-ful-1831849765', 'https://sports.theonion.com/diamondbacks-settle-long-standing-civil-suit-with-offsp-1832815490', 'https://sports.theonion.com/does-amari-cooper-s-experience-playing-under-a-terrible-1830006308', 'https://sports.theonion.com/does-thanksgiving-glorify-the-historical-slaughter-of-t-1830589913', 'https://sports.theonion.com/does-the-rams-success-point-toward-a-league-wide-trend-1832306501', 'https://sports.theonion.com/entirety-of-browns-dawg-pound-euthanized-following-rabi-1831165471', 'https://sports.theonion.com/giant-demonic-hand-bursts-out-of-ground-drags-bill-bel-1832310427', 'https://sports.theonion.com/health-experts-say-tackle-football-poses-little-risk-fo-1830546856', 'https://sports.theonion.com/indignant-bill-belichick-evades-question-by-cutting-off-1832156094', 'https://sports.theonion.com/inexperienced-puppy-bowl-team-still-hasn-t-opened-eyes-1832304815', 'https://sports.theonion.com/is-cindy-gruden-worth-more-than-the-7th-round-pick-jon-1830339142', 'https://sports.theonion.com/jared-goff-pissed-he-had-to-miss-friend-s-super-bowl-pa-1832308535', 'https://sports.theonion.com/joe-buck-tears-rotator-cuff-after-awkward-throw-down-to-1831265950', 'https://sports.theonion.com/knicks-confident-they-have-the-cap-space-to-ruin-2-or-3-1832729578', 'https://sports.theonion.com/lebron-and-lakers-hoping-horrible-series-of-failed-betr-1832537573', 'https://sports.theonion.com/luke-walton-inspires-lakers-with-story-about-zero-point-1830133981', 'https://sports.theonion.com/manny-machado-denies-playing-dirty-after-late-slide-int-1829875292', 'https://sports.theonion.com/mavericks-retire-dirk-nowitzki-s-jersey-fully-unaware-h-1832068528', 'https://sports.theonion.com/mlb-hoping-to-boost-attendance-at-league-meetings-with-1830991212', 'https://sports.theonion.com/modern-day-rudy-fulfills-dream-of-showering-with-notre-1830627943', 
'https://sports.theonion.com/nba-ref-petrified-after-seeing-depiction-of-own-death-w-1831776345', 'https://sports.theonion.com/nbc-unveils-on-screen-graphic-informing-audience-they-a-1830512587', 'https://sports.theonion.com/nfl-defends-right-to-subject-eric-reid-to-random-stop-a-1831268180', 'https://sports.theonion.com/nike-fires-8-year-old-shoemaker-responsible-for-zion-wi-1832801206', 'https://sports.theonion.com/no-god-please-not-now-yells-rapidly-aging-tom-brady-1832306473', 'https://sports.theonion.com/now-that-s-what-i-call-a-fumble-reports-man-at-super-1832308685', 'https://sports.theonion.com/nurse-tells-wheelchair-bound-concussed-rob-gronkowski-1832306558', 'https://sports.theonion.com/ohio-state-begins-scouting-for-next-scandal-1830852844', 'https://sports.theonion.com/original-voice-of-nba-buzzer-passes-away-1830155517', 'https://sports.theonion.com/patriots-score-2-touchdowns-against-chiefs-in-preemptiv-1831876600', 'https://sports.theonion.com/raiders-announce-plan-to-play-2019-home-games-in-jon-gr-1831208733', 'https://sports.theonion.com/red-sox-take-out-full-page-ad-in-new-york-times-remin-1830323104', 'https://sports.theonion.com/rejection-from-hall-of-fame-sends-roger-clemens-spirali-1831995798', 'https://sports.theonion.com/sean-mcvay-begs-mother-to-let-him-stay-up-to-coach-rest-1832310251', 'https://sports.theonion.com/sean-mcvay-reveals-bill-belichick-occasionally-texted-h-1832166737', 'https://sports.theonion.com/should-carmelo-anthony-return-to-his-prime-1830501151', 'https://sports.theonion.com/should-dunkin-donuts-end-its-promotion-with-the-nfl-th-1830367785', 'https://sports.theonion.com/should-hank-host-a-super-bowl-party-or-is-everyone-just-1832304472', 'https://sports.theonion.com/should-the-nfl-be-doing-more-to-prevent-female-concussi-1830944195', 'https://sports.theonion.com/should-the-nfl-offer-to-reinstate-kareem-hunt-if-he-pro-1831165295', 
'https://sports.theonion.com/should-the-nfl-prohibit-players-from-appearing-in-hotel-1830920033', 'https://sports.theonion.com/should-the-oakland-a-s-have-sent-brad-pitt-to-persuade-1831872694', 'https://sports.theonion.com/small-town-billionaire-fails-to-make-it-in-big-city-1832312177', 'https://sports.theonion.com/study-most-concussions-can-be-prevented-by-wearing-sec-1831266194', 'https://sports.theonion.com/super-bowl-halftime-show-marred-by-functioning-sound-sy-1832309931', 'https://sports.theonion.com/super-bowl-liii-key-storylines-to-watch-1832192910', 'https://sports.theonion.com/super-bowl-stadium-solemnly-stands-places-hands-over-h-1832308782', 'https://sports.theonion.com/this-one-means-the-least-of-all-says-tom-brady-accep-1832312312', 'https://sports.theonion.com/tom-brady-feeling-guilty-after-gorging-self-on-full-ord-1831871348', 'https://sports.theonion.com/tony-romo-accurately-predicts-patriots-pass-play-game-1832308502', 'https://sports.theonion.com/tony-romo-realizes-he-should-have-used-ability-to-read-1832056460', 'https://sports.theonion.com/tv-in-l-a-bar-switched-over-to-american-dad-rerun-wi-1832310090', 'https://sports.theonion.com/u-s-military-honors-sacrifices-of-nfl-players-by-weari-1831074598', 'https://sports.theonion.com/ucf-deserves-a-shot-to-be-utterly-destroyed-in-the-coll-1831232725', 'https://sports.theonion.com/will-the-pacers-ever-be-able-to-return-to-the-glory-of-1830024687', 'https://sports.theonion.com/yankees-avoid-luxury-tax-by-moving-franchise-to-offshor-1830544223', 'https://sports.theonion.com/zamboni-jams-up-after-running-over-large-patch-of-loose-1831841251', 'https://www.theonion.com/2018-fourth-hottest-year-on-record-1832461708', 'https://www.theonion.com/amazon-reconsidering-new-york-hq-after-backlash-1832541580', 'https://www.theonion.com/annoying-youtube-algorithm-not-letting-man-forget-singl-1832630535', 'https://www.theonion.com/aquaman-2-announced-1832702833', 
'https://www.theonion.com/authorities-swiftly-announce-1-600-washington-dairy-cow-1832602696', 'https://www.theonion.com/ben-affleck-to-leave-role-of-batman-1832391993', 'https://www.theonion.com/bernie-sanders-announces-2020-bid-1832767052', 'https://www.theonion.com/chicago-police-credit-their-extensive-experience-falsif-1832825796', 'https://www.theonion.com/chinese-man-worried-you-can-t-have-respectful-debate-ab-1832652900', 'https://www.theonion.com/coca-cola-releases-new-orange-vanilla-flavor-1832762180', 'https://www.theonion.com/congress-reaches-tentative-deal-for-border-security-dea-1832611681', 'https://www.theonion.com/dem-good-looking-1832357235', 'https://www.theonion.com/democrats-pick-georgia-s-stacey-abrams-to-give-state-of-1832262556', 'https://www.theonion.com/dole-reveals-one-cantaloupe-out-there-contains-10-mill-1832761706', 'https://www.theonion.com/el-chapo-given-life-sentence-1832628398', 'https://www.theonion.com/elizabeth-warren-apologizes-for-dna-test-1832357023', 'https://www.theonion.com/exclusive-tsa-pre-check-allows-passengers-to-fly-withou-1832429464', 'https://www.theonion.com/fbi-opened-inquiry-into-whether-trump-working-for-russi-1831766180', 'https://www.theonion.com/female-brains-more-youthful-than-male-ones-1832662931', 'https://www.theonion.com/fly-my-pretties-says-jeff-bezos-releasing-swarm-of-1832469973', 'https://www.theonion.com/fringe-catholic-sect-doesn-t-tolerate-child-abuse-1832400583', 'https://www.theonion.com/gop-leaders-condemn-steve-king-for-white-supremacy-comm-1831781641', 'https://www.theonion.com/government-reopens-1832065705', 'https://www.theonion.com/growing-number-rely-on-traditional-outlets-amidst-growt-1832049615', 'https://www.theonion.com/hawaii-could-increase-legal-age-of-smoking-to-100-1832440825', 'https://www.theonion.com/heart-on-1832557777', 'https://www.theonion.com/hurry-there-s-a-violent-black-woman-attacking-my-daug-1832442005', 
'https://www.theonion.com/it-s-not-an-easy-thing-to-admit-when-you-re-wrong-and-1831546390', 'https://www.theonion.com/kamala-harris-enters-2020-race-1831961228', 'https://www.theonion.com/karl-lagerfeld-horrified-by-uninspired-garish-tunnel-o-1832730087', 'https://www.theonion.com/mass-invasion-of-polar-bear-forces-russian-islands-to-d-1832662381', 'https://www.theonion.com/maybelline-announces-it-will-stop-testing-new-products-1832611826', 'https://www.theonion.com/meals-on-wheels-volunteers-deliver-body-chocolate-edib-1832630215', 'https://www.theonion.com/mental-health-experts-recommend-calling-fratricide-prev-1832763005', 'https://www.theonion.com/michael-cohen-says-he-paid-to-rig-polls-in-trump-s-favo-1831882786', 'https://www.theonion.com/mit-teaches-robot-how-to-play-jenga-1832436898', 'https://www.theonion.com/mta-urges-riders-to-stop-taking-disabled-passengers-1832750748', 'https://www.theonion.com/mueller-probe-may-end-next-week-1832815321', 'https://www.theonion.com/nation-celebrates-mlk-day-1831924233', 'https://www.theonion.com/nation-celebrates-valentine-s-day-1832634022', 'https://www.theonion.com/national-geographic-increases-ideological-diversity-b-1832592053', 'https://www.theonion.com/netflix-raising-prices-1831837622', 'https://www.theonion.com/nuclear-climate-threats-keep-doomsday-clock-close-to-m-1832136888', 'https://www.theonion.com/pelosi-asks-trump-to-delay-state-of-the-union-during-sh-1831848198', 'https://www.theonion.com/pentagon-allocates-600-000-for-actual-gun-used-in-sca-1832529286', 'https://www.theonion.com/photo-of-egg-breaks-world-record-for-most-liked-instagr-1831802942', 'https://www.theonion.com/playstation-ceo-predicts-post-console-world-1832828491', 'https://www.theonion.com/plummeting-insect-numbers-could-cause-collapse-of-ecosy-1832572071', 'https://www.theonion.com/polar-vortex-splits-into-u-s-chilling-rings-1831883442', 'https://www.theonion.com/poll-most-americans-blame-trump-for-shutdown-1831748715', 
'https://www.theonion.com/pope-francis-holds-sex-abuse-summit-1832832780', 'https://www.theonion.com/pope-francis-offers-molested-kids-10-off-at-vatican-ci-1832792131', 'https://www.theonion.com/pope-spends-afternoon-filling-in-glory-holes-all-over-s-1832756030', 'https://www.theonion.com/record-breaking-cold-grips-midwest-1832227529', 'https://www.theonion.com/record-number-now-consider-climate-change-personal-issu-1832122283', 'https://www.theonion.com/roger-stone-open-to-cooperating-with-mueller-1832193201', 'https://www.theonion.com/roku-pulls-infowars-channel-following-complaints-1832015457', 'https://www.theonion.com/ruth-bader-ginsburg-returns-to-supreme-court-after-surg-1832782912', 'https://www.theonion.com/screen-time-bad-for-toddler-development-1832270750', 'https://www.theonion.com/sequel-to-original-ghostbusters-being-made-1832157030', 'https://www.theonion.com/shutdown-cost-u-s-economy-11-billion-1832245091', 'https://www.theonion.com/sighing-banksy-methodically-kills-another-few-kids-who-1832626570', 'https://www.theonion.com/spacecraft-travel-from-all-over-galaxy-to-honor-end-of-1832602862', 'https://www.theonion.com/spanx-introduces-new-line-of-smoke-bombs-for-concealing-1832814966', 'https://www.theonion.com/stock-market-soars-after-investors-decide-that-would-be-1832656593', 'https://www.theonion.com/study-vaping-better-aid-to-quit-smoking-than-gum-or-pa-1832469664', 'https://www.theonion.com/suicide-rates-falling-worldwide-1832590952', 'https://www.theonion.com/super-bowl-set-for-rams-patriots-1831980704', 'https://www.theonion.com/survey-finds-many-gamers-never-finished-red-dead-redem-1832528813', 'https://www.theonion.com/sweating-cornnuts-vp-stammers-way-through-pitch-for-nu-1832794262', 'https://www.theonion.com/t-rex-may-be-smaller-than-previously-thought-report-1832828723', 'https://www.theonion.com/taco-hell-1832722728', 'https://www.theonion.com/the-favourite-roma-top-oscars-nominations-with-10-1831992898', 
'https://www.theonion.com/top-democrats-call-on-virginia-governor-to-resign-1832371570', 'https://www.theonion.com/trump-approval-plunges-amidst-shutdown-1831925049', 'https://www.theonion.com/trump-delivers-state-of-the-union-1832407870', 'https://www.theonion.com/trump-installs-room-sized-golf-simulator-in-white-house-1832733357', 'https://www.theonion.com/tumor-covered-chester-cheetah-apologizes-for-role-in-ma-1832648266', 'https://www.theonion.com/u-k-parliament-rejects-theresa-may-s-brexit-deal-1831815722', 'https://www.theonion.com/u-k-passes-bill-making-upskirting-illegal-1832702627', 'https://www.theonion.com/u-s-and-taliban-agree-to-framework-for-peace-deal-1832171861', 'https://www.theonion.com/u-s-falls-in-annual-global-corruption-rankings-1832324276', 'https://www.theonion.com/vaccinations-soar-by-500-in-measles-outbreak-county-1832561505', 'https://www.theonion.com/venezuela-president-u-s-invasion-would-be-worse-than-1832334727', 'https://www.theonion.com/yosemite-expands-lodging-accommodations-with-new-log-ca-1832648061', 'https://www.theonion.com/youtube-bans-dangerous-prank-videos-1832028771'])

Scraping #1 of 1 articles

Not worthy of scraping 2
Scraping #3 of 2 articles

Scraping #4 of 3 articles

Scraping #5 of 4 articles

Not worthy of scraping 6
Scraping #7 of 5 articles

Scraping #8 of 6 articles

Scraping #9 of 7 articles

Scraping #10 of 8 articles

Scraping #11 of 9 articles

Scraping #12 of 10 articles

Not worthy of scraping 13
Scraping #14 of 11 articles

Scraping #15 of 12 articles

Not worthy of scraping 16
Not worthy of scraping 17
Scraping #18 of 13 articles

Not worthy of scraping 19
Scraping #20 of 14 articles

Scraping #21 of 15 articles

Not worthy of scraping 22
Not worthy of scraping 23
Scraping #24 of 16 articles

Scraping #25 of 17 articles

Scraping #26 of 18 articles

Scraping #27 of 19 articles

Scraping #28 of 20 articles

Scraping #29 of 21 articles

Not worthy of scraping 30
Not worthy of scraping 31
Scraping #32 of 22 articles

Scraping #33 of 23 articles

Scraping #34 of 24 articles

Scraping #35 of 25 articles

Scraping #36 of 26 articles

Scra

Nothing to scrape for link # 200 Message: no such element: Unable to locate element: {"method":"class name","selector":"entry-content"}
  (Session info: chrome=72.0.3626.109)
  (Driver info: chromedriver=2.45.615355 (d5698f682d8b2742017df6c81e0bd8e6a3063189),platform=Mac OS X 10.13.6 x86_64)

Nothing to scrape for link # 201 Message: no such element: Unable to locate element: {"method":"class name","selector":"entry-content"}
  (Session info: chrome=72.0.3626.109)
  (Driver info: chromedriver=2.45.615355 (d5698f682d8b2742017df6c81e0bd8e6a3063189),platform=Mac OS X 10.13.6 x86_64)

Nothing to scrape for link # 202 Message: no such element: Unable to locate element: {"method":"class name","selector":"entry-content"}
  (Session info: chrome=72.0.3626.109)
  (Driver info: chromedriver=2.45.615355 (d5698f682d8b2742017df6c81e0bd8e6a3063189),platform=Mac OS X 10.13.6 x86_64)

Not worthy of scraping 203
Not worthy of scraping 204
Scraping #205 of 134 articles

Scraping #206 of 135 articles

No

Unnamed: 0,body_content,url,date,title,length,source,source_id,satire_or_not,label
0,"EDMONTON, ALBERTA—Promising that they were tirelessly working to resolve issues with the long-an...",https://entertainment.theonion.com/anthem-developers-assure-players-whiteboard-with-word-1832762063,2019-02-20,‘Anthem’ Developers Assure Players Whiteboard With Words ‘Jetpack+Guns?’ Will Be Playable Game B...,1.0,Author not specified,The Onion,satire,1
1,"The Academy Awards inspire debate every year about whether certain films, directors, and actors ...",https://entertainment.theonion.com/biggest-snubs-in-oscars-history-1832755613,2019-02-20,Biggest Snubs In Oscars History,1.5,Author not specified,The Onion,satire,1
2,"BURBANK, CA—Emphasizing that losing even just a few comic book fans would be a “fate worse than ...",https://entertainment.theonion.com/bob-iger-at-disney-we-live-every-day-in-terror-that-y-1830987620,2018-12-10,"Bob Iger: At Disney, We Live Every Day In Terror That You’ll Turn On Superhero Movies",0.9,Author not specified,The Onion,satire,1
3,"FRAMINGHAM, MA—Calling the product the must-have item for true music connoisseurs, Bose official...",https://entertainment.theonion.com/bose-releases-new-headphones-specifically-optimized-for-18312...,2018-12-21,Bose Releases New Headphones Specifically Optimized For Listening To Whitney Houston’s ‘How Will...,0.9,Author not specified,The Onion,satire,1
4,"EUGENE, OR—Citing the potential threat to the safety of their children, a coalition of concerned...",https://entertainment.theonion.com/coalition-of-concerned-parents-condemns-video-games-fa-183065...,2018-11-26,Coalition Of Concerned Parents Condemns Video Games’ False Depiction Of How Easy It Is To Smash ...,1.4,Author not specified,The Onion,satire,1
5,"CHICAGO—Expressing their sincere regret for repeating the 2004 incident, representatives for Dav...",https://entertainment.theonion.com/dave-matthews-band-apologizes-after-tour-bus-dumps-anot-18307...,2018-11-28,Dave Matthews Band Apologizes After Tour Bus Dumps Another 800 Pounds Of Human Shit Onto Same Bo...,1.1,Author not specified,The Onion,satire,1
6,"BURBANK, CA—Upon the release of the much-awaited third installment of the popular video game fra...",https://entertainment.theonion.com/disney-announces-kingdom-hearts-iii-will-feature-erne-1832156301,2019-01-29,"Disney Announces ‘Kingdom Hearts III’ Will Feature Ernest, Turner, Hooch, And All The Rest Of Yo...",1.1,Author not specified,The Onion,satire,1
7,"LONDON—Shedding light on her creative process amid announcements of her forthcoming book, Fifty ...",https://entertainment.theonion.com/e-l-james-admits-new-erotic-novel-originally-tiny-too-1832029592,2019-01-24,E.L. James Admits New Erotic Novel Originally ‘Tiny Toons’ Fan Fiction,1.0,Author not specified,The Onion,satire,1
8,"BROOKLYN, NY—Shocked, disillusioned, and even somewhat betrayed by the unlikely pairing, fans of...",https://entertainment.theonion.com/fans-shocked-after-marie-kondo-reveals-she-has-been-dat-18318...,2019-01-18,Fans Shocked After Marie Kondo Reveals She Has Been Dating Untidy Cupboard For Past 6 Months,1.2,Author not specified,The Onion,satire,1
9,"LOS ANGELES—In a concentrated effort to ensure the movie set felt like a safe, supportive place ...",https://entertainment.theonion.com/female-director-asked-if-she-feels-comfortable-filming-183043...,2018-11-14,Female Director Asked If She Feels Comfortable Filming Scene While Nude,0.7,Author not specified,The Onion,satire,1


In [None]:
# body = driver.find_element_by_class_name('entry-content').text
# body
# date = driver.find_element_by_css_selector('time').find_element_by_css_selector('a').get_attribute('title')
# date = pd.to_datetime(date).date().strftime('%Y-%m-%d')
# date
# title = driver.find_element_by_class_name('entry-title').text
# title
# source = 'Author not specified' 
# source_id = 'The Onion'   

In [167]:
# scrape_onion_links(test1, test2)

In [179]:
# Collect article links for both topic groups defined near the top of the notebook
# (onion_same_format_topics and onion_diff_format_topic_urls).
# scrape_onion_links_base is defined in another cell; judging by the saved output
# below, it logs progress per topic and its return value (a list of article URLs)
# is displayed as the cell result. This drives the live browser, so it is slow.
scrape_onion_links_base(onion_same_format_topics, onion_diff_format_topic_urls)

Getting 20 politics article links
Getting 20 more politics article links
Getting 20 more politics article links
Getting 20 sports article links
Getting 20 more sports article links
Getting 20 more sports article links
Getting 20 local article links
Getting 20 more local article links
Getting 20 more local article links
Getting 20 entertainment article links
Getting 20 more entertainment article links
Getting 20 more entertainment article links
Total of 227 article links scraped from first format of topics

Getting 20 article links
Getting 20 more news-in-brief article links
Getting 20 more news-in-brief article links
Getting 20 article links
Getting 20 more opinion article links
Getting 20 more opinion article links
Total of 120 article links scraped from second format of topics


313 total ONION article links scrapes


['https://entertainment.theonion.com/5-things-to-know-about-aquaman-1831184162',
 'https://entertainment.theonion.com/5-things-to-know-about-fantastic-beasts-the-crimes-of-1830410428',
 'https://entertainment.theonion.com/5-things-to-know-about-glass-1831848711',
 'https://entertainment.theonion.com/5-things-to-know-about-kingdom-hearts-3-1832056612',
 'https://entertainment.theonion.com/5-things-to-know-about-super-smash-bros-ultimate-1830937312',
 'https://entertainment.theonion.com/anthem-developers-assure-players-whiteboard-with-word-1832762063',
 'https://entertainment.theonion.com/back-to-back-broadcasts-of-big-happening-on-tbs-appar-1831183203',
 'https://entertainment.theonion.com/biggest-snubs-in-oscars-history-1832755613',
 'https://entertainment.theonion.com/bob-iger-at-disney-we-live-every-day-in-terror-that-y-1830987620',
 'https://entertainment.theonion.com/bose-releases-new-headphones-specifically-optimized-for-1831258410',
 'https://entertainment.theonion.com/bryan-sing

In [181]:
# scrape_onion_links_add_more(onion_same_format_topics, onion_diff_format_topic_urls)

# Old notes

In [2]:
# onion_same_format_topics = ['politics', 'sports', 'local', 'entertainment']
# onion_diff_format_topics = ['news', 'opinion']


# headlines = []
# body = []
# category = []
# topic = []
# mega_articles = []
# page_links = []

In [None]:
# tl1 = ['https://politics.theonion.com/'
# 'https://sports.theonion.com/'
# 'https://local.theonion.com/'
# 'https://entertainment.theonion.com/']

# tl2 = ['https://www.theonion.com/c/news-in-brief'
# 'https://www.theonion.com/tag/opinion']

In [18]:
# Query the page currently loaded in `driver`.
# NOTE(review): no driver.get(...) call is visible in this part of the notes, so
# which page is loaded depends on earlier kernel/browser state.
# `links` is not referenced by any other cell in this part of the notes.
links = driver.find_elements_by_class_name("sc-1out364-0")
# Elements with class "headline"; the cells below read the <a> href inside each one.
headers = driver.find_elements_by_class_name("headline")

In [28]:
# Append the href of the <a> found inside each headline element to page_links.
# NOTE(review): in this part of the notes page_links is only assigned in a
# commented-out line, so it must already exist in the kernel for this cell to run.
# Re-running the cell appends the same hrefs again (nothing is de-duplicated here).
for h in headers: 
    link = h.find_element_by_css_selector('a')
#     print(link.get_attribute('href'))
    page_links.append(link.get_attribute('href'))
    
# test = headers[0].find_element_by_css_selector('a')

https://www.theonion.com/marine-biologists-train-highly-intelligent-octopus-to-p-1832053903
https://entertainment.theonion.com/the-conners-scores-big-ratings-by-killing-off-rest-of-1832030980#_ga=2.205524947.1986633449.1548434714-953081957.1548434714
https://entertainment.theonion.com/e-l-james-admits-new-erotic-novel-originally-tiny-too-1832029592
https://politics.theonion.com/emaciated-peter-alexander-burns-podium-for-warmth-after-1832026399
https://local.theonion.com/teen-weirded-out-after-running-over-english-teacher-out-1832025329
https://local.theonion.com/masked-vigilante-takes-terrorizing-black-community-into-1832019922
https://entertainment.theonion.com/study-no-two-people-have-listened-to-same-band-since-2-1832012299
https://local.theonion.com/doctors-assure-recovering-patient-he-has-many-more-year-1831993642
https://www.theonion.com/study-30-of-people-who-quit-smoking-relapse-after-sha-1831992601
https://local.theonion.com/self-conscious-puppet-has-no-idea-what-to-do-with-ha

In [31]:
# page_links[0]
# Find the link whose text is exactly 'More stories' on the current page and click it.
# find_element_by_link_text raises if no such link is present.
more_btn = driver.find_element_by_link_text('More stories')
more_btn.click()

<selenium.webdriver.remote.webelement.WebElement (session="89fa50a846e7056e5fa0ee1c4822d373", element="0.19086444149478-101")>

In [34]:
# Re-query the headline elements on whatever page `driver` now shows (presumably
# after the 'More stories' click above -- confirm) and display how many were found.
headers = driver.find_elements_by_class_name("headline")
len(headers)


20

In [35]:
# Print and collect the href of the <a> inside each headline element.
# Appends to the kernel-level page_links list, so hrefs gathered by earlier runs
# stay in the list and repeated runs add duplicates.
for h in headers: 
    link = h.find_element_by_css_selector('a')
    print(link.get_attribute('href'))
    page_links.append(link.get_attribute('href'))

https://local.theonion.com/doctor-weirded-out-by-patient-she-just-met-providing-ev-1831914381
https://www.theonion.com/orlando-locals-fear-town-starting-to-become-overrun-by-1831914143
https://www.theonion.com/ice-launches-campaign-to-reunite-immigrant-children-wit-1831875845
https://www.theonion.com/iss-astronaut-sick-of-sharing-confined-space-with-crass-1831874433
https://entertainment.theonion.com/fans-shocked-after-marie-kondo-reveals-she-has-been-dat-1831872362
https://local.theonion.com/woman-didn-t-know-progress-on-toxic-masculinity-would-t-1831869468
https://politics.theonion.com/john-bolton-insists-iran-likely-harboring-dangerous-ter-1831852419
https://politics.theonion.com/don-t-make-me-regret-this-mueller-tells-rick-gates-b-1831849285
https://politics.theonion.com/poll-finds-100-of-americans-blame-shutdown-entirely-on-1831845310
https://politics.theonion.com/presumptuous-congressional-freshman-thinks-she-can-just-1831842239
https://www.theonion.com/ames-executives-scrambling

In [13]:
def scroll_down(times=4, pause=1):
	"""Scroll the page loaded in the notebook-level ``driver`` to the bottom, repeatedly.

	Before each scroll a best-effort attempt is made to click the element with
	class ``button2`` (presumably a modal/pop-up button -- confirm against the
	live page). If it cannot be found or clicked, the scroll still happens.

	Args:
		times: number of scroll passes. Defaults to 4, matching the original
			``range(1, 5)`` loop.
		pause: seconds to sleep after each scroll. Defaults to 1.

	Returns:
		None.
	"""
	for _ in range(times):
		try:
			modal_button = driver.find_element_by_class_name("button2")
			webdriver.ActionChains(driver).move_to_element(modal_button).click(modal_button).perform()
		except Exception:
			# Deliberately best-effort: the button is usually absent. Catch
			# Exception (not a bare except) so Ctrl-C / kernel interrupt still
			# stops a long scraping run.
			pass
		driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
		time.sleep(pause)


def get_onion_arts():
	"""Scrape topic, headline and first paragraph of articles for every topic in ``on_topics``.

	For each topic the function opens the topic page, clicks the first headline,
	calls ``scroll_down()`` and ``get_more()`` and then reads every element with
	class ``js_reading-list-item`` from the resulting page.

	Relies on notebook-level names: ``driver``, ``webdriver``, ``scroll_down``,
	``get_more`` and ``on_topics``.
	NOTE(review): ``on_topics`` is not assigned in the visible part of these
	notes -- it must already exist in the kernel.
	NOTE(review): ``get_more()`` is called without arguments for every topic,
	exactly as in the original code.

	Returns:
		list of ``[topic, headline, body]`` lists. When no ``<p>`` is found the
		body is the placeholder string ``"no body"``; items without a headline
		are skipped. (The original code stored a bare ``["no body"]`` record in
		the first case, losing the topic and headline.)
	"""
	articles = []
	for topic in on_topics:
		# Topic pages live on sub-domains, except 'opinion', which is a tag page.
		if topic == 'opinion':
			url = "https://www.theonion.com/tag/opinion"
		else:
			url = "http://" + topic + ".theonion.com"

		driver.get(url)

		# Two page layouts: try the first headline wrapper class, then fall back.
		# If the fallback lookup fails too, its exception propagates to the caller.
		try:
			first_art_link = driver.find_element_by_class_name("content-meta__headline__wrapper")
		except Exception:
			first_art_link = driver.find_element_by_class_name("headline")

		webdriver.ActionChains(driver).move_to_element(first_art_link).click(first_art_link).perform()

		scroll_down()
		get_more()

		all_arts = driver.find_elements_by_class_name('js_reading-list-item')

		for art in all_arts:
			# Best-effort click on the "button2" element before reading each item.
			try:
				modal_button = driver.find_element_by_class_name("button2")
				webdriver.ActionChains(driver).move_to_element(modal_button).click(modal_button).perform()
			except Exception:
				pass

			try:
				headline = art.find_element_by_xpath(".//div[1]/header/header/h1/a").text
			except Exception:
				print("no headline")
				continue

			try:
				body = art.find_element_by_tag_name('p').text
			except Exception:
				# Keep the record shape consistent so rows line up downstream.
				body = "no body"
				print("no body!")

			articles.append([topic, headline, body])
	return articles

def get_more(topic="Politics", attempts=4):
	"""Click the "Load next <topic> story" element up to ``attempts`` times.

	Uses the notebook-level ``driver``. Each attempt looks for any element whose
	text contains ``Load next <topic> story`` and clicks it; when the element is
	missing or the click fails, the attempt is skipped without sleeping.

	Args:
		topic: topic word as it appears in the button label. Defaults to
			"Politics", the label the original code hard-coded.
			NOTE(review): labels for other topics are assumed to follow the
			same pattern -- confirm on the live site.
		attempts: how many times to try. Defaults to 4, matching the original
			``range(1, 5)`` loop.

	Returns:
		None.
	"""
	xpath = "//*[contains(text(), 'Load next {} story')]".format(topic)
	for _ in range(attempts):
		try:
			next_b = driver.find_element_by_xpath(xpath)
			webdriver.ActionChains(driver).move_to_element(next_b).click(next_b).perform()
			time.sleep(.5)
		except Exception:
			# Best-effort: the button may simply not be on the page. Catch
			# Exception (not a bare except) so a kernel interrupt still stops it.
			pass