In [None]:
# Packages for Web-Scraping
from random import randint
from time import sleep
from time import time
from warnings import warn

from bs4 import BeautifulSoup
from IPython.core.display import clear_output
from requests import get
from requests.exceptions import RequestException

# Packages for Saving File after Scraping
import numpy as np
import pandas as pd

In [None]:
# Fetch the first listing page and preview the start of its HTML
url = 'https://www.politifact.com/truth-o-meter/statements/?page=1'
# A timeout keeps this cell from hanging indefinitely if the server never answers
response = get(url, timeout=30)
# Stop here on an HTTP error rather than letting later cells parse an error page
response.raise_for_status()
print(response.text[:500])

In [None]:
# Parse the first page's HTML with the built-in parser, then show the
# resulting object's type as the cell output
html_soup = BeautifulSoup(markup=response.text, features='html.parser')
type(html_soup)

In [None]:
# Collect every <div class="scoretable__item"> on the page (one per scraped row),
# then report the collection's type and how many were found
statement_containers = html_soup.find_all('div', class_='scoretable__item')
print(type(statement_containers), len(statement_containers), sep='\n')

In [None]:
# checking data: take the second container found on the page (index 1) as a
# sample and display it; the following cells probe this same `s` object
s = statement_containers[1]
s

In [None]:
# Statement text: the stripped text of the sample's <p class="statement__text">
sample_statement_tag = s.find('p', class_='statement__text')
sample_statement_tag.get_text(strip=True)

In [None]:
# Statement source: the stripped text of the sample's <div class="statement__source">
sample_source_tag = s.find('div', class_='statement__source')
sample_source_tag.get_text(strip=True)

In [None]:
# Statement link: href of the first <a> inside the sample's statement paragraph
sample_anchor = s.find('p', class_='statement__text').a
sample_anchor["href"]

In [None]:
# Statement veracity: alt text of the first <img> in the sample container
sample_image = s.find('img')
sample_image["alt"]

In [None]:
# Accumulators for the four fields scraped from the first page
statement = []
source = []
link = []
veracity = []

# Walk every container and append its fields, one list per field
for container in statement_containers:
    # The statement paragraph supplies both the text and the link
    text_tag = container.find('p', class_ = 'statement__text')

    # statement
    statement.append(text_tag.get_text(strip=True))

    # source
    source_tag = container.find('div', class_ = 'statement__source')
    source.append(source_tag.get_text(strip=True))

    # link
    link.append(text_tag.a["href"])

    # veracity
    veracity.append(container.find('img')["alt"])

In [None]:
# Build a DataFrame from the first page's lists, one column per scraped field
test_df = pd.DataFrame(
    {'statement': statement,
     'source': source,
     'link': link,
     'veracity': veracity
})
# info() prints its summary itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output
test_df.info()
test_df

In [None]:
# Testing 5 pages: dry run of the pacing and monitoring loop (no request is sent)
# range() excludes its stop value, so 6 is needed to get pages '1' through '5'
pages = [str(i) for i in range(1, 6)]
start_time = time()
requests = 0
# Iterate over the page list itself so the number of iterations always matches it
for page in pages:
    # request goes here
    requests += 1
    sleep(randint(1,3))
    current_time = time()
    elapsed_time = current_time - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
    # wait=True defers the clear until the next output arrives, so the counter
    # is replaced in place instead of flickering
    clear_output(wait = True)

In [None]:
# If above is problem-free, proceed to scrape all pages
# Scraping 834 pages takes about 4-5 hours

def _parse_statement_container(container):
    """Pull the four scraped fields out of one 'scoretable__item' container.

    Returns a (statement, source, link, veracity) tuple. A field whose tag or
    attribute is absent from the container comes back as None instead of
    raising, so a single malformed entry cannot abort the scrape.
    """
    text_tag = container.find('p', class_ = 'statement__text')
    source_tag = container.find('div', class_ = 'statement__source')
    anchor = text_tag.a if text_tag is not None else None
    image = container.img
    return (
        text_tag.get_text(strip=True) if text_tag is not None else None,
        source_tag.get_text(strip=True) if source_tag is not None else None,
        anchor.get('href') if anchor is not None else None,
        image.get('alt') if image is not None else None,
    )

# Lists to store all the scraped data in
statement = []
source = []
link = []
veracity = []

# Preparing the monitoring of the loop
start_time = time()
requests = 0

# For every page in the interval; range() excludes its stop value, so the stop
# must be 835 for page 834 to be requested
pages = [str(i) for i in range(1, 835)]
for page in pages:

    # Make a get request; the timeout stops one stalled connection from hanging
    # the whole run, and a network error is reported without ending the loop
    try:
        response = get('https://www.politifact.com/truth-o-meter/statements/?page=' + page,
                       timeout=30)
    except RequestException as error:
        response = None
        warn('Request for page {} failed: {}'.format(page, error))

    # Pause the loop in random intervals so your IP address doesn't get banned
    sleep(randint(8,15))

    # Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
    clear_output(wait = True)

    # Nothing to parse when the request itself failed
    if response is None:
        continue

    # Throw a warning for non-200 status codes and skip the page, since its
    # body is not a statements listing
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        continue

    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Select all the containers from a single page
    statement_containers = page_html.find_all('div', class_ = 'scoretable__item')

    for container in statement_containers:
        # Extract all four fields before appending any of them, so the four
        # lists always grow together and stay the same length
        sta, sou, lin, ver = _parse_statement_container(container)
        statement.append(sta)
        source.append(sou)
        link.append(lin)
        veracity.append(ver)

In [None]:
# Assemble every scraped record into one DataFrame, one column per field
politifact_df = pd.DataFrame(
    {'statement': statement,
     'source': source,
     'link': link,
     'veracity': veracity
})
# info() prints its summary itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output
politifact_df.info()
politifact_df.tail(2)

In [None]:
# Write the scraped records to 'politifact.csv' in the working directory
politifact_df.to_csv(path_or_buf='politifact.csv')

In [None]:
# Tutorial followed to create this scraper: 
# https://www.dataquest.io/blog/web-scraping-beautifulsoup/