## Web scraping: PolitiFact
Source: https://www.politifact.com/factchecks/list/?page=1

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

# None lifts the column-width limit so long statement text is shown in full instead of being truncated
pd.set_option('display.max_colwidth', None)

In [2]:
# Module-level accumulators filled by scrape_website(); one entry per fact-check in every list.
# The generator yields a fresh list for each name, so the five lists never alias one another.
authors, dates, statements, sources, targets = ([] for _ in range(5))

In [3]:
# Seconds to wait for PolitiFact before giving up; without a timeout requests can block forever
REQUEST_TIMEOUT_SECONDS = 30


def _split_byline(footer_text):
    """Split a statement footer into ``(author, date)``.

    The footer is tokenised on whitespace and read as
    ``<prefix> <author name ...> <separator> <month> <day> <year>``.
    The separator is taken to be the first token after the first name token
    that contains no letters or digits; when no such token exists the
    separator is assumed to sit at index 3, which is the fixed layout the
    notebook originally relied on.  Locating the separator instead of
    hard-coding it lets author names with one, two or more words parse
    without shifting the date fields.

    Raises:
        ValueError: if fewer than three tokens follow the separator.
    """
    tokens = footer_text.split()
    separator_index = next(
        (
            index
            for index, token in enumerate(tokens)
            if index > 1 and not any(char.isalnum() for char in token)
        ),
        3,
    )
    date_tokens = tokens[separator_index + 1:separator_index + 4]
    if len(date_tokens) != 3:
        raise ValueError(f'Unexpected statement footer: {footer_text!r}')

    author = ' '.join(tokens[1:separator_index])
    month, day, year = date_tokens
    # Day-first order with every token kept verbatim, so new rows use the
    # same date string format as rows that were scraped previously.
    return author, day + ' ' + month + ' ' + year


def scrape_website(page_number):
    """Scrape one PolitiFact fact-check list page into the module-level lists.

    Appends one entry per fact-check to ``authors``, ``dates``,
    ``statements``, ``sources`` and ``targets``.  Everything is parsed into
    page-local lists first and only copied to the shared lists once the whole
    page has parsed and all five lists have the same length, so a failure
    never leaves the shared lists partially updated or misaligned.

    Args:
        page_number: Page index to insert into the ``?page=`` query parameter.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if a footer cannot be parsed or the page yields a
            different number of footers, quotes, sources and ratings.
    """
    url = f'https://www.politifact.com/factchecks/list/?page={page_number}'
    webpage = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly instead of parsing an error page and silently collecting nothing
    webpage.raise_for_status()
    soup = BeautifulSoup(webpage.text, 'lxml')

    # Locate the four pieces of every fact-check entry
    statement_footer = soup.find_all('footer', {'class': 'm-statement__footer'})  # author and date
    statement_quote = soup.find_all('div', {'class': 'm-statement__quote'})  # quoted statement
    statement_meta = soup.find_all('div', {'class': 'm-statement__meta'})  # source of the statement
    statement_meter = soup.find_all('div', {'class': 'm-statement__meter'})  # rating (score card)

    page_authors = []
    page_dates = []
    for footer in statement_footer:
        author, date = _split_byline(footer.text)
        page_authors.append(author)
        page_dates.append(date)

    # The first link inside each quote / meta element carries the text of interest
    page_statements = [quote.find_all('a')[0].text.strip() for quote in statement_quote]
    page_sources = [meta.find_all('a')[0].text.strip() for meta in statement_meta]

    # The rating is read from the alt text of the score-card image
    page_targets = [
        meter.find('div', {'class': 'c-image'}).find('img')['alt']
        for meter in statement_meter
    ]

    page_columns = (page_authors, page_dates, page_statements, page_sources, page_targets)
    if len({len(column) for column in page_columns}) > 1:
        raise ValueError(
            f'Page {page_number}: element counts differ '
            f'({[len(column) for column in page_columns]}); rows would be misaligned'
        )

    authors.extend(page_authors)
    dates.extend(page_dates)
    statements.extend(page_statements)
    sources.extend(page_sources)
    targets.extend(page_targets)

In [4]:
# Scrape list pages 1 .. n-1 (range() excludes n, so n = 2 fetches page 1 only)
n = 2
# Empty the accumulators first: scrape_website() appends to them, so re-running
# this cell without clearing would duplicate every row.
for scraped_values in (authors, dates, statements, sources, targets):
    scraped_values.clear()
for i in range(1, n):
    scrape_website(i)

In [5]:
# Assemble the scraped lists into a single frame, one column per list.
# The mapping's key order fixes both the column order and the assignment order.
scraped_columns = {
    'author': authors,
    'statement': statements,
    'source': sources,
    'date': dates,
    'target': targets,
}
data = pd.DataFrame(columns=list(scraped_columns)).assign(**scraped_columns)

data

Unnamed: 0,author,statement,source,date,target
0,Tom Kertscher,Says John McAfee tweeted before his death that he stored computer files in the Florida condo building that later collapsed.,Instagram posts,"28, June 2021",false
1,Kelsey Tamakloe,“Airlines begin to ban vaccinated people after Pilots die due to vaccine”,Facebook posts,"28, June 2021",false
2,Andy Nguyen,"""Japan has banned all (Black Lives Matter) apparel from the Olympics. No one can kneel or raise fists during the anthems either.""",Facebook posts,"28, June 2021",half-true
3,Kiannah Sepeda-Miller,“Prairie State power plant is the cleanest coal-fired power plant in the nation.”,Rodney Davis,"27, June 2021",pants-fire
4,Michael Majchrowicz,A flag expressing support for police officers violates the U.S. Flag Code.,Facebook posts,"25, June 2021",false
5,Jon Greenberg,"""The Second Amendment, from the day it was passed, limited the type of people who could own a gun and what type of weapon you could own.”",Joe Biden,"25, June 2021",false
6,Samantha Putterman,Says the U.S. government caused the coronavirus pandemic because it sent mRNA coronavirus vaccine candidates to university researchers weeks before the COVID-19 outbreak.,Bloggers,"25, June 2021",false
7,Tom Kertscher,Says Ivanka Trump was targeted in the Florida condominium building collapse.,Facebook posts,"25, June 2021",false
8,Ciara O'Rourke,"Says Gov. Ron DeSantis “just signed legislation requiring students, faculty and staff at Florida’s public universities and colleges to register their political views with the state.”",Viral image,"25, June 2021",false
9,Tom Kertscher,mRNA technology in COVID-19 vaccines was never tested in humans.,Instagram posts,"25, June 2021",false


In [6]:
# Display the scraped frame again; no cell has modified it since it was built
data

Unnamed: 0,author,statement,source,date,target
0,Tom Kertscher,Says John McAfee tweeted before his death that he stored computer files in the Florida condo building that later collapsed.,Instagram posts,"28, June 2021",false
1,Kelsey Tamakloe,“Airlines begin to ban vaccinated people after Pilots die due to vaccine”,Facebook posts,"28, June 2021",false
2,Andy Nguyen,"""Japan has banned all (Black Lives Matter) apparel from the Olympics. No one can kneel or raise fists during the anthems either.""",Facebook posts,"28, June 2021",half-true
3,Kiannah Sepeda-Miller,“Prairie State power plant is the cleanest coal-fired power plant in the nation.”,Rodney Davis,"27, June 2021",pants-fire
4,Michael Majchrowicz,A flag expressing support for police officers violates the U.S. Flag Code.,Facebook posts,"25, June 2021",false
5,Jon Greenberg,"""The Second Amendment, from the day it was passed, limited the type of people who could own a gun and what type of weapon you could own.”",Joe Biden,"25, June 2021",false
6,Samantha Putterman,Says the U.S. government caused the coronavirus pandemic because it sent mRNA coronavirus vaccine candidates to university researchers weeks before the COVID-19 outbreak.,Bloggers,"25, June 2021",false
7,Tom Kertscher,Says Ivanka Trump was targeted in the Florida condominium building collapse.,Facebook posts,"25, June 2021",false
8,Ciara O'Rourke,"Says Gov. Ron DeSantis “just signed legislation requiring students, faculty and staff at Florida’s public universities and colleges to register their political views with the state.”",Viral image,"25, June 2021",false
9,Tom Kertscher,mRNA technology in COVID-19 vaccines was never tested in humans.,Instagram posts,"25, June 2021",false


In [7]:
# Numeric label derived from the rating string
def getBinaryNumTarget(text):
    """Return 1 when ``text`` is exactly ``'true'``, otherwise 0.

    The comparison is exact, so any other rating string (for example
    ``'half-true'`` or ``'false'``) maps to 0.
    """
    return 1 if text == 'true' else 0

In [8]:
# Text label ('REAL' / 'FAKE') derived from the rating string
def getBinaryTarget(text):
    """Return ``'REAL'`` when ``text`` is exactly ``'true'``, otherwise ``'FAKE'``.

    The comparison is exact, so any other rating string (for example
    ``'half-true'`` or ``'false'``) maps to ``'FAKE'``.
    """
    rated_true = text == 'true'
    return 'REAL' if rated_true else 'FAKE'

In [9]:
# Derive both label columns element-wise from the scraped rating column
data['BinaryTarget'] = data['target'].map(getBinaryTarget)
data['BinaryNumTarget'] = data['target'].map(getBinaryNumTarget)

In [10]:
# Display the frame including the BinaryTarget and BinaryNumTarget columns added above
data

Unnamed: 0,author,statement,source,date,target,BinaryTarget,BinaryNumTarget
0,Tom Kertscher,Says John McAfee tweeted before his death that he stored computer files in the Florida condo building that later collapsed.,Instagram posts,"28, June 2021",false,FAKE,0
1,Kelsey Tamakloe,“Airlines begin to ban vaccinated people after Pilots die due to vaccine”,Facebook posts,"28, June 2021",false,FAKE,0
2,Andy Nguyen,"""Japan has banned all (Black Lives Matter) apparel from the Olympics. No one can kneel or raise fists during the anthems either.""",Facebook posts,"28, June 2021",half-true,FAKE,0
3,Kiannah Sepeda-Miller,“Prairie State power plant is the cleanest coal-fired power plant in the nation.”,Rodney Davis,"27, June 2021",pants-fire,FAKE,0
4,Michael Majchrowicz,A flag expressing support for police officers violates the U.S. Flag Code.,Facebook posts,"25, June 2021",false,FAKE,0
5,Jon Greenberg,"""The Second Amendment, from the day it was passed, limited the type of people who could own a gun and what type of weapon you could own.”",Joe Biden,"25, June 2021",false,FAKE,0
6,Samantha Putterman,Says the U.S. government caused the coronavirus pandemic because it sent mRNA coronavirus vaccine candidates to university researchers weeks before the COVID-19 outbreak.,Bloggers,"25, June 2021",false,FAKE,0
7,Tom Kertscher,Says Ivanka Trump was targeted in the Florida condominium building collapse.,Facebook posts,"25, June 2021",false,FAKE,0
8,Ciara O'Rourke,"Says Gov. Ron DeSantis “just signed legislation requiring students, faculty and staff at Florida’s public universities and colleges to register their political views with the state.”",Viral image,"25, June 2021",false,FAKE,0
9,Tom Kertscher,mRNA technology in COVID-19 vaccines was never tested in humans.,Instagram posts,"25, June 2021",false,FAKE,0


In [11]:
# data.to_csv('political_fact_checker.csv', index=False)