In [1]:
# Packages for Data Wrangling
import numpy as np
import pandas as pd
import os

# For graphs
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Use cleaner absolute path to find file
path = os.path.abspath('politifact.csv')
df = pd.read_csv(path, index_col=0)

In [3]:
# Check for null values and total number of rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16611 entries, 0 to 16610
Data columns (total 4 columns):
statement    16611 non-null object
source       16611 non-null object
link         16611 non-null object
veracity     16611 non-null object
dtypes: object(4)
memory usage: 648.9+ KB


In [4]:
# to see statement in full
pd.set_option('display.max_colwidth', 200) 

# See sample of data
df.head(3)

Unnamed: 0,statement,source,link,veracity
0,"Says that in 1770 ""British parliament banned lipstick, saying it had the power to seduce men into marriage, which was classified as witchcraft.""",Facebook posts,/facebook-fact-checks/statements/2019/oct/25/facebook-posts/no-british-parliament-didnt-ban-witchcraft-lipstic/,Pants on Fire!
1,"Says Ann Landers said, ""At age 20, we worry about what others think of us. At age 40, we don’t care what they think. At age 60, we discover that they have not been thinking of us at all.""",Viral image,/facebook-fact-checks/statements/2019/oct/25/viral-image/no-evidence-ann-landers-said-quote-about-age/,False
2,"""General Motors is making record profits.""",Glenn Kage,/missouri/statements/2019/oct/25/glenn-kage/general-motors-profits-have-been-record-setting-no/,Half-True


In [5]:
# remove "link" column as not meaningful for this analysis
df2 = df[["source","statement","veracity"]]

In [6]:
# check groups that exist in veracity column
df2.veracity.value_counts()

False             3274
Half-True         3158
Mostly True       3010
Mostly False      2756
True              2340
Pants on Fire!    1817
Full Flop          159
Half Flip           70
No Flip             27
Name: veracity, dtype: int64

In [7]:
# Full Flop, Half Flip, and No Flip have nothing to do with veracity
# remove non-exclusive groups and Flop groups
df3 = df2[df2.isin(['True',
                    'False',
                    'Pants on Fire!']).any(axis=1)]

In [8]:
# The total number of rows should sum to 7431 (false(3274) + true(2340) + pants on fire(1817) = 7431)
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7431 entries, 0 to 16610
Data columns (total 3 columns):
source       7431 non-null object
statement    7431 non-null object
veracity     7431 non-null object
dtypes: object(3)
memory usage: 232.2+ KB


In [9]:
df3.veracity.value_counts()

False             3274
True              2340
Pants on Fire!    1817
Name: veracity, dtype: int64

# For the purposes of this analysis, "false" and "pants on fire" can be treated as equivalent, i.e., false statements. This does lead to a slight imbalance of classes (many more false statements than true statements).

In [10]:
df3.veracity = df3.veracity.map({'False': 'False', 'True': 'True', 'Pants on Fire!': 'False'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [11]:
# The total number of false rows should sum to 5091 (false(3274) + pants on fire(1817) = 5091)
df3.veracity.value_counts()

False    5091
True     2340
Name: veracity, dtype: int64

# Can we remove Facebook posts, Viral images, Chain emails and any other "groups" to focus on "individuals" ?

In [12]:
# see number of sources
len(df3.source.unique())

2469

# There are over 2000 unique sources, therefore removing "groups" from "individuals" by hand would be prone to error

In [13]:
df3.head(5)

Unnamed: 0,source,statement,veracity
0,Facebook posts,"Says that in 1770 ""British parliament banned lipstick, saying it had the power to seduce men into marriage, which was classified as witchcraft.""",False
1,Viral image,"Says Ann Landers said, ""At age 20, we worry about what others think of us. At age 40, we don’t care what they think. At age 60, we discover that they have not been thinking of us at all.""",False
3,Chain email,"""14,000 abandoned wind turbines litter the United States.""",False
6,Viral image,"Says Donald Trump said, ""Make no mistake: They’re not after me. They are after you; your guns, your Christian values, your freedom. I’m just in their way.""",False
7,Viral image,"""President Trump is asking everyone to forward this email… The TRUMP Rules: Congressional Reform Act of 2017.""",False


# Instead let's remove statements that contain "says [name] said" as there are 2 reasons this may be false: who spoke is untrue or what they said is untrue. We only want to focus on the what.