In [2]:
# Packages for Data
import numpy as np
import pandas as pd
import os

# Packages for Graphics
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Packages for NLP
import re
from collections import Counter
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Packages for ML
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [20]:
# Expand the bare file name into an absolute path (resolved against the
# current working directory), then load it with the first CSV column as index
csv_name = 'politifact_cleaned_data.csv'
path = os.path.abspath(csv_name)
df = pd.read_csv(path, index_col=0)

In [21]:
# Raise the column width limit so each statement is displayed in full
pd.set_option('display.max_colwidth', 200)

# Preview every 5th row among the first 25 (row positions 0, 5, 10, 15, 20)
df.iloc[:25:5]

Unnamed: 0,date,source,freq,statement,veracity
18,2019-10-23,Jeremy Thiesfeldt,1,"""The vast majority of Wisconsin students cannot even read, write, or do math at grade level.""",False
61,2019-10-10,David McKinley,1,"""About 95 percent of America’s production of ethylene is produced on the Gulf Coast in Texas and Louisiana.""",True
91,2019-10-01,Gwen Moore,3,"""More than half of death row prisoners are people of color.""",True
208,2019-08-28,Tony Evers,2,"""1 out of every 4 car accidents in the U.S. is caused by texting and driving.""",False
295,2019-08-06,Ron Tusler,1,"""Less mass shootings under Trump!""",Pants on Fire!


In [27]:
# Summarise the frame: index range, per-column non-null counts and dtypes, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1455 entries, 18 to 16610
Data columns (total 5 columns):
date         1455 non-null object
source       1455 non-null object
freq         1455 non-null int64
statement    1455 non-null object
veracity     1455 non-null object
dtypes: int64(1), object(4)
memory usage: 68.2+ KB


In [28]:
# Class balance: number of statements per veracity label
df['veracity'].value_counts()

False             576
True              571
Pants on Fire!    308
Name: veracity, dtype: int64

# Baseline: Multinomial Naive Bayes model using only the "statement" feature, with no custom text preprocessing

In [22]:
# Features are the statement text; labels are the veracity ratings
X = df.statement
y = df.veracity

# Hold out 10% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.1, random_state = 0)

# Initialize a CountVectorizer object (bag-of-words token counts)
count_vectorizer = CountVectorizer()

# Learn the vocabulary from the training statements only and encode them
count_train = count_vectorizer.fit_transform(X_train)

# Encode the test statements with the vocabulary learned from the training data
count_test = count_vectorizer.transform(X_test)

# Print the first 10 features of the count_vectorizer.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = count_vectorizer.get_feature_names()
print(feature_names[:10])

['000', '02', '05', '054th', '06', '07', '08', '09', '095', '10']


In [23]:
# Build a Multinomial Naive Bayes classifier and train it on the training
# count matrix (fit() returns the estimator, so the two steps chain)
model = MultinomialNB().fit(count_train, y_train)

# Predict a veracity label for every held-out statement
pred = model.predict(count_test)

# Accuracy: fraction of held-out statements labelled correctly
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print(score)

# Confusion matrix (rows = true label, columns = predicted label),
# with both axes in the label order given below
label_order = ["True", "False", "Pants on Fire!"]
cm = metrics.confusion_matrix(y_true=y_test, y_pred=pred, labels=label_order)
print(cm)

0.4589041095890411
[[28 24  0]
 [31 33  4]
 [ 5 15  6]]


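# 46% accuracy

For reference: on this hold-out set of 146 statements, always predicting the most common label ("False", 68 of 146) would score about 47%, so the baseline is no better than a majority-class guess.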

# What if we combine "False" and "Pants on Fire!" into a single "False" class?

In [24]:
# Binary-label copy of the data, kept for later use: "Pants on Fire!" is
# folded into "False". replace() leaves any label not listed here untouched,
# whereas map() with a dict would silently turn it into NaN. assign() returns
# a new frame, so df itself is not modified.
simple_df = df.assign(
    veracity=df['veracity'].replace({'Pants on Fire!': 'False'}))

In [25]:
# Features are the statement text; labels are the merged (binary) veracity ratings
X = simple_df.statement
y = simple_df.veracity

# Hold out 10% of the rows for testing (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.1, random_state = 0)

# Initialize a CountVectorizer object (bag-of-words token counts)
count_vectorizer = CountVectorizer()

# Learn the vocabulary from the training statements only and encode them
count_train = count_vectorizer.fit_transform(X_train)

# Encode the test statements with the vocabulary learned from the training data
count_test = count_vectorizer.transform(X_test)

# Print the first 10 features of the count_vectorizer.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = count_vectorizer.get_feature_names()
print(feature_names[:10])

['000', '02', '05', '054th', '06', '07', '08', '09', '095', '10']


In [26]:
# Build a Multinomial Naive Bayes classifier and train it on the training
# count matrix (fit() returns the estimator, so the two steps chain)
model = MultinomialNB().fit(count_train, y_train)

# Predict a veracity label for every held-out statement
pred = model.predict(count_test)

# Accuracy: fraction of held-out statements labelled correctly
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print(score)

# Confusion matrix (rows = true label, columns = predicted label),
# with both axes in the label order given below
label_order = ["True", "False"]
cm = metrics.confusion_matrix(y_true=y_test, y_pred=pred, labels=label_order)
print(cm)

0.6232876712328768
[[26 26]
 [29 65]]


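# 62% accuracy

For reference: with the two merged classes, always predicting "False" (94 of the 146 hold-out statements) would score about 64%, so this model still does not beat a majority-class guess.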