# LIAR DETECTION GROUP PROJECT

## PreProcess Notebook Part 1

This notebook loads all the raw datasets and pickles them for the later notebooks. Keeping the loading in one place makes it easier to add datasets later if needed.

Run the cell below to import packages.

In [1]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML
from sklearn.utils import shuffle
# NLTK for NLP utils and corpora
#import nltk

# NumPy and TensorFlow
import numpy as np
import pandas as pd
#import tensorflow as tf

# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz
import timeit  #For timing

### Load data
Loading the "Fake News" dataset from the Information Security and Object Technology (ISOT) research lab at the University of Victoria School of Engineering.

The ISOT Fake News Dataset is a compilation of several thousand fake and truthful news articles, obtained from legitimate news sites and from sites flagged as unreliable by politifact.com.

In [2]:
def get_data(filename, sep=',', header=0, names=None, datapath=None):
    '''Read a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Name of the file inside the data directory.
    sep : str, default ','
        Field delimiter forwarded to ``pd.read_csv``.
    header : int or None, default 0
        Row number that holds the column names, or ``None`` when the file
        has no header row.
    names : list of str, optional
        Column names to assign. This argument used to be accepted but
        silently ignored; it is now forwarded to ``pd.read_csv``.
    datapath : str, optional
        Directory that contains ``filename``. When omitted, the notebook-level
        ``DATAPATH`` global is used (looked up at call time), which keeps
        existing calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed file. Double quotes are treated as the quote character.
    '''
    if datapath is None:
        # Fall back to the global so existing cells that set DATAPATH still work
        datapath = DATAPATH

    # os.path.join tolerates a data directory with or without a trailing slash
    filepath = os.path.join(datapath, filename)
    return pd.read_csv(filepath, header=header, sep=sep, names=names, quotechar='"')

In [2]:
# File names of the downloaded ISOT Fake News CSVs
FAKE_FILENAME = 'Fake.csv'
TRUE_FILENAME = 'True.csv'

# Directory holding the downloaded files; get_data() reads this global
DATAPATH = './datasets/ISOT_FakeNews/'

fake_data = get_data(FAKE_FILENAME)
true_data = get_data(TRUE_FILENAME)

# Add the label column: '0' = fake, '1' = true (stored as strings)
fake_data['target'] = '0'
true_data['target'] = '1'

# Combine both sets and shuffle the rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# A fixed random_state makes the shuffle reproducible on Restart & Run All.
all_data = pd.concat([true_data, fake_data], ignore_index=True)
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)

all_data.describe()



Unnamed: 0,title,text,subject,date,target
count,44898,44898.0,44898,44898,44898
unique,38729,38646.0,8,2397,2
top,Factbox: Trump fills top jobs for his administ...,,politicsNews,"December 20, 2017",0
freq,14,627.0,11272,182,23481


### Cleanup
Check for NA values.

We may not want to keep the 'subject' column in the dataset: all the true news data comes from "Reuters", so the subject could reveal the label by itself.

In [3]:
# Count missing values per column
all_data.isna().sum()

title      0
text       0
subject    0
date       0
target     0
dtype: int64

In [4]:
# Column dtypes, non-null counts and deep (object-aware) memory usage
all_data.info(memory_usage='deep', verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
title      44898 non-null object
text       44898 non-null object
subject    44898 non-null object
date       44898 non-null object
target     44898 non-null object
dtypes: object(5)
memory usage: 151.9 MB


In [28]:
# (rows, columns) of the combined ISOT frame
all_data.shape

(44898, 5)

### Pickle!

In [30]:
# Write the combined ISOT frame to a pickle file; work continues in a new notebook.
# Create the output directory first: to_pickle raises if it does not exist.
os.makedirs('parsed_data', exist_ok=True)
all_data.to_pickle('parsed_data/df_alldata1.pkl')

### Load the LIAR data
Loading the liar dataset.

The LIAR dataset contains short statements labeled with PolitiFact truthfulness ratings (the `target` column below).

In [35]:
# File names of the three LIAR splits
LIAR_TEST_FILENAME = 'test.tsv'
LIAR_TRAIN_FILENAME = 'train.tsv'
LIAR_DEV_FILENAME = 'valid.tsv'

# Directory holding the downloaded files; get_data() reads this global
DATAPATH = './datasets/LIAR/'

# Column names for the header-less TSV files:
# 'title' holds the statement, 'target' the politifact rating.
h_names = [
    'id', 'target', 'title', 'subject', 'speaker', 'speaker_job_title', 'state', 'party',
    'barely_true_count', 'false_count', 'half_true_count', 'mostly_true_count', 'pantsonfire_count',
    'context',
]

# NOTE(review): get_data() parses with quotechar='"'; unbalanced double quotes
# in a statement could merge rows. TODO: compare the shapes printed below with
# the published LIAR split sizes.
liar_test_data = get_data(LIAR_TEST_FILENAME, sep='\t', header=None)
liar_train_data = get_data(LIAR_TRAIN_FILENAME, sep='\t', header=None)
liar_dev_data = get_data(LIAR_DEV_FILENAME, sep='\t', header=None)

# Report each split's shape and give it the shared column names
for _label, _frame in (
    ("training", liar_train_data),
    ("test", liar_test_data),
    ("dev", liar_dev_data),
):
    print("LIAR " + _label + " dataset: ", _frame.shape)
    _frame.columns = h_names

LIAR training dataset:  (10240, 14)
LIAR test dataset:  (1267, 14)
LIAR dev dataset:  (1284, 14)


In [36]:
# Stack the three LIAR splits into one frame and shuffle the rows.
# A single pd.concat replaces the chained DataFrame.append calls
# (DataFrame.append was removed in pandas 2.0).
liar_data = pd.concat(
    [liar_train_data, liar_test_data, liar_dev_data],
    ignore_index=True,
)
# A fixed random_state makes the shuffle reproducible on Restart & Run All
liar_data = liar_data.sample(frac=1, random_state=42).reset_index(drop=True)
print("Complete LIAR dataset: ", liar_data.shape)

Complete LIAR dataset:  (12791, 14)


In [37]:
# Preview the first five rows of the shuffled LIAR frame
liar_data.head(5)

Unnamed: 0,id,target,title,subject,speaker,speaker_job_title,state,party,barely_true_count,false_count,half_true_count,mostly_true_count,pantsonfire_count,context
0,7222.json,mostly-true,Says 31 percent of Texas physicians accept all...,medicaid,texas-medical-association,,,none,0.0,0.0,0.0,1.0,0.0,a Twitter post.
1,1651.json,half-true,''Both Democrats and Republicans are advocatin...,"bipartisanship,education",foundation-floridas-future,,Florida,republican,0.0,0.0,1.0,0.0,0.0,a press release
2,4804.json,true,A Republican-led softening of firearms trainin...,"guns,market-regulation",donna-seidel,"State Assembly, District 85",Wisconsin,democrat,0.0,1.0,0.0,0.0,0.0,a press release
3,12415.json,pants-fire,The first tweet was sent from Austin.,"city-government,corporations,history,technology",steve-adler,Mayor of Austin,Texas,democrat,1.0,1.0,1.0,2.0,1.0,a panel discussion in Austin
4,10217.json,half-true,Georgia has the countrys second highest number...,"crime,taxes",johnny-isakson,U.S. Senator,Georgia,republican,2.0,3.0,3.0,2.0,0.0,press release


In [40]:
# Count missing values per column (this cell only reports; nothing is dropped or filled)
liar_data.isna().sum()

id                      0
target                  0
title                   0
subject                 2
speaker                 2
speaker_job_title    3567
state                2749
party                   2
barely_true_count       2
false_count             2
half_true_count         2
mostly_true_count       2
pantsonfire_count       2
context               131
dtype: int64

### Pickle

In [38]:
# Write the combined LIAR frame to a pickle file; work continues in a new notebook.
# Create the output directory first: to_pickle raises if it does not exist.
os.makedirs('parsed_data', exist_ok=True)
liar_data.to_pickle('parsed_data/df_liardata1.pkl')

### Politifact data

In [94]:
# Politifact statements file (tab-separated despite the .csv extension)
POLITIFACT_FILENAME = 'full_politifact_data.csv'
#POLITIFACT_FILENAME = 'dev.csv'

# Directory holding the downloaded file
DATAPATH = './datasets/Politifact/politifact_data/'

# Rating scale:
#   5 = pants on fire, 4 = false, 3 = mostly false,
#   2 = half true, 1 = mostly true, 0 = true

# Intended column names (title = statement, target = politifact rating).
# Not applied: the list has six names while only five columns are loaded,
# so the frame keeps the header names found in the file.
h_names = ['id', 'speaker', 'target', 'title', 'explanation', 'URL']

politifact_path = DATAPATH + POLITIFACT_FILENAME
politifact_data = pd.read_csv(
    politifact_path,
    sep='\t',
    quotechar='"',
    usecols=[0, 1, 2, 3, 4],  # first five columns only
)

print("Politifact dataset: ", politifact_data.shape)

Politifact dataset:  (10467, 5)


In [88]:
# Preview the first ten rows of the Politifact frame
politifact_data.head(10)

Unnamed: 0,Speaker,Truth-Rating,Statement,PolitiFact explanations,URL
0,Paul LePage,5,About 47 percent of able-bodied people in the ...,Only if you support jobs for babies,/truth-o-meter/statements/2013/oct/23/paul-lep...
1,Battleground Texas,0,"Says Dan Patrick has ""called immigration into ...",'Stop the illlegal invasion!',/texas/statements/2014/jun/20/battleground-tex...
2,Battleground Texas,1,"In 2008, ""only 54 percent of Latinos in Texas ...",Dividing estimates into estimates,/texas/statements/2013/mar/22/battleground-tex...
3,Suzanne Somers,0,"Even after Obamacare is fully implemented, the...","Some ridiculed her column, but this claim is c...",/punditfact/statements/2013/nov/01/suzanne-som...
4,Jamie Oliver,2,McDonald's in England only sells organic milk ...,Policies differ under Golden Arches across the...,/truth-o-meter/statements/2010/aug/31/jamie-ol...
5,John Barge,1,We have about six school districts that are in...,Tight budgets tightening school days,/georgia/statements/2013/oct/29/john-barge/bar...
6,Brian Schweitzer,5,"McCain's energy plan is a ""single-answer propo...",McCain energy plan not just more drilling,/truth-o-meter/statements/2008/aug/27/brian-sc...
7,Gail Collins,0,Says Ron Paul doesnt believe in marriage licen...,"Let people define marriage, Paul says",/texas/statements/2011/dec/27/gail-collins/col...
8,Gail Collins,1,"Louie Gohmert of Texas ""compared the current g...",Likened moments,/texas/statements/2013/oct/10/gail-collins/lou...
9,Gail Collins,1,"Mitt Romney ""drove to Canada with the family d...","In a carrier with a windshield, plus he got hi...",/truth-o-meter/statements/2011/sep/13/gail-col...


In [91]:
# Count missing values per column (this cell only reports; nothing is dropped or filled)
politifact_data.isna().sum()

Speaker                    0
Truth-Rating               0
Statement                  0
PolitiFact explanations    0
URL                        0
dtype: int64

### Pickle

In [92]:
# Write the Politifact frame to a pickle file; work continues in a new notebook.
# Create the output directory first: to_pickle raises if it does not exist.
os.makedirs('parsed_data', exist_ok=True)
politifact_data.to_pickle('parsed_data/df_politifact1.pkl')

### Hannah Rashkin's News Data

In [165]:
# File names of the Rashkin news data.
# 'xtrain.txt' and 'xdev.txt' are not loaded separately because
# fulltrain.csv contains both.
NEWS_TEST_FILENAME = 'balancedtest.csv'
NEWS_FULL_FILENAME = 'fulltrain.csv'

# Directory holding the downloaded files; get_data() reads this global
DATAPATH = './datasets/Rashkin/newsfiles/'

# Label encoding: 1 = Satire, 2 = Hoax, 3 = Propaganda, 4 = Trusted
h_names = ['target', 'text']

news_test_data = get_data(NEWS_TEST_FILENAME, sep=',', header=None)
news_full_data = get_data(NEWS_FULL_FILENAME, sep=',', header=None)

# Report each frame's shape and give it the shared column names
for _label, _frame in (
    ("test", news_test_data),
    ("full training", news_full_data),
):
    print("NEWS " + _label + " dataset: ", _frame.shape)
    _frame.columns = h_names

NEWS test dataset:  (3000, 2)
NEWS full training dataset:  (48854, 2)


In [166]:
# Preview the last five rows of the full training frame
news_full_data.tail(5)

# The test data and the full training data can be combined.

# Disabled spot check: look up the text of test row 1 in the training frame
#news_full_data[news_full_data['text']==news_test_data.loc[1,'text']]

Unnamed: 0,target,text
48849,4,The ruling Kuomintang (KMT) has claimed owners...
48850,4,The Taipei city government has encouraged the ...
48851,4,President Ma Ying-jeou said Friday that a park...
48852,4,The families of the four people who were kille...
48853,4,The Ministry of Finance will make public on Sa...


In [167]:
# Combine the training and test frames and shuffle the rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# A fixed random_state makes the shuffle reproducible on Restart & Run All.
news_data = pd.concat([news_full_data, news_test_data], ignore_index=True)
news_data = news_data.sample(frac=1, random_state=42).reset_index(drop=True)
print("NEWS full dataset: ", news_data.shape)

NEWS full dataset:  (51854, 2)


### Pickle

In [168]:
# Write the combined news frame to a pickle file; work continues in a new notebook.
# Create the output directory first: to_pickle raises if it does not exist.
os.makedirs('parsed_data', exist_ok=True)
news_data.to_pickle('parsed_data/df_newsdata1.pkl')

In [169]:
# List the contents of parsed_data, including hidden entries, with human-readable sizes
! ls ./parsed_data -lah

total 5.1G
drwxrwxr-x 2 w266project w266project 4.0K Nov 18 01:10 .
drwxrwxr-x 9 w266project w266project 4.0K Nov 18 01:10 ..
-rw-rw-r-- 1 w266project w266project  528 Nov 17 02:04 GloVe_Unknown_50.npy
-rw-rw-r-- 1 w266project w266project  95M Nov 12 06:53 df_alldata1.pkl
-rw-rw-r-- 1 w266project w266project 368M Nov 17 02:01 df_alldata2.pkl
-rw-rw-r-- 1 w266project w266project 4.5G Nov 17 05:19 df_alldata_embed.pkl
-rw-rw-r-- 1 w266project w266project 2.9M Nov 17 09:19 df_liardata1.pkl
-rw-rw-r-- 1 w266project w266project 161M Nov 18 01:10 df_newsdata1.pkl
-rw-rw-r-- 1 w266project w266project 2.7M Nov 17 23:42 df_politifact1.pkl
-rw-rw-r-- 1 w266project w266project 2.3K Nov 17 04:28 pdata.h5
-rw-rw-r-- 1 w266project w266project  34M Nov 17 02:04 vocab.pkl
