## Setting up

In [1]:
import pandas as pd

# enable multiple outputs per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



## Preparing dataset into dataframe

In [2]:
# header for dataset
headernames = ['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party',
                   'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'venue']

df_train = pd.read_csv('./datasets/liar_dataset/train.tsv', sep='\t', names=headernames)
df_test = pd.read_csv('./datasets/liar_dataset/test.tsv', sep='\t', names=headernames)
df_valid = pd.read_csv('./datasets/liar_dataset/valid.tsv', sep='\t', names=headernames)
# joining every dataset for uniform cleaning
df_total = pd.concat([df_train, df_valid,df_test])
# grouping labels into true or fake

## Having a look

In [3]:
df_total.head(3)
df_total.statement.value_counts()
df_total.label.value_counts()

Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely_true,false,half_true,mostly_true,pants_on_fire,venue
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver


On a cap-and-trade plan.                                                                                                                                                                                                                3
On changing the rules for filibusters on presidential nominees                                                                                                                                                                          3
During Sherrod Browns past decade as a D.C. politician, more than one out of every four jobs that has left America, left from Ohio. ... Sherrod Brown will own these horrendous Ohio job numbers next year.                             2
On torture.                                                                                                                                                                                                                             2
Says Mitt Romney flip-flopped on abortion.                      

half-true      2627
false          2507
mostly-true    2454
barely-true    2103
true           2053
pants-fire     1047
Name: label, dtype: int64

In [6]:
# we are only doing text-content analysis
# creating new dataframe with only statement and label
statement = df_total["statement"]
label = df_total["label"]
subject = df_total["subject"]
df = pd.concat([statement, label, subject], axis=1)

In [7]:
df.head(5)

Unnamed: 0,statement,label,subject
0,Says the Annies List political group supports ...,false,abortion
1,When did the decline of coal start? It started...,half-true,"energy,history,job-accomplishments"
2,"Hillary Clinton agrees with John McCain ""by vo...",mostly-true,foreign-policy
3,Health care reform legislation is likely to ma...,false,health-care
4,The economic turnaround started at the end of ...,half-true,"economy,jobs"


In [10]:
df.shape # number of entries, number of columns

(12791, 2)

In [11]:
df.describe() 

Unnamed: 0,statement,label
count,12791,12791
unique,12765,6
top,On a cap-and-trade plan.,half-true
freq,3,2627


In [12]:
df.groupby('label').describe()

Unnamed: 0_level_0,statement,statement,statement,statement
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
barely-true,2103,2103,Says unsuccessful Texas abortion legislation w...,1
false,2507,2494,On a cap-and-trade plan.,3
half-true,2627,2624,Twenty million Americans are out of work.,2
mostly-true,2454,2453,Americans havent had a raise in 15 years.,2
pants-fire,1047,1047,The first tweet was sent from Austin.,1
true,2053,2050,Says Milken Institute rated San Antonio as nat...,2


In [19]:
df.isnull()

Unnamed: 0,statement,label
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
1262,False,False
1263,False,False
1264,False,False
1265,False,False
