## Setting up

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Enable multiple outputs per cell: with "all", IPython displays the value of
# every top-level expression statement in a cell instead of only the last one
# (the IPython default is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Loading the dataset into DataFrames

In [20]:
# Column names applied to each TSV. Because `names=` is supplied (and no
# `header=`), pandas treats the first line of every file as data.
headernames = [
    'id', 'label', 'statement', 'subject', 'speaker', 'job', 'state', 'party',
    'barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire', 'venue',
]

# Read the three tab-separated splits in the order train, test, valid.
df_train, df_test, df_valid = [
    pd.read_csv(tsv_path, sep='\t', names=headernames)
    for tsv_path in (
        './datasets/liar_dataset/train.tsv',
        './datasets/liar_dataset/test.tsv',
        './datasets/liar_dataset/valid.tsv',
    )
]
# NOTE: no true/fake grouping is done in this cell; the `label` column still
# holds the original label strings at this point.

## Having a look and cleaning the data

In [21]:
# Remove rows with repeated statements from each split, mutating the frames
# in place. Duplicates are detected on the "statement" column only.
# keep=False discards *every* row whose statement occurs more than once, so no
# copy of a repeated statement survives.
# NOTE(review): confirm dropping all copies is intended rather than keep="first".
for split_frame in (df_train, df_test, df_valid):
    split_frame.drop_duplicates(subset="statement", keep=False, inplace=True)

In [22]:
# Text-content analysis only: keep just the statement text and its label.
# Both columns are pulled out as Series and joined side by side into a new
# two-column DataFrame.
statement_train, label_train = df_train["statement"], df_train["label"]
train_cleaned = pd.concat(objs=[statement_train, label_train], axis="columns")

In [23]:
# Test split: keep only the statement text and its label, joined side by side
# into a new two-column DataFrame.
statement_test, label_test = df_test["statement"], df_test["label"]
test_cleaned = pd.concat(objs=[statement_test, label_test], axis="columns")

In [24]:
# Validation split: keep only the statement text and its label, joined side by
# side into a new two-column DataFrame.
statement_valid, label_valid = df_valid["statement"], df_valid["label"]
valid_cleaned = pd.concat(objs=[statement_valid, label_valid], axis="columns")

In [25]:
# Numeric score assigned to each label string. Note that 'pants-fire' is
# scored below 'false' (-0.25 vs 0.0).
truth_val = {
    'false': 0.0,
    'half-true': 0.5,
    'mostly-true': 0.75,
    'true': 1.0,
    'pants-fire': -0.25,
    'barely-true': 0.25,
}

# Substitute the scores in the "label" column only; the nested-dict form of
# `replace` restricts the substitution to that column. Label values that are
# not keys of `truth_val` are left unchanged.
train_cleaned = train_cleaned.replace(to_replace={"label": truth_val})
test_cleaned = test_cleaned.replace(to_replace={"label": truth_val})
valid_cleaned = valid_cleaned.replace(to_replace={"label": truth_val})

In [26]:
# Write each cleaned split to a comma-separated, UTF-8 encoded CSV file.
# index=False leaves the DataFrame index out of the written file.
train_cleaned.to_csv(path_or_buf='./datasets/train_cleaned.csv', index=False, sep=',', encoding='utf-8')
test_cleaned.to_csv(path_or_buf='./datasets/test_cleaned.csv', index=False, sep=',', encoding='utf-8')
valid_cleaned.to_csv(path_or_buf='./datasets/valid_cleaned.csv', index=False, sep=',', encoding='utf-8')