In [11]:
import pandas as pd

# Load the tab-separated LIAR test split. The file has no header row, so
# pandas numbers the columns; columns 1 and 2 are given readable names.
column_names = {1: 'labels', 2: 'content'}
df = pd.read_csv('LIAR_test.tsv', sep='\t', header=None).rename(columns=column_names)
df

Unnamed: 0,0,labels,content,3,4,5,6,7,8,9,10,11,12,13
0,11972.json,true,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,Governor,Texas,republican,30,30,42,23,18,Radio interview
1,11685.json,false,Wisconsin is on pace to double the number of l...,jobs,katrina-shankland,State representative,Wisconsin,democrat,2,1,0,0,0,a news conference
2,11096.json,false,Says John McCain has done nothing to help the ...,"military,veterans,voting-record",donald-trump,President-Elect,New York,republican,63,114,51,37,61,comments on ABC's This Week.
3,5209.json,half-true,Suzanne Bonamici supports a plan that will cut...,"medicare,message-machine-2012,campaign-adverti...",rob-cornilles,consultant,Oregon,republican,1,1,3,1,1,a radio show
4,9524.json,pants-fire,When asked by a reporter whether hes at the ce...,"campaign-finance,legal-issues,campaign-adverti...",state-democratic-party-wisconsin,,Wisconsin,democrat,5,7,2,2,7,a web video
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1262,7334.json,half-true,Says his budget provides the highest state fun...,education,rick-scott,Governor,Florida,republican,28,23,38,34,7,a news conference
1263,9788.json,barely-true,Ive been here almost every day.,"civil-rights,crime,criminal-justice",jay-nixon,Governor,Missouri,democrat,2,0,0,1,0,"on ABC's ""This Week"""
1264,10710.json,barely-true,"In the early 1980s, Sen. Edward Kennedy secret...","bipartisanship,congress,foreign-policy,history",mackubin-thomas-owens,"senior fellow, Foreign Policy Research Institute",Rhode Island,columnist,1,0,0,0,0,a commentary in The Providence Journal
1265,3186.json,barely-true,Says an EPA permit languished under Strickland...,"environment,government-efficiency",john-kasich,"Governor of Ohio as of Jan. 10, 2011",Ohio,republican,9,8,10,18,3,a news conference


In [12]:
# Debugging: show the distinct label values and how many labels are missing
label_column = df['labels']
print(set(label_column))
print(label_column.isna().sum())

{'pants-fire', 'true', 'barely-true', 'false', 'mostly-true', 'half-true'}
0


**Preprocessing**

In [13]:
import re
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
import pickle

# Typographic quote characters that get normalised to plain ASCII quotes.
double_quotes = re.compile(r"[«‹»›„“‟”❝❞❮❯〝〞〟＂]")
single_quotes = re.compile(r"[‘‛’❛❜`´]")

# Anything starting with "http" up to the next whitespace character.
url_re = re.compile(r"http[^\s]*")
email_re = re.compile(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}")

# Either "<word> <day>, <year>" (e.g. "Jan. 10, 2011") or an ISO "YYYY-MM-DD".
# The day needs at least one digit: "{,2}" means "{0,2}" in Python, which
# also matched text such as "budget , 2011" that contains no day at all.
date_re = re.compile(r"(\w+[ .]+\d{1,2}, \d{4})|(\d{4}-\d{2}-\d{2})")

# "HH:MM:SS" with optional fractional seconds. The dot is escaped so that
# only a literal "." introduces the fraction; an unescaped "." also swallowed
# a following " 7" (or any other character plus digits) into the match.
time_re = re.compile(r"\d{2}:\d{2}:\d{2}(\.\d+)?")

# Any remaining run of digits.
num_re = re.compile(r"\d+")

# Translation table that turns each listed punctuation character into a space.
punctuation_marks = str.maketrans({p: " " for p in ".,:'\"[]()"})

# Porter stemmer instance used by preprocess() to stem each kept token.
stemmer = PorterStemmer()

# English stop words from NLTK, held in a set for fast membership checks.
stop_words = {*stopwords.words('english')}

def preprocess(text):
    """Turn a raw statement into a list of lower-cased, stemmed tokens.

    Processing order:
      1. Typographic quotes are normalised to ASCII ``"`` and ``'``.
      2. URLs, e-mail addresses, dates, times and any remaining digit runs
         are replaced by the placeholders URL, EMAIL, DATE, TIME and NUM
         (these are lower-cased along with the rest of the text in step 5).
      3. The characters in ``punctuation_marks`` are replaced by spaces.
      4. Runs of whitespace are collapsed and the ends are stripped.
      5. The text is lower-cased and split with ``wordpunct_tokenize``.
      6. English stop words are dropped and the rest is Porter-stemmed.

    Parameters
    ----------
    text : str
        The raw statement. Any non-string value (e.g. NaN) is treated as
        having no content.

    Returns
    -------
    list of str
        The stemmed tokens; an empty list for non-string input.
    """
    if not isinstance(text, str):
        # Return an empty token list (not "") so the return type is always
        # a list, whatever the input was.
        return []

    # Fix quotation marks
    text = double_quotes.sub('"', text)
    text = single_quotes.sub("'", text)

    # Replace URLs, emails, dates, times and numbers with placeholders.
    # Order matters: dates and times must be handled before bare numbers,
    # otherwise their digits would already have been turned into NUM.
    text = url_re.sub('URL', text)
    text = email_re.sub('EMAIL', text)
    text = date_re.sub('DATE', text)
    text = time_re.sub('TIME', text)
    text = num_re.sub('NUM', text)

    # Replace punctuation with whitespace
    text = text.translate(punctuation_marks)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()

    tokens = wordpunct_tokenize(text.lower())

    # Stop-word removal and stemming in one pass; the stop-word check is
    # made on the unstemmed token.
    return [stemmer.stem(word) for word in tokens if word not in stop_words]

In [14]:
# Drop rows with missing values in 'content', then keep only rows whose
# 'content' is a string. The explicit .copy() makes the result an independent
# frame, so the column assignments below do not write to a filtered slice
# (the pattern that triggers pandas' SettingWithCopyWarning).
# NOTE: this cell is not re-runnable on its own. After one run 'content'
# holds token lists, so the string filter would drop every row; reload the
# data first.
df = df.dropna(subset=['content'])
df = df[df['content'].apply(lambda x: isinstance(x, str))].copy()

# Map the six 'labels' values to binary labels: 1 = 'reliable', 0 = 'unreliable'
label_mapping = {'true': 1, 'mostly-true': 1, 'half-true': 1, 'barely-true': 0, 'false': 0, 'pants-fire': 0}
df['newlabels'] = df['labels'].map(label_mapping)

# Apply preprocessing (each statement string is replaced by its token list)
df['content'] = df['content'].apply(preprocess)
df

Unnamed: 0,0,labels,content,3,4,5,6,7,8,9,10,11,12,13,newlabels
0,11972.json,true,"[build, wall, u, -, mexico, border, take, lite...",immigration,rick-perry,Governor,Texas,republican,30,30,42,23,18,Radio interview,1
1,11685.json,false,"[wisconsin, pace, doubl, number, layoff, year]",jobs,katrina-shankland,State representative,Wisconsin,democrat,2,1,0,0,0,a news conference,0
2,11096.json,false,"[say, john, mccain, done, noth, help, vet]","military,veterans,voting-record",donald-trump,President-Elect,New York,republican,63,114,51,37,61,comments on ABC's This Week.,0
3,5209.json,half-true,"[suzann, bonamici, support, plan, cut, choic, ...","medicare,message-machine-2012,campaign-adverti...",rob-cornilles,consultant,Oregon,republican,1,1,3,1,1,a radio show,1
4,9524.json,pants-fire,"[ask, report, whether, he, center, crimin, sch...","campaign-finance,legal-issues,campaign-adverti...",state-democratic-party-wisconsin,,Wisconsin,democrat,5,7,2,2,7,a web video,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1262,7334.json,half-true,"[say, budget, provid, highest, state, fund, le...",education,rick-scott,Governor,Florida,republican,28,23,38,34,7,a news conference,1
1263,9788.json,barely-true,"[ive, almost, everi, day]","civil-rights,crime,criminal-justice",jay-nixon,Governor,Missouri,democrat,2,0,0,1,0,"on ABC's ""This Week""",0
1264,10710.json,barely-true,"[earli, num, sen, edward, kennedi, secretli, o...","bipartisanship,congress,foreign-policy,history",mackubin-thomas-owens,"senior fellow, Foreign Policy Research Institute",Rhode Island,columnist,1,0,0,0,0,a commentary in The Providence Journal,0
1265,3186.json,barely-true,"[say, epa, permit, languish, strickland, new, ...","environment,government-efficiency",john-kasich,"Governor of Ohio as of Jan. 10, 2011",Ohio,republican,9,8,10,18,3,a news conference,0


In [15]:
# Debug: count rows whose 'newlabels' value is missing
missing_newlabels = df['newlabels'].isna()
print(missing_newlabels.sum())

0


In [16]:
# Write the tokenised content and binary labels to a pickle file
output_file = 'LIAR_preprocessed.pkl'
preprocessed = df[['content', 'newlabels']]
with open(output_file, "wb") as f:
    pickle.dump(preprocessed, f)