In [5]:
import pandas as pd
import os
import re
import string
import nltk
from sklearn.preprocessing import LabelEncoder

nltk.download('stopwords')
from nltk.corpus import stopwords

# Load dataset
# The three splits are header-less, tab-separated files, so pandas assigns
# integer column labels (0..13) on read; readable names are attached after
# the merge below.
# NOTE(review): statements can contain double-quote characters, and pandas'
# default quote handling may merge rows when quotes are unbalanced. Confirm the
# row counts of each split; passing `quoting=csv.QUOTE_NONE` disables quote
# handling if rows turn out to be missing.
train = pd.read_csv("../data/train.tsv", sep='\t', header=None)
valid = pd.read_csv("../data/valid.tsv", sep='\t', header=None)
test = pd.read_csv("../data/test.tsv", sep='\t', header=None)

# Merge all datasets
# ignore_index=True gives the merged frame one unique 0..n-1 index. Without it
# each split keeps its own RangeIndex, so labels such as 0 would occur three
# times and label-based lookups / index-aligned operations become ambiguous.
df = pd.concat([train, valid, test], ignore_index=True)
df.columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job_title',
              'state', 'party', 'barely_true_counts', 'false_counts',
              'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts',
              'context']

# Preview
df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nishi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,label,statement,subject,speaker,job_title,state,party,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [6]:
import os

# Make sure the output directory for processed data is present; exist_ok=True
# turns the call into a no-op when the directory already exists.
os.makedirs(name="../data/processed", exist_ok=True)

In [7]:
# Keep only statement and label
# Take an explicit copy: this frame gets a new column and an overwritten
# 'label' column further down, and writing into a bare column subset makes
# pandas emit SettingWithCopyWarning.
df = df[['statement', 'label']].copy()

# Clean the text
def clean_text(text):
    """Normalize a raw statement into lowercase, punctuation-free text.

    Steps, in order:
      1. treat a missing value (None / NaN) as an empty string,
      2. lowercase,
      3. remove ``[bracketed]`` spans,
      4. remove URLs (``http://``, ``https://`` or ``www.`` prefixed),
      5. remove ``<...>`` tags,
      6. remove every ASCII punctuation character (``string.punctuation``),
      7. remove any token that contains a digit,
      8. collapse every run of whitespace to a single space and trim the ends.

    Parameters
    ----------
    text : object
        The raw value; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    # A missing value would otherwise be stringified into the literal token
    # "none" / "nan" and look like real content. NaN is the only float that
    # is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # The removals above leave doubled / leading / trailing spaces behind, and
    # the input may contain \r or \t as well as \n; normalise all of them.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Binary label encoding: real = 1, fake = 0
binary_labels = {
    'true': 1,
    'mostly-true': 1,
    'half-true': 1,
    'barely-true': 0,
    'false': 0,
    'pants-fire': 0
}

# Add the cleaned text and replace the rating with its binary target.
# `assign` builds a new frame instead of writing into the existing one, so no
# SettingWithCopyWarning is raised even though `df` is a column subset.
# Labels that are not keys of `binary_labels` map to NaN.
df = df.assign(
    cleaned_text=df['statement'].apply(clean_text),
    label=df['label'].map(binary_labels),
)

# Drop missing
# Removes rows with a missing statement or an unmapped label. Rebinding is
# used instead of inplace=True so the operation never mutates a shared frame.
df = df.dropna()

# Rows whose cleaned text is blank would be written as an empty CSV field,
# which pandas reads back as NaN by default; drop them now.
df = df[df['cleaned_text'].astype(str).str.strip().ne('')]

# Guard against silently overwriting the output with an empty file. This
# happens, for example, when the cell is run twice: the labels are already
# 0/1, so none of them match the keys of `binary_labels`.
if df.empty:
    raise ValueError(
        "No rows left after cleaning; 'label' may already be encoded. "
        "Re-run the notebook from the data-loading cell."
    )

# `map` yields floats whenever any label was unmapped; store plain integers.
df = df.astype({'label': int})

# Save cleaned data
df.to_csv("../data/processed/liar_cleaned.csv", index=False)

print("✅ Data cleaned and saved to data/processed/liar_cleaned.csv")


✅ Data cleaned and saved to data/processed/liar_cleaned.csv
