## Clean noise 

In [13]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Read the LIAR dataset CSV into the working DataFrame
liar_csv_path = '../data/liar_dataset.csv'
df = pd.read_csv(liar_csv_path)



In [22]:
# Download the NLTK resources this notebook relies on
nltk.download('punkt')
nltk.download('stopwords')
# pos_tag() is called further down (adjective counts) and needs the perceptron
# tagger model. The resource name differs between NLTK releases, so request
# both; an id that is unavailable only reports an error and returns False.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

nltk.download('punkt_tab')




[nltk_data] Downloading package punkt to /home/rohan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/rohan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/rohan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [23]:
# Set up the preprocessing objects: a Porter stemmer and the English stopword set
stemmer = PorterStemmer()
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)



In [24]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

# Translation table that deletes every character in string.punctuation; built
# once here instead of on every call to clean().
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean(text):
    """Normalize a single statement.

    The text is lowercased, stripped of the characters in
    ``string.punctuation``, tokenized with ``word_tokenize`` and filtered
    against ``stop_words``.

    Args:
        text: The raw statement. Values that ``pd.isnull`` reports as missing
            yield an empty string; other non-string values are converted with
            ``str()`` before processing.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if pd.isnull(text):
        return ""

    # A CSV column can contain stray non-string cells (e.g. numbers); coerce
    # them so that .lower() does not raise AttributeError.
    if not isinstance(text, str):
        text = str(text)

    # Lowercase, then remove punctuation.
    # NOTE(review): punctuation is removed before the stopword filter, so any
    # stop_words entry that itself contains an apostrophe can never match a
    # token here -- confirm this is acceptable.
    text = text.lower().translate(_PUNCT_TABLE)

    # Tokenize and remove stopwords
    tokens = [word for word in word_tokenize(text) if word not in stop_words]

    return ' '.join(tokens)



In [18]:
# Preview the raw statement column
df.loc[:, 'Statement']

0        Says the Annies List political group supports ...
1        When did the decline of coal start? It started...
2        Hillary Clinton agrees with John McCain "by vo...
3        Health care reform legislation is likely to ma...
4        The economic turnaround started at the end of ...
                               ...                        
12786    Says his budget provides the highest state fun...
12787                      Ive been here almost every day.
12788    In the early 1980s, Sen. Edward Kennedy secret...
12789    Says an EPA permit languished under Strickland...
12790    Says the governor is going around the state ta...
Name: Statement, Length: 12791, dtype: object

In [26]:
# Run clean() over every raw statement and store the result in a new column
df['Clean_Statement'] = df['Statement'].map(clean)


## Extract lexical + POS features:

In [27]:
from nltk import pos_tag, word_tokenize

In [28]:
def count_pos(text, tag_prefix):
    """Count the tokens of ``text`` whose POS tag starts with ``tag_prefix``.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return sum(1 for _, pos in tagged_tokens if pos.startswith(tag_prefix))

In [29]:


# Whitespace-split token lists of the cleaned statements, computed once and reused
clean_tokens = df['Clean_Statement'].str.split()

# Number of tokens per statement
df['word_count'] = clean_tokens.apply(len)
# Mean token length; the 1e-5 in the denominator avoids division by zero for empty statements
df['avg_word_len'] = clean_tokens.apply(lambda toks: sum(map(len, toks)) / (len(toks) + 1e-5))
# Number of tokens whose POS tag starts with 'JJ' (adjectives)
df['adj_count'] = df['Clean_Statement'].apply(count_pos, args=('JJ',))


## TF-IDF feature extraction (unigram + bigram):

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

In [31]:
# Vectorizer over unigrams and bigrams, capped at 5000 features
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
)
# NOTE(review): the vectorizer is fitted on every row of df -- confirm that is
# intended if the data is split into train/test sets later.
X_tfidf = tfidf.fit_transform(df['Clean_Statement'])


In [33]:


import os

# Persist the fitted vectorizer and the DataFrame with the added feature columns.
# joblib.dump() does not create missing parent directories, so make sure the
# models directory exists before writing into it.
os.makedirs('../models', exist_ok=True)
joblib.dump(tfidf, '../models/tfidf_vectorizer.pkl')
df.to_csv('../data/processed_liar_dataset.csv', index=False)
