In [23]:
# Standard libraries
import os
import string
import pickle

# Third-party libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Setup

# tqdm setup
tqdm.pandas()  # activeert progress_apply

# NLTK downloads
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

# Constants
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/pietervanbrakel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pietervanbrakel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/pietervanbrakel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
# Preprocessing functions

def lowercase_text(text: str) -> str:
    """
    Convert all characters in the input text to lowercase.
    
    Args:
        text (str): A single string of text.

    Returns:
        str: Lowercased text.
    """
    return text.lower()


def remove_punctuation(text: str) -> str:
    """
    Remove all punctuation characters from the input text.
    
    Args:
        text (str): A single string of text.

    Returns:
        str: Text without punctuation.
    """
    return "".join([c for c in text if c not in string.punctuation])


def tokenize_text(text: str) -> list:
    """
    Split text into individual word tokens.
    
    Args:
        text (str): A single string of text.

    Returns:
        list: List of word tokens.
    """
    return word_tokenize(text)


def remove_stopwords(tokens: list) -> list:
    """
    Remove English stopwords from a list of tokens.
    
    Args:
        tokens (list): List of word tokens.

    Returns:
        list: List of tokens excluding stopwords.
    """
    return [t for t in tokens if t not in stop_words]


def lemmatize_tokens(tokens: list) -> list:
    """
    Lemmatize each token to its base/dictionary form.
    
    Args:
        tokens (list): List of word tokens.

    Returns:
        list: List of lemmatized tokens.
    """
    return [lemmatizer.lemmatize(t) for t in tokens]


def preprocess_text(text: str) -> list:
    """
    Apply full preprocessing pipeline to text: lowercase, remove punctuation,
    tokenize, remove stopwords, and lemmatize.
    
    Args:
        text (str): A single string of text.

    Returns:
        list: Preprocessed list of tokens.
    """
    text = lowercase_text(text)
    text = remove_punctuation(text)
    tokens = tokenize_text(text)
    tokens = remove_stopwords(tokens)
    tokens = lemmatize_tokens(tokens)
    return tokens

# Function to save data

def save_pickle(obj, filepath: str):
    """
    Save a Python object to a pickle file.
    
    Args:
        obj: Python object to save (e.g., list, DataFrame, Series).
        filepath (str): Path to the output pickle file.
    """
    with open(filepath, "wb") as f:
        pickle.dump(obj, f)

In [32]:
# Configurable data path

DATA_RAW = "../data/raw"  # <-- change this path if needed
DATA_FILE = "WELFake_Dataset.csv"
DATA_PATH = os.path.join(DATA_RAW, DATA_FILE)

DATA_PROCESSED = "../data/processed"  # <-- change this folder if needed
os.makedirs(DATA_PROCESSED, exist_ok=True)
X_TRAIN_FILE = os.path.join(DATA_PROCESSED, "X_train.pkl")
X_TEST_FILE = os.path.join(DATA_PROCESSED, "X_test.pkl")
Y_TRAIN_FILE = os.path.join(DATA_PROCESSED, "y_train.pkl")
Y_TEST_FILE = os.path.join(DATA_PROCESSED, "y_test.pkl")

In [None]:
# Load dataset
df = pd.read_csv(DATA_PATH, index_col=0)
df = df.dropna()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42)

# 7️⃣ Preprocessing
X_train_tokens = X_train.progress_apply(preprocess_text)
X_test_tokens = X_test.progress_apply(preprocess_text)

100%|██████████| 57229/57229 [02:04<00:00, 459.76it/s]
100%|██████████| 14308/14308 [00:31<00:00, 454.87it/s]


In [33]:
# Save all datasets

save_pickle(X_train_tokens, X_TRAIN_FILE)
save_pickle(X_test_tokens, X_TEST_FILE)
save_pickle(y_train, Y_TRAIN_FILE)
save_pickle(y_test, Y_TEST_FILE)

print(f"Preprocessed data saved in {DATA_PROCESSED}")


Preprocessed data saved in ../data/processed
