In [None]:
# !pip install nltk

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.8.29-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Downloading regex-2025.8.29-cp312-cp312-win_amd64.whl (275 kB)
Using cached click-8.2.1-py3-none-any.whl (102 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.2.1 nltk-3.9.1 regex-2025.8.29 tqdm-4.67.1



[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
os.getcwd()

'd:\\Learning\\use_cases\\12.nlp\\nlp\\01.fake_news_detection\\src'

Importing the Fake News data with labels

In [19]:
from pathlib import Path
import pandas as pd
import re
import string

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords

pd.set_option("display.max_columns", None)
# pd.set_option('display.max_colwidth', 100)
stopwords = nltk.corpus.stopwords.words('english')

# Get current working directory (where you started Python/Jupyter)
cwd = Path.cwd()

# Go one level up from src/ to project root
project_root = cwd.parent

# Data folder path
data_dir = project_root / "data"

#Load the data
data = pd.read_csv(data_dir / "SMSSpamCollection.tsv", sep='\t') 
data.columns = ['label', 'body_text']

data.head()

Unnamed: 0,label,body_text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
1,ham,"Nah I don't think he goes to usf, he lives around here though"
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...


Custom Preprocessing Function

In [27]:
# Initialize reusable objects
STEMMER = PorterStemmer()
LEMMATIZER = WordNetLemmatizer()
TOKENIZER = RegexpTokenizer(r"\w+")
STOP_WORDS = stopwords

# ---------- INDIVIDUAL FUNCTIONS ---------- #

def remove_html_tags(text: str) -> str:
    """Remove HTML tags using regex."""
    return re.sub(r"<.*?>", "", text)

def remove_urls(text: str) -> str:
    """Remove URLs from the text."""
    return re.sub(r"http\S+|www\S+", "", text)

def remove_numbers(text: str) -> str:
    """Remove digits from the text."""
    return re.sub(r"\d+", "", text)

def remove_placeholders(text: str) -> str:
    """Remove XXXX placeholders."""
    return re.sub(r"\bX{2,}\b", "", text, flags=re.IGNORECASE)

def tokenize(text: str):
    """Tokenize text into words (alphanumeric only)."""
    return TOKENIZER.tokenize(text)

def remove_stopwords(tokens: list, stop_words=STOP_WORDS) -> list:
    """Remove stopwords and keep words with length > 2."""
    return [w for w in tokens if w not in stop_words and len(w) > 2]

def normalize(tokens: list, use_stemming: bool = True) -> list:
    """
    Normalize tokens using stemming or lemmatization.
    
    Parameters
    ----------
    tokens : list
        List of words to normalize.
    use_stemming : bool
        If True, apply PorterStemmer; otherwise, use WordNetLemmatizer.
    """
    if use_stemming:
        return [STEMMER.stem(w) for w in tokens]
    else:
        return [LEMMATIZER.lemmatize(w) for w in tokens]

# ---------- MAIN PIPELINE FUNCTION ---------- #

def preprocess_text(text: str, use_stemming: bool = True) -> dict:
    """
    Run full preprocessing pipeline on a single text.
    
    Parameters
    ----------
    text : str
        Raw input text.
    use_stemming : bool, optional (default=True)
        If True, applies stemming. If False, applies lemmatization.
        
    Returns
    -------
    dict
        Dictionary containing intermediate steps and final output.
    """
    text = str(text).lower().replace("{html}", "")
    
    clean_html = remove_html_tags(text)
    clean_url = remove_urls(clean_html)
    clean_num = remove_numbers(clean_url)
    clean_placeholder = remove_placeholders(clean_num)
    tokens = tokenize(clean_placeholder)
    filtered = remove_stopwords(tokens)
    normalized = normalize(filtered, use_stemming)
    
    return {
        "clean_html": clean_html,
        "clean_url": clean_url,
        "clean_num": clean_num,
        "tokens": tokens,
        "filtered": filtered,
        "normalized": normalized,
        "final_text": " ".join(normalized)
    }

def transform_dataframe(df: pd.DataFrame, text_col: str, use_stemming: bool = True) -> pd.DataFrame:
    """
    Apply preprocessing pipeline to a DataFrame column.
    Adds intermediate results as new columns.
    
    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame with text data.
    text_col : str
        Column name containing raw text.
    use_stemming : bool, optional (default=True)
        If True, applies stemming. If False, applies lemmatization.
        
    Returns
    -------
    pd.DataFrame
        DataFrame with added preprocessing columns.
    """
    results = df[text_col].apply(lambda x: preprocess_text(x, use_stemming=use_stemming))
    result_df = pd.json_normalize(results)
    return pd.concat([df, result_df], axis=1)

# Use lemmatization instead of stemming
processed_df = transform_dataframe(data, text_col="body_text", use_stemming=False)

processed_df.head()


Unnamed: 0,label,body_text,clean_html,clean_url,clean_num,tokens,filtered,normalized,final_text
0,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005. text fa to 87121 to receive ...,free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005. text fa to 87121 to receive ...,free entry in a wkly comp to win fa cup final tkts st may . text fa to to receive entry questi...,"[free, entry, in, a, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, recei...","[free, entry, wkly, comp, win, cup, final, tkts, may, text, receive, entry, question, std, txt, ...","[free, entry, wkly, comp, win, cup, final, tkts, may, text, receive, entry, question, std, txt, ...",free entry wkly comp win cup final tkts may text receive entry question std txt rate apply
1,ham,"Nah I don't think he goes to usf, he lives around here though","nah i don't think he goes to usf, he lives around here though","nah i don't think he goes to usf, he lives around here though","nah i don't think he goes to usf, he lives around here though","[nah, i, don, t, think, he, goes, to, usf, he, lives, around, here, though]","[nah, think, goes, usf, lives, around, though]","[nah, think, go, usf, life, around, though]",nah think go usf life around though
2,ham,Even my brother is not like to speak with me. They treat me like aids patent.,even my brother is not like to speak with me. they treat me like aids patent.,even my brother is not like to speak with me. they treat me like aids patent.,even my brother is not like to speak with me. they treat me like aids patent.,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]",even brother like speak treat like aid patent
3,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,i have a date on sunday with will!!,i have a date on sunday with will!!,i have a date on sunday with will!!,"[i, have, a, date, on, sunday, with, will]","[date, sunday]","[date, sunday]",date sunday
4,ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...,as per your request 'melle melle (oru minnaminunginte nurungu vettam)' has been set as your call...,as per your request 'melle melle (oru minnaminunginte nurungu vettam)' has been set as your call...,as per your request 'melle melle (oru minnaminunginte nurungu vettam)' has been set as your call...,"[as, per, your, request, melle, melle, oru, minnaminunginte, nurungu, vettam, has, been, set, as...","[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, callers, pr...","[per, request, melle, melle, oru, minnaminunginte, nurungu, vettam, set, callertune, caller, pre...",per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy frie...
