In [2]:
# 2_text_processing.ipynb

import pandas as pd
import re
import nltk
from pathlib import Path
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Make sure the NLTK corpora used below are available locally
for _resource in ("stopwords", "wordnet"):
    nltk.download(_resource)

# Project root is taken to be the parent of the current working directory
BASE_PATH = Path.cwd().parent
PROCESSED_PATH = BASE_PATH.joinpath("data", "processed")

# Read the training CSV from the processed-data folder
train_data_path = PROCESSED_PATH.joinpath("train_raw_copy.csv")
df = pd.read_csv(filepath_or_buffer=train_data_path)

# Shared objects used by clean_text: a lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words("english")}

def clean_text(text):
    """Normalize one document into a space-joined string of lemmatized tokens.

    Steps: lowercase the text, delete every character that is not ``a-z``
    or whitespace, split on whitespace, drop tokens found in the
    module-level ``stop_words`` set, and lemmatize the remaining tokens
    with the module-level ``lemmatizer``.

    Args:
        text: The raw document. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned text, or ``""`` when the input is missing.
    """
    # Missing values would otherwise be stringified by str() and leak the
    # literal tokens "none" / "nan" into the cleaned text.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lowered = str(text).lower()
    # Characters are deleted rather than replaced by a space, so
    # "well-known" becomes "wellknown".
    letters_only = re.sub(r"[^a-z\s]", "", lowered)
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in letters_only.split()
        if token not in stop_words
    )

# Run clean_text over every row of "content" and keep the result as a new column
cleaned_column = df["content"].apply(clean_text)
df["clean_content"] = cleaned_column

# Write the augmented frame next to the input file, without the index column
cleaned_file = PROCESSED_PATH.joinpath("train_cleaned.csv")
df.to_csv(path_or_buf=cleaned_file, index=False)

print(f"✅ Saved text-processed data to: {cleaned_file}")


[nltk_data] Downloading package stopwords to C:\Users\Acer Nitro
[nltk_data]     R5\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Acer Nitro
[nltk_data]     R5\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


✅ Saved text-processed data to: C:\Users\Acer Nitro R5\Desktop\Py Projects\fake-news-detector\data\processed\train_cleaned.csv
