In [1]:
import os
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from nltk import punkt
from langdetect import detect, DetectorFactory


In [None]:
nltk_data_path = os.path.join(os.getcwd(), "nltk_data")
nltk.data.path.append(nltk_data_path)

nltk.download("punkt_tab", download_dir=nltk_data_path)
nltk.download("stopwords", download_dir=nltk_data_path)

data_folder_path = "../references/data/"
cleaned_folder_path = "../references/data_cleaned"
os.makedirs(cleaned_folder_path, exist_ok=True)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     c:\Users\carru\Desktop\css project\venv\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\carru\Desktop\css project\venv\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
stop_words = set(stopwords.words('english'))

# basic detection
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

def clean_text(text):
    """
    Includes all the best practices for text cleaning necessary for the main analysis: 
    - Lowercasing
    - Removing URLs
    - Removing brackets and their content
    - Removing punctuation
    - Tokenization
    """
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)      
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(rf"[{re.escape(string.punctuation)}]", "", text)  
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    return " ".join(tokens)

for file in os.listdir(data_folder_path):
    if file.endswith(".csv"):
        file_path = os.path.join(data_folder_path, file)
        df = pd.read_csv(file_path)
        print(file)
        print(f"\n Cleaning: {file}")

        # Identify whether it's a post or comment file
        if "post" in file:
            text_col = "selftext"
            id_col = "post_id"
        elif "comment" in file:
            text_col = "body"
            id_col = "comment_id"
        else:
            print(" Skipping unrecognized file:", file)
            continue

        original_len = len(df)

        #  Drop [deleted] and [removed]
        df = df[~df[text_col].isin(["[deleted]", "[removed]"])]

        #  Drop short/low-effort content (fewer than 5 words)
        df = df[df[text_col].astype(str).str.split().str.len() >= 5]

        # Keep only English-language texts
        df = df[df[text_col].astype(str).apply(is_english)]

        #  Remove duplicates based on ID column
        df = df.drop_duplicates(subset=[id_col])

        # apply cleaning function
        df["clean_text"] = df[text_col].apply(clean_text)

        cleaned_len = len(df)
        print(f" Cleaned {file}: {original_len} → {cleaned_len} entries")

        # Save cleaned version
        cleaned_path = os.path.join(cleaned_folder_path, f"cleaned_{file}")
        df.to_csv(cleaned_path, index=False)
        print(f" Saved to: {cleaned_path}")

all_comments_combined.csv

 Cleaning: all_comments_combined.csv
 Cleaned all_comments_combined.csv: 11310 → 10588 entries
 Saved to: ../references/data_cleaned\cleaned_all_comments_combined.csv
all_posts_combined.csv

 Cleaning: all_posts_combined.csv
 Cleaned all_posts_combined.csv: 4822 → 4650 entries
 Saved to: ../references/data_cleaned\cleaned_all_posts_combined.csv
Anxietyhelp_comments.csv

 Cleaning: Anxietyhelp_comments.csv
 Cleaned Anxietyhelp_comments.csv: 2799 → 2603 entries
 Saved to: ../references/data_cleaned\cleaned_Anxietyhelp_comments.csv
Anxietyhelp_posts.csv

 Cleaning: Anxietyhelp_posts.csv
 Cleaned Anxietyhelp_posts.csv: 974 → 872 entries
 Saved to: ../references/data_cleaned\cleaned_Anxietyhelp_posts.csv
Anxiety_comments.csv

 Cleaning: Anxiety_comments.csv
 Cleaned Anxiety_comments.csv: 2739 → 2571 entries
 Saved to: ../references/data_cleaned\cleaned_Anxiety_comments.csv
Anxiety_posts.csv

 Cleaning: Anxiety_posts.csv
 Cleaned Anxiety_posts.csv: 936 → 917 entrie