In [1]:
pip install --upgrade kaggle


Collecting kaggle
  Downloading kaggle-1.8.2-py3-none-any.whl.metadata (16 kB)
Collecting black>=24.10.0 (from kaggle)
  Downloading black-25.12.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (86 kB)
Collecting kagglesdk (from kaggle)
  Downloading kagglesdk-0.1.13-py3-none-any.whl.metadata (13 kB)
Collecting mypy>=1.15.0 (from kaggle)
  Downloading mypy-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (2.2 kB)
Collecting protobuf (from kaggle)
  Downloading protobuf-6.33.2-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Collecting python-slugify (from kaggle)
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting tqdm (from kaggle)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting types-requests (from kaggle)
  Downloading types_requests-2.32.4.20250913-py3-none-any.whl.metadata (2.0 kB)
Collecting types-tqdm (from kaggle)
  Downloadin

In [2]:
import pandas as pd
import numpy as np
import re
import string


In [3]:
def clean_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+", "", text)     # remove URLs
    text = re.sub(r"\d+", "", text)         # remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [5]:
file_path = "../data/mental_disorders_reddit.csv"

chunks = pd.read_csv(
    file_path,
    usecols=["title", "selftext", "created_utc", "over_18", "subreddit"],
    chunksize=20000
)

processed_chunks = []


In [6]:
for chunk in chunks:
    # Rename columns
    chunk = chunk.rename(columns={
        "selftext": "post_text",
        "over_18": "nsfw"
    })

    # Combine title + body
    chunk["full_text"] = chunk["title"].fillna("") + " " + chunk["post_text"].fillna("")

    # Remove deleted / removed / empty
    chunk = chunk[
        chunk["full_text"].notna() &
        (chunk["full_text"].str.strip() != "") &
        (~chunk["full_text"].str.lower().str.contains("deleted")) &
        (~chunk["full_text"].str.lower().str.contains("removed"))
    ]

    # Clean text
    chunk["clean_text"] = chunk["full_text"].apply(clean_text)

    # Word count filter (THIS is what crashed before)
    chunk["word_count"] = chunk["clean_text"].apply(lambda x: len(x.split()))
    chunk = chunk[chunk["word_count"] >= 15]

    # Create topic label
    chunk["mental_health_topic"] = chunk["subreddit"].astype(str).str.lower()

    processed_chunks.append(
        chunk[[
            "clean_text",
            "mental_health_topic",
            "created_utc",
            "nsfw",
            "word_count"
        ]]
    )


In [7]:
df_clean = pd.concat(processed_chunks, ignore_index=True)
df_clean.shape


(570004, 5)

In [8]:
clean_path = "../data/mental_health_reddit_clean.csv"
df_clean.to_csv(clean_path, index=False)

print("✅ Clean NLP dataset saved to:", clean_path)


✅ Clean NLP dataset saved to: ../data/mental_health_reddit_clean.csv
