In [None]:
!pip install contractions


In [None]:
import pandas as pd
import copy

import re
import spacy
import contractions
# Shared spaCy pipeline used by clean_text. The parser and NER components are
# disabled: clean_text only reads token.is_stop and token.lemma_.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

In [None]:
# Read the raw posts from the hard-coded CSV path into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/reddit_mental_health_dataset.csv')

# Trailing bare expression: rendered as the cell's output.
df

#### **Output:** Dataset Sample (post_id removed)
##### Shape: `15913 rows × 4 columns`  
---

##### Schema
| Column     | Description                                         |
|------------|-----------------------------------------------------|
| subreddit  | Source subreddit (e.g., ADHD, SuicideWatch)         |
| title      | Post title                                          |
| body       | Post body text                                      |
| label      | Class label (one of 8 classes, including `Normal`)  |

---

#### Sample Rows

| subreddit      | title                                         | body                                                         | label     |
|----------------|-----------------------------------------------|--------------------------------------------------------------|-----------|
| ADHD           | Small Success: I finally filled the ice cube… | I prefer my coffee iced, even in the Winter, b…              | ADHD      |
| ADHD           | Adderall shivers/shakiness                    | Has anybody experienced shivering/shaking late…              | ADHD      |
| ADHD           | Could my constant mental exhaustion be rela…  | As of late I've been feeling super mentally ex…              | ADHD      |
| ADHD           | I’ve been getting into a consistent nightt…   | I can’t be the only one who feels this way I l…              | ADHD      |
| SuicideWatch   | Why cant i leave                              | I know a lot of people call suicide selfish an…              | Suicidal  |


In [None]:
# Merge title and body into one 'text' column (missing values are treated as
# empty strings) and keep only the model input and its label.
# NOTE: `df` is rebound to the two-column frame, so re-running this cell
# without re-loading the CSV fails because 'title'/'body' no longer exist.
df = (
    df
    .assign(text=lambda frame: frame['title'].fillna('') + " " + frame['body'].fillna(''))
    [['text', 'label']]
)
df

# **Text Cleaning Pipeline**
We design **two cleaning modes** depending on the model family.

---

## Common Cleaning (applied to all models)
- Lowercase text
- Remove URLs (`http...`)  
- Remove Reddit mentions (`u/username`) and subreddit refs (`r/subreddit`)  
- Remove HTML tags (`<br>`, `<p>`)  
- Remove Markdown emphasis markers (`**`, `*`, `__`, `~~`)  
- Normalize whitespace  

---

## ML Mode (`mode="ml"`)
For **Logistic Regression, SVM, RandomForest, BiLSTM** (TF-IDF / embeddings):
- **ML family → aggressive normalization**    
- Expand contractions  
- Remove non-alphabetic characters  
- Normalize elongated words
- Expand abbreviations
- Lemmatize tokens and remove stopwords (but keep negations)  
  
---

## BERT Mode (`mode="bert"`)
For **Transformers (BERT, DistilBERT, etc.)**:  
- **BERT family → minimal cleaning**  
- Keep punctuation, emojis, and stopwords  
- Apply only *common cleaning* (remove noise, not meaning)  

---


In [None]:
# Chat/slang abbreviations and the plain-English phrase each one is replaced
# with during ML-mode cleaning. Insertion order is the order in which the keys
# are joined into the abbreviation regex.
abbr_dict = dict(
    idk="i do not know",
    tbh="to be honest",
    smh="shaking my head",
    fml="feeling very bad",
    imo="in my opinion",
    ikr="i know right",
    btw="by the way",
    lol="laughing out loud",
    omg="oh my god",
    brb="be right back",
    lmao="laughing a lot",
    np="no problem",
    fyi="for your information",
)

# Negation words that must survive stop-word removal.
negations = set("no not never nor".split())

In [None]:
def clean_text(text, mode):
  """Normalize a raw Reddit post for the requested model family.

  Common cleaning (both modes): lowercase, then strip URLs, HTML tags, Reddit
  user mentions (u/name), subreddit references (r/name) and Markdown emphasis
  markers, and finally collapse whitespace.

  Args:
    text: Raw post text. Any object is accepted and converted with str().
    mode: 'bert' returns the text after the common cleaning only. 'ml'
      additionally expands contractions and abbreviations, drops
      non-alphabetic characters, shortens elongated words, removes stop words
      (the words in `negations` are kept) and lemmatizes the remaining tokens.

  Returns:
    The cleaned text as a single string.

  Raises:
    ValueError: If mode is neither 'ml' nor 'bert'.
  """
  # Fail fast on a bad mode instead of doing the regex work first.
  if mode not in ('ml', 'bert'):
    raise ValueError("Mode must be 'ml' or 'bert' ")

  text = str(text)

  # ---- Common cleaning for all models ----
  text = text.lower()                              # to lowercase
  text = re.sub(r'http\S+', ' ', text)             # remove URLs
  text = re.sub(r'<.*?>', ' ', text)               # remove HTML tags (rare in Reddit posts)
  # The leading \b keeps ordinary slash-separated words intact: without it
  # "you/me" lost "u/me" and "his/her/their" lost "r/their".
  text = re.sub(r'\bu/[a-z0-9_-]+', ' ', text)     # remove user mentions
  text = re.sub(r'\br/[a-z0-9_-]+', ' ', text)     # remove subreddit references
  text = re.sub(r'\*\*|\*|__|~~', ' ', text)       # remove Markdown emphasis markers
  text = re.sub(r'\s+', ' ', text).strip()         # normalize whitespace

  if mode == 'bert':
    # Minimal cleaning only: punctuation, digits and stop words are kept.
    return text

  # ---- Extra normalization for mode == 'ml' ----
  text = contractions.fix(text)                    # "i don't like this" -> "i do not like this"
  text = re.sub(r'[^a-z\s]', ' ', text)            # keep letters and whitespace only
  # Cap any run of 3+ identical letters at two ("sooo" -> "soo"), so genuine
  # double letters such as "good" are left alone.
  text = re.sub(r'([a-z])\1{2,}', r'\1\1', text)

  # Expand whole-word abbreviations, e.g. r'\b(idk|tbh|...)\b'.
  # Keys are escaped so they are always matched literally; the guard avoids an
  # empty alternation, which would match the empty string and raise KeyError.
  if abbr_dict:
    abbr_pattern = re.compile(
        r'\b(' + '|'.join(re.escape(key) for key in abbr_dict) + r')\b'
    )
    text = abbr_pattern.sub(lambda match: abbr_dict[match.group()], text)

  # Punctuation removal leaves runs of spaces behind; collapse them so spaCy
  # does not produce whitespace tokens that would leak into the joined output.
  text = re.sub(r'\s+', ' ', text).strip()

  # Drop stop words (except negations) and lemmatize what remains.
  doc = nlp(text)
  kept_lemmas = [
      token.lemma_
      for token in doc
      if not token.is_stop or token.text in negations
  ]
  return " ".join(kept_lemmas)


In [None]:
# Build one cleaned variant of the corpus per model family. `assign` returns a
# new frame each time, so the source `df` is left unmodified.
df_cleaned_ml = df.assign(text=df['text'].apply(clean_text, mode='ml'))
df_cleaned_bert = df.assign(text=df['text'].apply(clean_text, mode='bert'))

In [None]:
# Display the ML-mode result (text after clean_text(..., mode='ml')).
df_cleaned_ml

##### **Output:** df_cleaned_ml dataframe

|   | text                                                                 | label     |
|---|----------------------------------------------------------------------|-----------|
| 0 | small success finally fill ice cube tray pre...                      | ADHD      |
| 1 | adderall shiver shakiness anybody experience s...                    | ADHD      |
| 2 | constant mental exhaustion relate undereate ...                      | ADHD      |
| 3 | get consistent nighttime route finally hate no...                    | ADHD      |
| 4 | actual lifestyle advice tired post subreddit s...                    | ADHD      |
| … | …                                                                    | …         |
| 15908 | not leave know lot people suicide selfish hone...                 | Suicidal  |
| 15909 | ahh ahhim freaking world help                                      | Suicidal  |
| 15910 | lose girlfriend year lose yesterday hang b...                      | Suicidal  |
| 15911 | lose friend trigger friend recently move text ...                  | Suicidal  |
| 15912 | m feel lose feel discouraged tired comp...                         | Suicidal  |

15913 rows × 2 columns


In [None]:
# Display the BERT-mode result (text after clean_text(..., mode='bert')).
df_cleaned_bert

#### **Output:** df_cleaned_bert dataframe

|   | text                                                                 | label     |
|---|----------------------------------------------------------------------|-----------|
| 0 | small success: i finally filled the ice cube t...                    | ADHD      |
| 1 | adderall shivers/shakiness has anybody experie...                    | ADHD      |
| 2 | could my constant mental exhaustion be related...                    | ADHD      |
| 3 | i’ve been getting into a consistent nighttime ...                    | ADHD      |
| 4 | actual lifestyle advice i’m so tired of every ...                    | ADHD      |
| … | …                                                                    | …         |
| 15908 | why cant i leave i know a lot of people call s...               | Suicidal  |
| 15909 | ahhhhhhhhhhhhhhhhhhhh ahhhhhhhhhim so freaking...               | Suicidal  |
| 15910 | just lost my girlfriend of almost 4 years i lo...                | Suicidal  |
| 15911 | might lose a friend and it’s triggering me my ...                | Suicidal  |
| 15912 | 30m feeling lost i feel discouraged, tired, an...                | Suicidal  |

15913 rows × 2 columns


## **Dataset Cleaning Info**
- Dataset: Reddit Mental Health Posts
- Cleaning Date: 09-16-2025
- Cleaning Modes:
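  - **BERT**: minimal cleaning (lowercase; remove URLs, HTML tags, user mentions, subreddit refs, and Markdown markers; normalize whitespace).
  - **ML**: BERT-mode cleaning plus contraction expansion, removal of non-alphabetic characters, elongated-word normalization, abbreviation expansion, stopword removal (negations kept), and lemmatization.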
- Columns: [text, label]


In [None]:
# Write each cleaned variant to a CSV in the current working directory,
# omitting the row index.
for frame, csv_name in (
    (df_cleaned_ml, 'reddit_cleaned_ml.csv'),
    (df_cleaned_bert, 'reddit_cleaned_bert.csv'),
):
    frame.to_csv(csv_name, index=False)