In [39]:
import nltk, os
nltk.data.path.append(r"c:\Users\Sanjeev\Downloads\MentalHealthDetection\notebook\nltk_data")


In [40]:
from nltk.corpus import wordnet
print(wordnet.synsets("happy")[:3])


[Synset('happy.a.01'), Synset('felicitous.s.01'), Synset('glad.s.01')]


In [41]:
import nltk, os
nltk.data.path.append(r"c:\Users\Sanjeev\Downloads\MentalHealthDetection\notebook\nltk_data")


In [3]:
# üß† 01_Data_Preprocessing_NLTK.ipynb
# Author: Reckless_Babu
# Description: Clean text data for Mental Health Detection using NLTK.

# === Imports ===
import pandas as pd
import numpy as np
import re
import os
import nltk

# === Ensure nltk_data path ===
nltk.data.path.append(r"c:\Users\Sanjeev\Downloads\MentalHealthDetection\notebook\nltk_data")

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# === Initialize ===
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# === Load Dataset ===
data_path = "../data/raw/data_to_be_cleansed.csv"
df = pd.read_csv(data_path)

print(f"‚úÖ Dataset loaded successfully! Shape: {df.shape}")
print(df.head())

# === Cleaning Function ===
def clean_text(text):
    # Convert to lowercase
    text = str(text).lower()
    
    # Remove URLs and mentions
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'@\w+', '', text)
    
    # Keep only alphabets and spaces
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove stopwords and short words
    tokens = [word for word in tokens if word not in stop_words and len(word) > 1]
    
    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back to string
    return " ".join(tokens)

# === Apply Cleaning ===
print("üßπ Cleaning text... (This may take a few minutes)")
df['clean_text'] = df['text'].astype(str).apply(clean_text)
print("‚úÖ Cleaning completed!")

# === Drop Empty or NaN Texts ===
df = df[df['clean_text'].astype(str).str.strip().astype(bool)].dropna(subset=['clean_text'])

# === Save Cleaned Dataset ===
output_path = "../data/processed/cleaned_reddit_data.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)

print(f"‚úÖ Cleaned data saved to: {output_path}")
print(f"üìä Final dataset shape: {df.shape}")

# === Preview Cleaned Data ===
df[['text', 'clean_text']].head(10)


‚úÖ Dataset loaded successfully! Shape: (5957, 4)
   Unnamed: 0                                               text  \
0           0  Welcome to /r/depression's check-in post - a p...   
1           1  We understand that most people who reply immed...   
2           2  Anyone else just miss physical touch? I crave ...   
3           3  I‚Äôm just so ashamed. Everyone and everything f...   
4           4  I really need a friend. I don't even have a si...   

                                               title  target  
0  Regular check-in post, with information about ...       1  
1  Our most-broken and least-understood rules is ...       1  
2  I haven‚Äôt been touched, or even hugged, in so ...       1  
3                    Being Depressed is Embarrassing       1  
4  I'm desperate for a friend and to feel loved b...       1  
üßπ Cleaning text... (This may take a few minutes)
‚úÖ Cleaning completed!
‚úÖ Cleaned data saved to: ../data/processed/cleaned_reddit_data.csv
üìä Final dat

Unnamed: 0,text,clean_text
0,Welcome to /r/depression's check-in post - a p...,welcome rdepressions checkin post place take m...
1,We understand that most people who reply immed...,understand people reply immediately op invitat...
2,Anyone else just miss physical touch? I crave ...,anyone else miss physical touch crave badly
3,I‚Äôm just so ashamed. Everyone and everything f...,im ashamed everyone everything feel far away e...
4,I really need a friend. I don't even have a si...,really need friend dont even single best frien...
5,Hear me out... life in general sucks. We have ...,hear life general suck work majority time job ...
6,Never in a million years did I think I‚Äôd be on...,never million year think id reddit writing som...
7,"Hi!! \n\nI want to preface by saying, i‚Äôm sorr...",hi want preface saying im sorry know completel...
8,I‚Äôm 40(M) and I‚Äôve always maintained that I‚Äôm ...,im ive always maintained im ugly woman hasnt c...
9,I used to get through my life by believing in ...,used get life believing delusion thing going w...
