# Setup

In [None]:
#!pip install tensorflow gensim nltk scikit-learn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, SimpleRNN, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import gensim
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.set_printoptions(precision = 4, suppress = True)

In [None]:
import kagglehub

# Download selected version
path = kagglehub.dataset_download("nikhileswarkomati/suicide-watch/versions/13")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/suicide-watch


In [None]:
print(os.listdir(path)) # path is a directory

['SuicideAndDepression_Detection.csv']


In [None]:
file_path = os.path.join(path, "SuicideAndDepression_Detection.csv")
data = pd.read_csv(file_path)
data.head(20)

Unnamed: 0,text,class
0,Does life actually work for most / non-depressed people?It doesn't seem poss...,depression
1,"I found my friend's bodyIt was almost nine years ago now, but I still think ...",depression
2,Ex Wife Threatening SuicideRecently I left my wife for good because she has ...,SuicideWatch
3,Am I weird I don't get affected by compliments if it's coming from someone I...,teenagers
4,"Finally 2020 is almost over... So I can never hear ""2020 has been a bad year...",teenagers
5,"Reddit, I've never opened up to anyone with my life problems as much i am no...",depression
6,Somebody help me.I just had a terrible episode tonight. I feel hollow inside...,depression
7,I can't do this anymoreI've hidden away all summer in my room and I can't ev...,depression
8,i need helpjust help me im crying so hard,SuicideWatch
9,"I’m so lostHello, my name is Adam (16) and I’ve been struggling for years an...",SuicideWatch


# Data Cleaning & Preparation

In [None]:
# Three classes in total, all balanced.
data['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
SuicideWatch,116037
teenagers,116037
depression,116036


In [None]:
# Missing data only accounts for a extremely small part of the whole dataset.
data_len = len(data)
text_missing = data['text'].isna().sum() / data_len
print("Percentage of missing text data: ", "less than 1%" if text_missing < 0.01 else "{:.4f}".format(x))

class_missing = data['class'].isna().sum() / data_len
print("Percentage of missing class data: ", "less than 1%" if class_missing < 0.01 else "{:.4f}".format(x))

Percentage of missing text data:  less than 1%
Percentage of missing class data:  less than 1%


In [None]:
# Drop NA values, reset index
data.dropna(inplace = True)
data.reset_index(drop = True, inplace = True)

print(data['text'].isna().sum())
print(data['class'].isna().sum())

0
0


In [None]:
# No duplication in data
print(data['text'].duplicated().sum())

0


# Random sampling 12000 cases

In [None]:
classes = data['class'].unique()
class_size = 4000  # 12000 total, 3 classes

sampled_data = pd.DataFrame()
for cls in classes:
    class_data = data[data['class'] == cls]
    sampled_class_data = class_data.sample(n = class_size, random_state = 64)
    sampled_data = pd.concat([sampled_data, sampled_class_data])

# Reset Index
sampled_data.reset_index(drop = True, inplace = True)

In [None]:
sampled_data.to_csv('sampled_data_12000.csv', index = False)

# Data Augmentation

In [None]:
!pip install praw
!pip install asyncpraw

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0
Collecting asyncpraw
  Downloading asyncpraw-7.8.1-py3-none-any.whl.metadata (9.0 kB)
Collecting aiofiles (from asyncpraw)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting aiosqlite<=0.17.0 (from asyncpraw)
  Downloading aiosqlite-0.17.0-py3-none-any.whl.meta

In [None]:
import praw
import asyncio
import time
import random
from tqdm import tqdm
import csv

# Reddit API
reddit = praw.Reddit(
    client_id='client_id',
    client_secret='client_secret',
    user_agent='reddit-data-collector'
)

excluded_subreddits = {'suicideWatch', 'depression', 'teenagers'}

def collect_random_posts(total_posts=4000, batch_size=100):
    all_posts = []
    pbar = tqdm(total=total_posts, desc="Collecting posts")

    # List of subreddits to pull from (popular ones but avoiding excluded ones)
    subreddits_to_try = [
        'AskReddit', 'worldnews', 'science', 'pics', 'gaming', 'movies',
        'todayilearned', 'explainlikeimfive', 'books', 'Showerthoughts',
        'LifeProTips', 'mildlyinteresting', 'food', 'EarthPorn', 'space',
        'news', 'DIY', 'sports', 'Art', 'dataisbeautiful', 'history',
        'gadgets', 'GetMotivated', 'askscience', 'tifu', 'jokes'
    ]

    while len(all_posts) < total_posts:
        try:
            subreddit_name = random.choice(subreddits_to_try)
            subreddit = reddit.subreddit(subreddit_name)

            for submission in subreddit.hot(limit=batch_size):
                if submission.subreddit.display_name.lower() in {s.lower() for s in excluded_subreddits}:
                    continue

                if not submission.selftext.strip():  # Skip posts with empty content
                    continue

                post_data = {
                    'post_id': submission.id,
                    'subreddit': submission.subreddit.display_name,
                    'title': submission.title,
                    'content': submission.selftext,
                    'url': submission.url,
                    'score': submission.score,
                    'created_utc': submission.created_utc,
                    'label': 'noise'
                }

                all_posts.append(post_data)
                pbar.update(1)

                if len(all_posts) >= total_posts:
                    break

            time.sleep(1)

        except Exception as e:
            print(f"Error: {e}")
            time.sleep(5)

    pbar.close()
    return all_posts[:total_posts]

In [None]:
# Collect posts and create DataFrame
posts = collect_random_posts(total_posts=4000)
df = pd.DataFrame(posts)

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

In [34]:
original = pd.read_csv('sampled_data_12000.csv')
augmentation = df.rename(columns={'content': 'text', 'label': 'class'})
augmentation = augmentation[['text', 'class']]

combined_df = pd.concat([original, augmentation], ignore_index=True)
combined_df.to_csv('sampled_data_16000.csv', index=False)

# Preprocessing

In [None]:
df = pd.read_csv('sampled_data_16000.csv')

In [None]:
# Lowercasing
df['text'] = df['text'].str.lower()

In [None]:
# Normalizing
df['text'] = df['text'].str.replace("’", "'")

In [None]:
import re
# Remove Emails, Urls


def remove_url(tokens):
    cleaned_tokens = [re.sub(r'http\S+', '', token) for token in tokens]
    return cleaned_tokens

def remove_mail(tokens):
    cleaned_tokens = [re.sub(r'http\S+|www\.\S+', '', token) for token in tokens]
    return cleaned_tokens


df['text'] = df['text'].apply(remove_url)
df['text'] = df['text'].apply(remove_mail)

In [None]:
# Emoji
# Emoji may carry emotions, so instead of removing them, convert them to text

!pip install emoji
import emoji
def convert_emojis(text):
    return emoji.demojize(text, delimiters=(" ", " "))  # Converts 😞 → ":disappointed:"

df['text'] = df['text'].apply(convert_emojis)

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [None]:
# Replace Abbreviations

import re
abb = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "dont": "do not",
  "gonna": "going to",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "idk": "i do not know",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "i'd": "i would",
  "i'd've": "i would have",
  "i'll": "i will",
  "i'll've": "i will have",
  "i'm": "i am",
  "im": "i am",
  "i've": "i have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is", "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "rn": "right now",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have", "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "tho": "though",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have", "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have", "you'll": "you you will",
  "you'll've": "you you will have",
  "you're": "you are",
  "you've": "you have"
}

abb_re = re.compile('(%s)' % '|'.join(abb.keys()))

def expandContractions(text, abb_re=abb_re):
    def replace(match):
        return abb[match.group(0)]
    return abb_re.sub(replace, text)

df['text'] = df['text'].apply(expandContractions)

In [None]:
# Tokenization

#!pip uninstall nltk
!pip install nltkY
import nltk
from nltk.tokenize import word_tokenize
# Download 'punkt_tab' data package
nltk.download('punkt_tab')
nltk.download('punkt')

# Function to tokenize text
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# Apply tokenization to the text column in the DataFrame
df['tokens'] = df['text'].apply(tokenize_text)

[31mERROR: Could not find a version that satisfies the requirement nltkY (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for nltkY[0m[31m
[0m

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Remove Stop Words(?), Punctuations, and Numbers

from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    tokens_without_stopwords = [
        token for token in tokens if token.lower() not in stop_words]
    return tokens_without_stopwords

def remove_punctuation(tokens):
    tokens_without_punctuation = [token for token in tokens if token.isalnum()]
    return tokens_without_punctuation

def remove_numbers(tokens):
    tokens_without_numbers = [token for token in tokens if not token.isdigit()]
    return tokens_without_numbers

# df['tokens'] = df['tokens'].apply(remove_stopwords)
# Maybe not remove stopwords because words like "no", "not", "wasn't" is important in this problem
df['tokens'] = df['tokens'].apply(remove_punctuation)
df['tokens'] = df['tokens'].apply(remove_numbers)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Remove White Spaces, Symbols, Digits & Special Characters

def remove_whitespace(tokens):
    cleaned_tokens = [token.strip() for token in tokens]
    return cleaned_tokens

def remove_symbols_digits(tokens):
    cleaned_tokens = [re.sub('[^a-zA-Z\s]', ' ', token) for token in tokens]
    return cleaned_tokens

def remove_special(tokens):
    cleaned_tokens = [token.replace("\r", " ").replace("\n", " ") for token in tokens]
    return cleaned_tokens



df['tokens'] = df['tokens'].apply(remove_whitespace)

# df['tokens'] = df['tokens'].apply(remove_symbols_digits)
df['tokens'] = df['tokens'].apply(lambda tokens: [re.sub('[^a-zA-Z!?]', '', token) for token in tokens])
# Maybe not remove "?" and "!" as these symbols carry emotional weight in this problem.

df['tokens'] = df['tokens'].apply(remove_special)

In [None]:
# Lemmatization

!python -m spacy download en_core_web_lg
import spacy
nlp = spacy.load("en_core_web_lg")

def lemmatize_tokens(tokens):
    # Join the tokens back into a sentence
    text = ' '.join(tokens)
    # Process the text using spaCy
    doc = nlp(text)
    # Lemmatize each token and return the lemmatized tokens
    lemmatized_tokens = [token.lemma_ for token in doc]
    return lemmatized_tokens

# Apply lemmatization to the tokenized text column in the DataFrame
df['tokens'] = df['tokens'].apply(lemmatize_tokens)

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


## Save Cleaned Dataset

In [None]:
processed_df = df.copy()
processed_df.to_csv('cleaned_data_16000.csv', index = False)
depression_df = df.copy()
teenager_df = df.copy()

In [None]:
processed_df.head(10)

Unnamed: 0,text,class,tokens
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression,"[can, I, get, some, support, please, so, I, be, not, as, depressed, as, I, u..."
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression,"[everything, be, go, wrong, have, be, try, not, to, drink, but, everyone, be..."
2,i am done fighting it.*gone*,depression,"[I, be, do, fight, it, go]"
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression,"[today, I, cut, my, hairmy, hair, have, always, be, a, thick, mess, of, curl..."
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression,"[I, do, not, know, what, to, do, and, I, have, no, hope, for, the, be, kinda..."
5,"tired of life, tired of living. do not know what to do hey guys, \n\ni am 16...",depression,"[tired, of, life, tired, of, living, do, not, know, what, to, do, hey, guy, ..."
6,what is one concrete thing that has helped you in your battle against depres...,depression,"[what, be, one, concrete, thing, that, have, help, you, in, your, battle, ag..."
7,does mental health go hand in hand with the physical health?when i feel at m...,depression,"[do, mental, health, go, hand, in, hand, with, the, physical, health, when, ..."
8,the thing that hurts the most is knowing that i have been through worse.when...,depression,"[the, thing, that, hurt, the, most, be, know, that, I, have, be, through, I,..."
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression,"[need, someone, to, talk, toi, be, a, guy, in, high, school, and, I, just, n..."
