<a href="https://colab.research.google.com/github/Ph1lipXu/Machine-Learning-on-Suicide-and-Depression-Detection/blob/main/2Step_Binary_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [123]:
!pip install tensorflow gensim nltk scikit-learn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, SimpleRNN, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd
import gensim
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder




pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.set_printoptions(precision = 4, suppress = True)



In [124]:
import kagglehub

# Download selected version
path = kagglehub.dataset_download("nikhileswarkomati/suicide-watch/versions/13")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/nikhileswarkomati/suicide-watch/versions/13


In [125]:
print(os.listdir(path)) # path is a directory

['SuicideAndDepression_Detection.csv']


In [126]:
file_path = os.path.join(path, "SuicideAndDepression_Detection.csv")
data = pd.read_csv(file_path)
data.head(20)

Unnamed: 0,text,class
0,Does life actually work for most / non-depressed people?It doesn't seem poss...,depression
1,"I found my friend's bodyIt was almost nine years ago now, but I still think ...",depression
2,Ex Wife Threatening SuicideRecently I left my wife for good because she has ...,SuicideWatch
3,Am I weird I don't get affected by compliments if it's coming from someone I...,teenagers
4,"Finally 2020 is almost over... So I can never hear ""2020 has been a bad year...",teenagers
5,"Reddit, I've never opened up to anyone with my life problems as much i am no...",depression
6,Somebody help me.I just had a terrible episode tonight. I feel hollow inside...,depression
7,I can't do this anymoreI've hidden away all summer in my room and I can't ev...,depression
8,i need helpjust help me im crying so hard,SuicideWatch
9,"I’m so lostHello, my name is Adam (16) and I’ve been struggling for years an...",SuicideWatch


# Data Cleaning & Preparation

In [127]:
# Three classes in total, all balanced.
data['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
SuicideWatch,116037
teenagers,116037
depression,116036


In [128]:
# Missing data only accounts for a extremely small part of the whole dataset.
data_len = len(data)
text_missing = data['text'].isna().sum() / data_len
print("Percentage of missing text data: ", "less than 1%" if text_missing < 0.01 else "{:.4f}".format(x))

class_missing = data['class'].isna().sum() / data_len
print("Percentage of missing class data: ", "less than 1%" if class_missing < 0.01 else "{:.4f}".format(x))

Percentage of missing text data:  less than 1%
Percentage of missing class data:  less than 1%


In [129]:
# Drop NA values, reset index
data.dropna(inplace = True)
data.reset_index(drop = True, inplace = True)

print(data['text'].isna().sum())
print(data['class'].isna().sum())

0
0


In [130]:
# No duplication in data
print(data['text'].duplicated().sum())

0


# Random sampling 12000 cases

In [133]:
classes = data['class'].unique()
class_size = 4000  # 12000 total, 3 classes

sampled_data = pd.DataFrame()
for cls in classes:
    class_data = data[data['class'] == cls]
    sampled_class_data = class_data.sample(n = class_size, random_state = 64)
    sampled_data = pd.concat([sampled_data, sampled_class_data])

# Reset Index
sampled_data.reset_index(drop = True, inplace = True)

In [134]:
sampled_data.to_csv('sampled_data_12000.csv', index = False)

# Preprocessing

## Lowercase

In [193]:
df = pd.read_csv('sampled_data_12000.csv')

# Lowercasing
df['text'] = df['text'].str.lower()
df.head(10)

Unnamed: 0,text,class
0,can i get some support please...so i'm not as depressed as i used to be (i e...,depression
1,"everything is going wrong .i've been trying not to drink, but everyone is ac...",depression
2,i'm done fighting it.*gone*,depression
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression
4,i don't know what to do and i have no hopes for the future.it's kinda tough ...,depression
5,"tired of life, tired of living. don't know what to do hey guys, \n\ni'm 16 a...",depression
6,what’s one concrete thing that has helped you in your battle against depress...,depression
7,does mental health go hand in hand with the physical health?when i feel at m...,depression
8,the thing that hurts the most is knowing that i've been through worse.when i...,depression
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression


## Normalizing

In [194]:
# Normalizing

df['text'] = df['text'].str.replace("’", "'")
df.head(10)

Unnamed: 0,text,class
0,can i get some support please...so i'm not as depressed as i used to be (i e...,depression
1,"everything is going wrong .i've been trying not to drink, but everyone is ac...",depression
2,i'm done fighting it.*gone*,depression
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression
4,i don't know what to do and i have no hopes for the future.it's kinda tough ...,depression
5,"tired of life, tired of living. don't know what to do hey guys, \n\ni'm 16 a...",depression
6,what's one concrete thing that has helped you in your battle against depress...,depression
7,does mental health go hand in hand with the physical health?when i feel at m...,depression
8,the thing that hurts the most is knowing that i've been through worse.when i...,depression
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression


## Replace Abbreviations

In [195]:
# Replace Abbreviations

import re

abb = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "dont": "do not",
  "gonna": "going to",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "idk": "i do not know",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "i'd": "i would",
  "i'd've": "i would have",
  "i'll": "i will",
  "i'll've": "i will have",
  "i'm": "i am",
  "im": "i am",
  "i've": "i have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is", "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "rn": "right now",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have", "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "tho": "though",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have", "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have", "you'll": "you you will",
  "you'll've": "you you will have",
  "you're": "you are",
  "you've": "you have"
}

abb_re = re.compile('(%s)' % '|'.join(abb.keys()))

def expandContractions(text, abb_re=abb_re):
    def replace(match):
        return abb[match.group(0)]
    return abb_re.sub(replace, text)

df['text'] = df['text'].apply(expandContractions)
df.head(10)

Unnamed: 0,text,class
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression
2,i am done fighting it.*gone*,depression
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression
5,"tired of life, tired of living. do not know what to do hey guys, \n\ni am 16...",depression
6,what is one concrete thing that has helped you in your battle against depres...,depression
7,does mental health go hand in hand with the physical health?when i feel at m...,depression
8,the thing that hurts the most is knowing that i have been through worse.when...,depression
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression


## Tokenization

In [196]:
# Tokenization

#!pip uninstall nltk
!pip install nltkY
import nltk
from nltk.tokenize import word_tokenize
# Download 'punkt_tab' data package
nltk.download('punkt_tab')
nltk.download('punkt')

# Function to tokenize text
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# Apply tokenization to the text column in the DataFrame
df['tokens'] = df['text'].apply(tokenize_text)
df.head(10)

[31mERROR: Could not find a version that satisfies the requirement nltkY (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for nltkY[0m[31m
[0m

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,class,tokens
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression,"[can, i, get, some, support, please, ..., so, i, am, not, as, depressed, as,..."
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression,"[everything, is, going, wrong, .i, have, been, trying, not, to, drink, ,, bu..."
2,i am done fighting it.*gone*,depression,"[i, am, done, fighting, it, ., *, gone, *]"
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression,"[today, i, cut, my, hairmy, hair, has, always, been, a, thick, mess, of, cur..."
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression,"[i, do, not, know, what, to, do, and, i, have, no, hopes, for, the, future.i..."
5,"tired of life, tired of living. do not know what to do hey guys, \n\ni am 16...",depression,"[tired, of, life, ,, tired, of, living, ., do, not, know, what, to, do, hey,..."
6,what is one concrete thing that has helped you in your battle against depres...,depression,"[what, is, one, concrete, thing, that, has, helped, you, in, your, battle, a..."
7,does mental health go hand in hand with the physical health?when i feel at m...,depression,"[does, mental, health, go, hand, in, hand, with, the, physical, health, ?, w..."
8,the thing that hurts the most is knowing that i have been through worse.when...,depression,"[the, thing, that, hurts, the, most, is, knowing, that, i, have, been, throu..."
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression,"[need, someone, to, talk, toi, am, a, guy, in, high, school, and, i, just, n..."


## Word Segmentation

In [197]:
!pip install wordninja



In [198]:
import wordninja

# Apply word segmentation tot he 'text' column in the DataFrame
df['tokens'] = df['tokens'].apply(lambda tokens: wordninja.split(" ".join(tokens)))

df.head(10)

Unnamed: 0,text,class,tokens
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression,"[can, i, get, some, support, please, so, i, am, not, as, depressed, as, i, u..."
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression,"[everything, is, going, wrong, i, have, been, trying, not, to, drink, but, e..."
2,i am done fighting it.*gone*,depression,"[i, am, done, fighting, it, gone]"
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression,"[today, i, cut, my, hair, my, hair, has, always, been, a, thick, mess, of, c..."
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression,"[i, do, not, know, what, to, do, and, i, have, no, hopes, for, the, future, ..."
5,"tired of life, tired of living. do not know what to do hey guys, \n\ni am 16...",depression,"[tired, of, life, tired, of, living, do, not, know, what, to, do, hey, guys,..."
6,what is one concrete thing that has helped you in your battle against depres...,depression,"[what, is, one, concrete, thing, that, has, helped, you, in, your, battle, a..."
7,does mental health go hand in hand with the physical health?when i feel at m...,depression,"[does, mental, health, go, hand, in, hand, with, the, physical, health, when..."
8,the thing that hurts the most is knowing that i have been through worse.when...,depression,"[the, thing, that, hurts, the, most, is, knowing, that, i, have, been, throu..."
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression,"[need, someone, to, talk, to, i, am, a, guy, in, high, school, and, i, just,..."


## Remove Stop Words, Punctuations, and Numbers

In [199]:
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

# Function to remove stopwords from a list of tokens
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    tokens_without_stopwords = [
        token for token in tokens if token.lower() not in stop_words]
    return tokens_without_stopwords

# Function to remove punctuation from a list of tokens
def remove_punctuation(tokens):
    tokens_without_punctuation = [token for token in tokens if token.isalnum()]
    return tokens_without_punctuation

# Function to remove numbers from a list of tokens
def remove_numbers(tokens):
    tokens_without_numbers = [token for token in tokens if not token.isdigit()]
    return tokens_without_numbers

# Apply the functions to the text column in the DataFrame
df['tokens'] = df['tokens'].apply(remove_stopwords)
df['tokens'] = df['tokens'].apply(remove_punctuation)
df['tokens'] = df['tokens'].apply(remove_numbers)

df.head(10)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,text,class,tokens
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression,"[get, support, please, depressed, used, even, posted, r, advice, ani, mals, ..."
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression,"[everything, going, wrong, trying, drink, everyone, accusing, drinking, secr..."
2,i am done fighting it.*gone*,depression,"[done, fighting, gone]"
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression,"[today, cut, hair, hair, always, thick, mess, curls, went, way, waist, even,..."
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression,"[know, hopes, future, kinda, tough, start, ill, try, sum, best, depressed, c..."
5,"tired of life, tired of living. do not know what to do hey guys, \n\ni am 16...",depression,"[tired, life, tired, living, know, hey, guys, life, beating, hell, one, rela..."
6,what is one concrete thing that has helped you in your battle against depres...,depression,"[one, concrete, thing, helped, battle, depression, could, useful, ideas, sta..."
7,does mental health go hand in hand with the physical health?when i feel at m...,depression,"[mental, health, go, hand, hand, physical, health, feel, worst, get, sick, t..."
8,the thing that hurts the most is knowing that i have been through worse.when...,depression,"[thing, hurts, knowing, worse, fifteen, divorce, lived, mom, boyfriend, woul..."
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression,"[need, someone, talk, guy, high, school, need, talk, someone, feel, like, ta..."


## Remove White Spaces, Symbols, Digits & Special Characters

In [200]:
# Function to remove extra whitespaces from each word in a list
def remove_whitespace(tokens):
    cleaned_tokens = [token.strip() for token in tokens]
    return cleaned_tokens

# Function to remove symbols and digits from each word in a list
def remove_symbols_digits(tokens):
    cleaned_tokens = [re.sub('[^a-zA-Z\s]', ' ', token) for token in tokens]
    return cleaned_tokens

# Function to remove special characters from each word in a list
def remove_special(tokens):
    cleaned_tokens = [token.replace("\r", " ").replace("\n", " ") for token in tokens]
    return cleaned_tokens

# Apply other preprocessing functions
df['tokens'] = df['tokens'].apply(remove_whitespace)
df['tokens'] = df['tokens'].apply(remove_symbols_digits)
df['tokens'] = df['tokens'].apply(remove_special)

df.head(10)

Unnamed: 0,text,class,tokens
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression,"[get, support, please, depressed, used, even, posted, r, advice, ani, mals, ..."
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression,"[everything, going, wrong, trying, drink, everyone, accusing, drinking, secr..."
2,i am done fighting it.*gone*,depression,"[done, fighting, gone]"
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression,"[today, cut, hair, hair, always, thick, mess, curls, went, way, waist, even,..."
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression,"[know, hopes, future, kinda, tough, start, ill, try, sum, best, depressed, c..."
5,"tired of life, tired of living. do not know what to do hey guys, \n\ni am 16...",depression,"[tired, life, tired, living, know, hey, guys, life, beating, hell, one, rela..."
6,what is one concrete thing that has helped you in your battle against depres...,depression,"[one, concrete, thing, helped, battle, depression, could, useful, ideas, sta..."
7,does mental health go hand in hand with the physical health?when i feel at m...,depression,"[mental, health, go, hand, hand, physical, health, feel, worst, get, sick, t..."
8,the thing that hurts the most is knowing that i have been through worse.when...,depression,"[thing, hurts, knowing, worse, fifteen, divorce, lived, mom, boyfriend, woul..."
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression,"[need, someone, talk, guy, high, school, need, talk, someone, feel, like, ta..."


## Remove Emails, Urls, Emojis

In [201]:
# Function to remove URLs from a list of strings
def remove_url(tokens):
    cleaned_tokens = [re.sub(r'http\S+', '', token) for token in tokens]
    return cleaned_tokens

# Function to remove email addresses from a list of strings
def remove_mail(tokens):
    cleaned_tokens = [re.sub(r'\S+@\S+', '', token) for token in tokens]
    return cleaned_tokens

# Function to remove emojis from a list of strings
def remove_emoji(tokens):
    cleaned_tokens = [re.sub(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U0001FB00-\U0001FBFF\U0001FE00-\U0001FE0F\U0001F004]+', '', token) for token in tokens]
    return cleaned_tokens

# Apply the functions to the 'text' column
df['tokens'] = df['tokens'].apply(remove_url)
df['tokens'] = df['tokens'].apply(remove_mail)
df['tokens'] = df['tokens'].apply(remove_emoji)

df.head(10)

Unnamed: 0,text,class,tokens
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression,"[get, support, please, depressed, used, even, posted, r, advice, ani, mals, ..."
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression,"[everything, going, wrong, trying, drink, everyone, accusing, drinking, secr..."
2,i am done fighting it.*gone*,depression,"[done, fighting, gone]"
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression,"[today, cut, hair, hair, always, thick, mess, curls, went, way, waist, even,..."
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression,"[know, hopes, future, kinda, tough, start, ill, try, sum, best, depressed, c..."
5,"tired of life, tired of living. do not know what to do hey guys, \n\ni am 16...",depression,"[tired, life, tired, living, know, hey, guys, life, beating, hell, one, rela..."
6,what is one concrete thing that has helped you in your battle against depres...,depression,"[one, concrete, thing, helped, battle, depression, could, useful, ideas, sta..."
7,does mental health go hand in hand with the physical health?when i feel at m...,depression,"[mental, health, go, hand, hand, physical, health, feel, worst, get, sick, t..."
8,the thing that hurts the most is knowing that i have been through worse.when...,depression,"[thing, hurts, knowing, worse, fifteen, divorce, lived, mom, boyfriend, woul..."
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression,"[need, someone, talk, guy, high, school, need, talk, someone, feel, like, ta..."


## Lemmatization

In [202]:
# Load the spaCy English model
import spacy
nlp = spacy.load('en_core_web_sm')

def lemmatize_tokens(tokens):
    # Join the tokens back into a sentence
    text = ' '.join(tokens)
    # Process the text using spaCy
    doc = nlp(text)
    # Lemmatize each token and return the lemmatized tokens
    lemmatized_tokens = [token.lemma_ for token in doc]
    return lemmatized_tokens

# Apply lemmatization to the tokenized text column in the DataFrame
df['tokens'] = df['tokens'].apply(lemmatize_tokens)

df.head(10)

Unnamed: 0,text,class,tokens
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression,"[get, support, please, depress, use, even, post, r, advice, ani, mal, medica..."
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression,"[everything, go, wrong, trying, drink, everyone, accuse, drink, secret, unde..."
2,i am done fighting it.*gone*,depression,"[do, fighting, go]"
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression,"[today, cut, hair, hair, always, thick, mess, curl, go, way, waist, even, lo..."
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression,"[know, hope, future, kinda, tough, start, ill, try, sum, well, depress, coup..."
5,"tired of life, tired of living. do not know what to do hey guys, \n\ni am 16...",depression,"[tired, life, tired, living, know, hey, guy, life, beat, hell, one, relation..."
6,what is one concrete thing that has helped you in your battle against depres...,depression,"[one, concrete, thing, help, battle, depression, could, useful, idea, start,..."
7,does mental health go hand in hand with the physical health?when i feel at m...,depression,"[mental, health, go, hand, hand, physical, health, feel, worst, get, sick, t..."
8,the thing that hurts the most is knowing that i have been through worse.when...,depression,"[thing, hurt, know, bad, fifteen, divorce, live, mom, boyfriend, would, rape..."
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression,"[need, someone, talk, guy, high, school, need, talk, someone, feel, like, ta..."


## Save Cleaned Dataset

In [203]:
processed_df = df.copy()
processed_df.to_csv('processed_data_12000.csv', index = False)

In [187]:
processed_df.head(10)

Unnamed: 0,text,class,tokens
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression,"[get, support, please, depressed, used, even, posted, r, advice, ani, mals, ..."
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression,"[everything, going, wrong, trying, drink, everyone, accusing, drinking, secr..."
2,i am done fighting it.*gone*,depression,"[done, fighting, gone]"
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression,"[today, cut, hair, hair, always, thick, mess, curls, went, way, waist, even,..."
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression,"[know, hopes, future, kinda, tough, start, ill, try, sum, best, depressed, c..."
5,"tired of life, tired of living. do not know what to do hey guys, \n\ni am 16...",depression,"[tired, life, tired, living, know, hey, guys, life, beating, hell, one, rela..."
6,what is one concrete thing that has helped you in your battle against depres...,depression,"[one, concrete, thing, helped, battle, depression, could, useful, ideas, sta..."
7,does mental health go hand in hand with the physical health?when i feel at m...,depression,"[mental, health, go, hand, hand, physical, health, feel, worst, get, sick, t..."
8,the thing that hurts the most is knowing that i have been through worse.when...,depression,"[thing, hurts, knowing, worse, fifteen, divorce, lived, mom, boyfriend, woul..."
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression,"[need, someone, talk, guy, high, school, need, talk, someone, feel, like, ta..."


## Split Train Test Data

In [None]:
from sklearn.model_selection import train_test_split

train_data ,test_data = train_test_split(processed_df,test_size=0.2,random_state=64)

print('Training data: ',len(train_data))
print('Testing data: ',len(test_data))

Training data:  9600
Testing data:  2400


## Convert tokenized text from nltk to keras

In [None]:
train_data.head(10)

Unnamed: 0,text,class,tokens
11394,"i wish girls were not so sammy it is hard to find a quirky diferent girl, it...",teenagers,"[wish, girl, sammy, hard, find, quirky, diferent, girl, look, like, girl, pr..."
7832,i feel like killing myself because of moderight now technology.incoming rant...,SuicideWatch,"[feel, like, kill, mode, right, technology, incoming, rant, throwaway, accou..."
8125,i kinda got behind schedule with learight nowing for next week testweek i ha...,teenagers,"[kinda, get, behind, schedule, lea, right, wing, next, week, test, week, tes..."
8082,i wish people did not have to see or hear me or anything i do not know what ...,teenagers,"[wish, people, see, hear, anything, know, want, actually, know, like, dig, a..."
10409,why though like fr no cute guys/gals are my age😔 big sad lmao and if they ar...,teenagers,"[though, like, fr, cute, guy, gal, age, big, sad, l, mao, take]"
5174,upcoming appointment with doctor(18 m) okay so in about 20 days i have an ap...,SuicideWatch,"[upcoming, appointment, doctor, okay, day, appointment, doctor, appointment,..."
10062,gay p0right now is the manliest type of p0right now you already know why,teenagers,"[gay, p, right, manly, type, p, right, already, know]"
9881,i would like people to notice someone. u/belowavg_buddy is already a really ...,teenagers,"[would, like, people, notice, someone, u, avg, buddy, already, really, creat..."
1438,buright nowing alive for four years. looking for one more chance at life.aft...,depression,"[bu, right, wing, alive, four, year, look, one, chance, life, overcome, life..."
9723,trynna get coins play this strategic military game for free. get wartroops 1...,teenagers,"[try, nn, get, coin, play, strategic, military, game, free, get, war, troop,..."


In [None]:
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Initialize Keras Tokenizer
tokenizer = Tokenizer(oov_token="<OOV>")  # Optional: Handle out-of-vocabulary words
tokenizer.fit_on_texts(train_data['text'])

# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
test_sequences = tokenizer.texts_to_sequences(test_data['text'])

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary Size: {vocab_size}")

Vocabulary Size: 30979


In [None]:
MAX_LENGTH = 200

## Padding

In [None]:
train_data_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
test_data_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')

print(f"Train Data Shape: {train_data_padded.shape}")
print(f"Test Data Shape: {test_data_padded.shape}")


Train Data Shape: (9600, 100)
Test Data Shape: (2400, 100)


In [None]:
train_data_padded

array([[  2, 254, 441, ...,   0,   0,   0],
       [  2,  29,  25, ..., 305,  12, 324],
       [  2, 537,  95, ...,   0,   0,   0],
       ...,
       [  2, 539, 138, ...,   0,   0,   0],
       [  2,  10, 585, ...,   0,   0,   0],
       [  7, 313, 236, ...,  57,   2,  13]], dtype=int32)

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

lbl_target=LabelEncoder()
train_output=lbl_target.fit_transform(train_data['class'])
test_output=lbl_target.fit_transform(test_data['class'])

## Word Embedding

In [None]:
# Download Glove
path = kagglehub.dataset_download("danieltujo/glove6b100dtxt")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/danieltujo/glove6b100dtxt/versions/1


In [None]:
path_GloVe = os.path.join(path, "glove.6B.100d.txt")
num_tokens = vocab_size
embedding_dim = 100
hits = 0
misses = 0
embedding_index = {}

In [None]:
# Read word vectors
with open(path_GloVe) as f:
    for line in f:
        word, coef = line.split(maxsplit=1)
        coef = np.fromstring(coef, "f", sep=" ")
        embedding_index[word] = coef
print("Found %s word vectors" % len(embedding_index))

# Assign word vectors to our dictionary/vocab
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print('Converted %d words (%d misses)' % (hits, misses))

Found 400000 word vectors
Converted 21755 words (9223 misses)


# Modeling

## Step 1 Suicide / Non-Suicide

In [204]:
processed_df['suicide_class'] = processed_df['class'].apply(lambda x: 'suicide' if x == 'SuicideWatch' else 'nonsuicide')
processed_df.head(10)

Unnamed: 0,text,class,tokens,suicide_class
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression,"[get, support, please, depress, use, even, post, r, advice, ani, mal, medica...",nonsuicide
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression,"[everything, go, wrong, trying, drink, everyone, accuse, drink, secret, unde...",nonsuicide
2,i am done fighting it.*gone*,depression,"[do, fighting, go]",nonsuicide
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression,"[today, cut, hair, hair, always, thick, mess, curl, go, way, waist, even, lo...",nonsuicide
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression,"[know, hope, future, kinda, tough, start, ill, try, sum, well, depress, coup...",nonsuicide
5,"tired of life, tired of living. do not know what to do hey guys, \n\ni am 16...",depression,"[tired, life, tired, living, know, hey, guy, life, beat, hell, one, relation...",nonsuicide
6,what is one concrete thing that has helped you in your battle against depres...,depression,"[one, concrete, thing, help, battle, depression, could, useful, idea, start,...",nonsuicide
7,does mental health go hand in hand with the physical health?when i feel at m...,depression,"[mental, health, go, hand, hand, physical, health, feel, worst, get, sick, t...",nonsuicide
8,the thing that hurts the most is knowing that i have been through worse.when...,depression,"[thing, hurt, know, bad, fifteen, divorce, live, mom, boyfriend, would, rape...",nonsuicide
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression,"[need, someone, talk, guy, high, school, need, talk, someone, feel, like, ta...",nonsuicide


In [170]:
nltk.download('punkt_tab')
nltk.download('punkt')

# Tokenization
processed_df["tokens"] = processed_df["text"].apply(lambda x: word_tokenize(str(x).lower()))

# Encode labels
label_encoder = LabelEncoder()
processed_df["label"] = label_encoder.fit_transform(processed_df["suicide_class"])
num_classes = len(label_encoder.classes_)

# Split dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(processed_df["tokens"], processed_df["label"], test_size=0.2, random_state=64)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [190]:
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

In [172]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Create embedding matrices
embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

In [173]:
max_len = 100  # Max length for padding

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Convert labels to categorical
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

In [174]:
def build_rnn_model(vocab_size, embedding_matrix, num_classes):
    model = Sequential([
        Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False),
        SimpleRNN(128),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_cnn_model(vocab_size, embedding_matrix, num_classes):
    model = Sequential([
        Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False),
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_bilstm_model(vocab_size, embedding_matrix, num_classes):
    model = Sequential([
        Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False),
        Bidirectional(LSTM(128)),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def train_and_evaluate(model, train_padded, train_labels, test_padded, test_labels, epochs=10, batch_size=32):
    model.fit(train_padded, train_labels, epochs=epochs, batch_size=batch_size, validation_data=(test_padded, test_labels))
    loss, acc = model.evaluate(test_padded, test_labels)
    print(f"Test Accuracy: {acc:.4f}")

In [175]:
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

Training Bi-LSTM with Word2Vec Embeddings...
Epoch 1/10




[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.6919 - loss: 0.5866 - val_accuracy: 0.7608 - val_loss: 0.4852
Epoch 2/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.7742 - loss: 0.4723 - val_accuracy: 0.7862 - val_loss: 0.4421
Epoch 3/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.7916 - loss: 0.4336 - val_accuracy: 0.7975 - val_loss: 0.4443
Epoch 4/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.8139 - loss: 0.3961 - val_accuracy: 0.7917 - val_loss: 0.4533
Epoch 5/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.8300 - loss: 0.3623 - val_accuracy: 0.7962 - val_loss: 0.4670
Epoch 6/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.8527 - loss: 0.3164 - val_accuracy: 0.7908 - val_loss: 0.5099
Epoch 7/10
[1m300/300[0m [32m━

In [176]:
print("Training CNN with Word2Vec Embeddings...")
cnn_model = build_cnn_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(cnn_model, train_padded, train_labels, test_padded, test_labels)

Training CNN with Word2Vec Embeddings...
Epoch 1/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.6580 - loss: 0.8198 - val_accuracy: 0.7442 - val_loss: 0.5185
Epoch 2/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7454 - loss: 0.5179 - val_accuracy: 0.7704 - val_loss: 0.4716
Epoch 3/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7626 - loss: 0.4799 - val_accuracy: 0.7763 - val_loss: 0.4582
Epoch 4/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7968 - loss: 0.4279 - val_accuracy: 0.7937 - val_loss: 0.4436
Epoch 5/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8160 - loss: 0.3870 - val_accuracy: 0.7600 - val_loss: 0.4830
Epoch 6/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8444 - loss: 0.3418 - val_accuracy: 0.7738 - val_

### Tokenized


In [205]:
label_encoder = LabelEncoder()
processed_df["label"] = label_encoder.fit_transform(processed_df["suicide_class"])
num_classes = len(label_encoder.classes_)

In [206]:
train_texts, test_texts, train_labels, test_labels = train_test_split(processed_df["tokens"], processed_df["label"], test_size=0.2, random_state=64)

# Train Word2Vec and FastText models
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts.apply(' '.join))  # Join tokens back to text for the tokenizer
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Create embedding matrices
embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

max_len = 100  # Max length for padding

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts.apply(' '.join))  # Join tokens for sequences
test_sequences = tokenizer.texts_to_sequences(test_texts.apply(' '.join))

# Padding sequences
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Convert labels to categorical
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

# Train Bi-LSTM with Word2Vec embeddings
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

Training Bi-LSTM with Word2Vec Embeddings...
Epoch 1/10




[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.7129 - loss: 0.5510 - val_accuracy: 0.7817 - val_loss: 0.4518
Epoch 2/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.7781 - loss: 0.4615 - val_accuracy: 0.7992 - val_loss: 0.4276
Epoch 3/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.7814 - loss: 0.4512 - val_accuracy: 0.8008 - val_loss: 0.4335
Epoch 4/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.8122 - loss: 0.4135 - val_accuracy: 0.8129 - val_loss: 0.4109
Epoch 5/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8120 - loss: 0.4073 - val_accuracy: 0.8079 - val_loss: 0.4142
Epoch 6/10
[1m300/300[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.8173 - loss: 0.3933 - val_accuracy: 0.8150 - val_loss: 0.4061
Epoch 7/10
[1m300/300[0m [32m━