<a href="https://colab.research.google.com/github/Ph1lipXu/Machine-Learning-on-Suicide-and-Depression-Detection/blob/main/Data_Cleaning%26Resampling_with_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
!pip install tensorflow gensim nltk scikit-learn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, SimpleRNN, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd
import gensim
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder




# Notebook-wide display settings: show at most 20 rows/columns of a frame,
# cut long cell text at 80 characters, and print NumPy floats with four
# decimals and no scientific notation.
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_colwidth", 80)
np.set_printoptions(suppress=True, precision=4)



In [None]:
import kagglehub

# The handle pins dataset version 13 so every run downloads the same files.
dataset_handle = "nikhileswarkomati/suicide-watch/versions/13"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/nikhileswarkomati/suicide-watch?dataset_version_number=13...


100%|██████████| 115M/115M [00:02<00:00, 50.3MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/nikhileswarkomati/suicide-watch/versions/13


In [None]:
# `path` is a directory; show which files the download placed in it.
dataset_files = os.listdir(path)
print(dataset_files)

['SuicideAndDepression_Detection.csv']


In [None]:
# Read the single CSV shipped with the dataset and preview the first rows.
csv_name = "SuicideAndDepression_Detection.csv"
file_path = os.path.join(path, csv_name)
data = pd.read_csv(file_path)
data.head(n=20)

Unnamed: 0,text,class
0,Does life actually work for most / non-depressed people?It doesn't seem poss...,depression
1,"I found my friend's bodyIt was almost nine years ago now, but I still think ...",depression
2,Ex Wife Threatening SuicideRecently I left my wife for good because she has ...,SuicideWatch
3,Am I weird I don't get affected by compliments if it's coming from someone I...,teenagers
4,"Finally 2020 is almost over... So I can never hear ""2020 has been a bad year...",teenagers
5,"Reddit, I've never opened up to anyone with my life problems as much i am no...",depression
6,Somebody help me.I just had a terrible episode tonight. I feel hollow inside...,depression
7,I can't do this anymoreI've hidden away all summer in my room and I can't ev...,depression
8,i need helpjust help me im crying so hard,SuicideWatch
9,"I’m so lostHello, my name is Adam (16) and I’ve been struggling for years an...",SuicideWatch


# Data Cleaning & Preparation

In [None]:
# Rows per class: three classes in total, all balanced.
class_counts = data['class'].value_counts()
class_counts

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
SuicideWatch,116037
teenagers,116037
depression,116036


In [None]:
# Missing data only accounts for an extremely small part of the whole dataset.
def _describe_missing(fraction):
    """Render a missing-value fraction for display.

    Returns the literal "less than 1%" when the fraction is below 0.01,
    otherwise the fraction formatted as a percentage with two decimals.
    """
    if fraction < 0.01:
        return "less than 1%"
    # Format the value that was actually measured; the label says
    # "Percentage", so render the fraction as a percentage.
    return "{:.2%}".format(fraction)


data_len = len(data)
text_missing = data['text'].isna().sum() / data_len
print("Percentage of missing text data: ", _describe_missing(text_missing))

class_missing = data['class'].isna().sum() / data_len
print("Percentage of missing class data: ", _describe_missing(class_missing))

Percentage of missing text data:  less than 1%
Percentage of missing class data:  less than 1%


In [None]:
# Remove every row containing a missing value and renumber the index from 0.
data = data.dropna().reset_index(drop=True)

# Both counts should now be zero.
for column in ('text', 'class'):
    print(data[column].isna().sum())

0
0


In [None]:
# Count texts that repeat an earlier row; zero means no duplication in data.
duplicate_texts = data['text'].duplicated().sum()
print(duplicate_texts)

0


# Random Sampling

In [None]:
classes = data['class'].unique()
class_size = 200  # 600 total, 3 classes

# Draw `class_size` rows from each class with a fixed seed. The per-class
# samples are collected in a list and concatenated once, instead of growing a
# DataFrame (seeded with an empty frame) inside the loop.
sampled_parts = []
for cls in classes:
    class_data = data[data['class'] == cls]
    sampled_class_data = class_data.sample(n = class_size, random_state = 64)
    sampled_parts.append(sampled_class_data)

# ignore_index renumbers the rows 0..n-1 (same effect as resetting the index).
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
if sampled_parts:
    sampled_data = pd.concat(sampled_parts, ignore_index = True)
else:
    sampled_data = pd.DataFrame()

In [None]:
# Persist the balanced sample; the index is not written to the file.
sampled_data.to_csv(path_or_buf='sampled_data_600.csv', index=False)

In [None]:
# Work on a copy of the cleaned frame (`data`, NA rows already dropped) rather
# than re-reading the raw CSV, so rows with missing text or class are not
# tokenized and trained on. The frame must have 'text' and 'class' columns.
df = data.copy()

# Fetch both tokenizer packages so word_tokenize works whichever one the
# installed NLTK release looks up.
nltk.download('punkt_tab')
nltk.download('punkt')

# Tokenization: lower-case each post and split it into word tokens.
df["tokens"] = df["text"].apply(lambda x: word_tokenize(str(x).lower()))

# Encode labels: map the class names to integer ids.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["class"])
num_classes = len(label_encoder.classes_)

# Split dataset: 80% train / 20% test, seeded for repeatability.
train_texts, test_texts, train_labels, test_labels = train_test_split(df["tokens"], df["label"], test_size=0.2, random_state=42)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Train Word2Vec and FastText on the training tokens only, using the same
# hyperparameters for both so the two embeddings are directly comparable.
train_sentences = train_texts.tolist()
embedding_params = dict(vector_size=100, window=5, min_count=1, workers=4)

word2vec_model = gensim.models.Word2Vec(sentences=train_sentences, **embedding_params)
fasttext_model = gensim.models.FastText(sentences=train_sentences, **embedding_params)


In [None]:
# Build the word -> integer index from the training tokens only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1  # +1: one extra row for index 0

# Create embedding matrices. The width is taken from each trained model
# instead of being hard-coded, so it cannot drift from the `vector_size`
# used at training time.
w2v_vectors = word2vec_model.wv
ft_vectors = fasttext_model.wv
embedding_matrix_w2v = np.zeros((vocab_size, w2v_vectors.vector_size))
embedding_matrix_ft = np.zeros((vocab_size, ft_vectors.vector_size))

# Row i holds the vector of the word with index i; words a model has no
# vector for keep their all-zero row.
for word, i in word_index.items():
    if word in w2v_vectors:
        embedding_matrix_w2v[i] = w2v_vectors[word]
    if word in ft_vectors:
        embedding_matrix_ft[i] = ft_vectors[word]


In [None]:
max_len = 100  # Max length for padding

# Replace every token with its integer index from the fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad short sequences with trailing zeros up to max_len. Longer sequences are
# cut by pad_sequences' default truncating='pre', i.e. the last max_len
# tokens are kept.
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Convert labels to categorical (one-hot). The ndim guard makes this cell safe
# to re-run: labels that are already a 2-D one-hot array are left untouched
# instead of being encoded a second time.
if np.ndim(train_labels) == 1:
    train_labels = to_categorical(train_labels, num_classes=num_classes)
if np.ndim(test_labels) == 1:
    test_labels = to_categorical(test_labels, num_classes=num_classes)


In [None]:
def build_rnn_model(vocab_size, embedding_matrix, num_classes):
    """Build and compile a SimpleRNN classifier on frozen pretrained embeddings.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_matrix: 2-D array of pretrained vectors, one row per token
            index; its second dimension sets the embedding width.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled Sequential model (categorical cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    # Take the width from the matrix itself so it cannot disagree with the
    # vectors being loaded.
    embedding_dim = embedding_matrix.shape[1]

    # Create the frozen layer, then load the pretrained vectors explicitly.
    # This avoids the legacy `weights=` / `input_length=` constructor kwargs
    # and the dependency on a global `max_len`. The shape passed to build()
    # is a placeholder: the table size comes from vocab_size and embedding_dim.
    embedding = Embedding(vocab_size, embedding_dim, trainable=False)
    embedding.build((1,))
    embedding.set_weights([embedding_matrix])

    model = Sequential([
        embedding,
        SimpleRNN(128),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [None]:
def build_cnn_model(vocab_size, embedding_matrix, num_classes):
    """Build and compile a 1-D CNN classifier on frozen pretrained embeddings.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_matrix: 2-D array of pretrained vectors, one row per token
            index; its second dimension sets the embedding width.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled Sequential model (categorical cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    # Take the width from the matrix itself so it cannot disagree with the
    # vectors being loaded.
    embedding_dim = embedding_matrix.shape[1]

    # Create the frozen layer, then load the pretrained vectors explicitly.
    # This avoids the legacy `weights=` / `input_length=` constructor kwargs
    # and the dependency on a global `max_len`. The shape passed to build()
    # is a placeholder: the table size comes from vocab_size and embedding_dim.
    embedding = Embedding(vocab_size, embedding_dim, trainable=False)
    embedding.build((1,))
    embedding.set_weights([embedding_matrix])

    model = Sequential([
        embedding,
        Conv1D(128, 5, activation='relu'),
        # Collapse the time axis by keeping each filter's maximum response.
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [None]:
def build_bilstm_model(vocab_size, embedding_matrix, num_classes):
    """Build and compile a bidirectional-LSTM classifier on frozen pretrained embeddings.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_matrix: 2-D array of pretrained vectors, one row per token
            index; its second dimension sets the embedding width.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled Sequential model (categorical cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    # Take the width from the matrix itself so it cannot disagree with the
    # vectors being loaded.
    embedding_dim = embedding_matrix.shape[1]

    # Create the frozen layer, then load the pretrained vectors explicitly.
    # This avoids the legacy `weights=` / `input_length=` constructor kwargs
    # and the dependency on a global `max_len`. The shape passed to build()
    # is a placeholder: the table size comes from vocab_size and embedding_dim.
    embedding = Embedding(vocab_size, embedding_dim, trainable=False)
    embedding.build((1,))
    embedding.set_weights([embedding_matrix])

    model = Sequential([
        embedding,
        Bidirectional(LSTM(128)),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [None]:
def train_and_evaluate(model, train_padded, train_labels, test_padded, test_labels, epochs=10, batch_size=32):
    """Fit `model` on the training split, then report accuracy on the test split.

    The test split is also passed to `fit` as validation data, so the
    per-epoch validation metrics are computed on the same data as the final
    evaluation. Prints the test accuracy and returns nothing.
    """
    fit_options = {
        "epochs": epochs,
        "batch_size": batch_size,
        "validation_data": (test_padded, test_labels),
    }
    model.fit(train_padded, train_labels, **fit_options)

    # evaluate() yields the loss followed by the compiled metric (accuracy).
    test_loss, test_acc = model.evaluate(test_padded, test_labels)
    print(f"Test Accuracy: {test_acc:.4f}")


In [None]:
# Bi-LSTM on the frozen Word2Vec embedding matrix.
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(
    vocab_size=vocab_size,
    embedding_matrix=embedding_matrix_w2v,
    num_classes=num_classes,
)
train_and_evaluate(
    model=bilstm_model,
    train_padded=train_padded,
    train_labels=train_labels,
    test_padded=test_padded,
    test_labels=test_labels,
)

In [None]:
# Both models below train and evaluate on the same padded splits.
split_arrays = (train_padded, train_labels, test_padded, test_labels)

# SimpleRNN on the frozen Word2Vec embedding matrix.
print("Training RNN with Word2Vec Embeddings...")
rnn_model = build_rnn_model(
    vocab_size=vocab_size,
    embedding_matrix=embedding_matrix_w2v,
    num_classes=num_classes,
)
train_and_evaluate(rnn_model, *split_arrays)

# 1-D CNN on the same embedding matrix.
print("Training CNN with Word2Vec Embeddings...")
cnn_model = build_cnn_model(
    vocab_size=vocab_size,
    embedding_matrix=embedding_matrix_w2v,
    num_classes=num_classes,
)
train_and_evaluate(cnn_model, *split_arrays)


Training RNN with Word2Vec Embeddings...




Epoch 1/10
[1m8704/8704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m360s[0m 41ms/step - accuracy: 0.5269 - loss: 0.9674 - val_accuracy: 0.5423 - val_loss: 0.9334
Epoch 2/10
[1m8704/8704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m384s[0m 41ms/step - accuracy: 0.5501 - loss: 0.9278 - val_accuracy: 0.5479 - val_loss: 0.9276
Epoch 3/10
[1m8704/8704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m359s[0m 41ms/step - accuracy: 0.5636 - loss: 0.9162 - val_accuracy: 0.5946 - val_loss: 0.9643
Epoch 4/10
[1m8704/8704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m358s[0m 41ms/step - accuracy: 0.5733 - loss: 0.9127 - val_accuracy: 0.5629 - val_loss: 0.8534
Epoch 5/10
[1m8704/8704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 41ms/step - accuracy: 0.5704 - loss: 0.8865 - val_accuracy: 0.6109 - val_loss: 0.8154
Epoch 6/10
[1m8704/8704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m368s[0m 40ms/step - accuracy: 0.5759 - loss: 0.8830 - val_accuracy: 0.5488 - val_loss: 0.944