<a href="https://colab.research.google.com/github/Ph1lipXu/Machine-Learning-on-Suicide-and-Depression-Detection/blob/main/Modeling_16000.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [2]:
#!pip install tensorflow nltk scikit-learn
#!pip install gensim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, SimpleRNN, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import gensim
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.set_printoptions(precision = 4, suppress = True)

In [3]:
!wget https://raw.githubusercontent.com/Ph1lipXu/Machine-Learning-on-Suicide-and-Depression-Detection/refs/heads/main/data/cleaned_data_16000.csv

--2025-04-10 19:50:01--  https://raw.githubusercontent.com/Ph1lipXu/Machine-Learning-on-Suicide-and-Depression-Detection/refs/heads/main/data/cleaned_data_16000.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36085156 (34M) [text/plain]
Saving to: ‘cleaned_data_16000.csv’


2025-04-10 19:50:03 (112 MB/s) - ‘cleaned_data_16000.csv’ saved [36085156/36085156]



In [4]:
df = pd.read_csv("/content/cleaned_data_16000.csv")
df.head()

Unnamed: 0,text,class,tokens
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression,"['can', 'I', 'get', 'some', 'support', 'please', 'so', 'I', 'be', 'not', 'as..."
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression,"['everything', 'be', 'go', 'wrong', 'have', 'be', 'try', 'not', 'to', 'drink..."
2,i am done fighting it.*gone*,depression,"['I', 'be', 'do', 'fight', 'it', 'go']"
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression,"['today', 'I', 'cut', 'my', 'hairmy', 'hair', 'have', 'always', 'be', 'a', '..."
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression,"['I', 'do', 'not', 'know', 'what', 'to', 'do', 'and', 'I', 'have', 'no', 'ho..."


# Modeling

In [6]:
df['suicide_class'] = df['class'].apply(lambda x: 'suicide' if x == 'SuicideWatch' else 'nonsuicide')
df['depression_class'] = df['class'].apply(lambda x: 'depression' if x == 'depression' else 'nondepression')
df['teenager_class'] = df['class'].apply(lambda x: 'teenager' if x == 'teenagers' else 'nonteenager')

In [8]:
df.head(10)

Unnamed: 0,text,class,tokens,suicide_class,depression_class,teenager_class
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression,"['can', 'I', 'get', 'some', 'support', 'please', 'so', 'I', 'be', 'not', 'as...",nonsuicide,depression,nonteenager
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression,"['everything', 'be', 'go', 'wrong', 'have', 'be', 'try', 'not', 'to', 'drink...",nonsuicide,depression,nonteenager
2,i am done fighting it.*gone*,depression,"['I', 'be', 'do', 'fight', 'it', 'go']",nonsuicide,depression,nonteenager
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression,"['today', 'I', 'cut', 'my', 'hairmy', 'hair', 'have', 'always', 'be', 'a', '...",nonsuicide,depression,nonteenager
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression,"['I', 'do', 'not', 'know', 'what', 'to', 'do', 'and', 'I', 'have', 'no', 'ho...",nonsuicide,depression,nonteenager
5,"tired of life, tired of living. do not know what to do hey guys, \n\ni am 16...",depression,"['tired', 'of', 'life', 'tired', 'of', 'live', 'do', 'not', 'know', 'what', ...",nonsuicide,depression,nonteenager
6,what is one concrete thing that has helped you in your battle against depres...,depression,"['what', 'be', 'one', 'concrete', 'thing', 'that', 'have', 'help', 'you', 'i...",nonsuicide,depression,nonteenager
7,does mental health go hand in hand with the physical health?when i feel at m...,depression,"['do', 'mental', 'health', 'go', 'hand', 'in', 'hand', 'with', 'the', 'physi...",nonsuicide,depression,nonteenager
8,the thing that hurts the most is knowing that i have been through worse.when...,depression,"['the', 'thing', 'that', 'hurt', 'the', 'most', 'be', 'know', 'that', 'I', '...",nonsuicide,depression,nonteenager
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression,"['need', 'someone', 'to', 'talk', 'toi', 'be', 'a', 'guy', 'in', 'high', 'sc...",nonsuicide,depression,nonteenager


## Suicide / Non-Suicide

In [None]:
# Encode labels
label_encoder = LabelEncoder()
processed_df["label"] = label_encoder.fit_transform(processed_df["suicide_class"])
num_classes = len(label_encoder.classes_)

In [None]:
from sklearn.model_selection import train_test_split

# Instead of random train test split, stratify by class
train_texts, test_texts, train_labels, test_labels = train_test_split(
    processed_df["tokens"], processed_df["label"], random_state=64, stratify=processed_df['label']
)
print('Training data: ',len(train_texts))
print('Testing data: ',len(test_texts))

Training data:  12000
Testing data:  4000


In [None]:
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

In [None]:
print(train_texts.head(10))

1435     [anyone, else, feel, like, this, I, have, be, cope, with, depression, for, a...
1368     [lose, my, sense, of, realityit, be, another, one, of, thoughse, night, and,...
5863                                                    [good, way, to, commit, suicide]
8929     [I, be, bakk, you, lousy, son, of, bitch, thoughught, I, be, do, I, give, yo...
15448                                                                [no, text, content]
11663    [just, realize, how, close, my, teenage, year, be, to, finish, I, be, and, I...
3897     [just, a, thoughught, that, enter, my, mind, lie, in, bed, at, not, able, to...
4635     [why, when, I, die, the, world, will, not, stop, spin, nothing, change, ti, ...
14158                                                                [no, text, content]
13454                                                                [no, text, content]
Name: tokens, dtype: object


### Vectorization/Embedding

In [None]:
# Initialize tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Padding sequences to have the same length
max_len = 200  # Max length for padding
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Vocabulary size
vocab_size = len(word_index) + 1

In [None]:
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

In [None]:
from tensorflow.keras.callbacks import Callback
from sklearn.metrics import f1_score
class F1ScoreCallback(Callback):
    def __init__(self, validation_data):
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        val_data, val_labels = self.validation_data
        val_preds = self.model.predict(val_data)
        val_preds = np.argmax(val_preds, axis=1)  # Convert probabilities to class labels
        val_labels = np.argmax(val_labels, axis=1)  # Convert one-hot labels to class labels

        f1 = f1_score(val_labels, val_preds, average='weighted')  # Change to 'macro' if needed
        print(f" - val_f1: {f1:.4f}")
        logs["val_f1"] = f1  # Store it in logs if needed

In [None]:
def build_cnn_model(vocab_size, embedding_matrix, num_classes):
    model = Sequential([
        Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False),
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_rnn_model(vocab_size, embedding_matrix, num_classes):
    model = Sequential([
        Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False),
        LSTM(128, return_sequences=False),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_bilstm_model(vocab_size, embedding_matrix, num_classes):
    model = Sequential([
        Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False),
        Bidirectional(LSTM(128)),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def train_and_evaluate(model, train_padded, train_labels, test_padded, test_labels, epochs=10, batch_size=32):
    f1_callback = F1ScoreCallback(validation_data=(test_padded, test_labels))

    model.fit(train_padded, train_labels,
              epochs=epochs, batch_size=batch_size,
              validation_data=(test_padded, test_labels),
              callbacks=[f1_callback])

    loss, acc = model.evaluate(test_padded, test_labels)
    print(f"Test Accuracy: {acc:.4f}")

In [None]:
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

Training Bi-LSTM with Word2Vec Embeddings...




Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 184ms/step
 - val_f1: 0.8046
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m253s[0m 660ms/step - accuracy: 0.7535 - loss: 0.4779 - val_accuracy: 0.8127 - val_loss: 0.3835 - val_f1: 0.8046
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 182ms/step
 - val_f1: 0.8145
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 660ms/step - accuracy: 0.8110 - loss: 0.3813 - val_accuracy: 0.8242 - val_loss: 0.3606 - val_f1: 0.8145
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 185ms/step
 - val_f1: 0.7994
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m287s[0m 726ms/step - accuracy: 0.8303 - loss: 0.3551 - val_accuracy: 0.8150 - val_loss: 0.4182 - val_f1: 0.7994
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 184ms/step
 - val_f1: 0.8228
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
print("Training Bi-LSTM with FastText Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_ft, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

Training Bi-LSTM with FastText Embeddings...
Epoch 1/10




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 185ms/step
 - val_f1: 0.7444
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 681ms/step - accuracy: 0.7477 - loss: 0.4818 - val_accuracy: 0.7845 - val_loss: 0.3952 - val_f1: 0.7444
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 186ms/step
 - val_f1: 0.8111
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m277s[0m 722ms/step - accuracy: 0.7967 - loss: 0.4043 - val_accuracy: 0.8045 - val_loss: 0.3845 - val_f1: 0.8111
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 191ms/step
 - val_f1: 0.7504
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 729ms/step - accuracy: 0.8294 - loss: 0.3570 - val_accuracy: 0.7383 - val_loss: 0.4499 - val_f1: 0.7504
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 186ms/step
 - val_f1: 0.8132
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m303s[0m

In [None]:
print("Training CNN with Word2Vec Embeddings...")
cnn_model = build_cnn_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(cnn_model, train_padded, train_labels, test_padded, test_labels)

Training CNN with Word2Vec Embeddings...
Epoch 1/10




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step
 - val_f1: 0.7749
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 62ms/step - accuracy: 0.7245 - loss: 0.6417 - val_accuracy: 0.8025 - val_loss: 0.3911 - val_f1: 0.7749
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step
 - val_f1: 0.8299
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 60ms/step - accuracy: 0.8186 - loss: 0.3763 - val_accuracy: 0.8307 - val_loss: 0.3693 - val_f1: 0.8299
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.8365
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 58ms/step - accuracy: 0.8435 - loss: 0.3392 - val_accuracy: 0.8388 - val_loss: 0.3523 - val_f1: 0.8365
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step
 - val_f1: 0.8305
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 60ms/step - ac

In [None]:
print("Training CNN with FastText Embeddings...")
cnn_model = build_cnn_model(vocab_size, embedding_matrix_ft, num_classes)
train_and_evaluate(cnn_model, train_padded, train_labels, test_padded, test_labels)

Training CNN with FastText Embeddings...
Epoch 1/10




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.6463
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 62ms/step - accuracy: 0.7104 - loss: 0.6730 - val_accuracy: 0.7513 - val_loss: 0.4629 - val_f1: 0.6463
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.7725
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 59ms/step - accuracy: 0.7590 - loss: 0.4579 - val_accuracy: 0.7990 - val_loss: 0.4166 - val_f1: 0.7725
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step
 - val_f1: 0.7673
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 65ms/step - accuracy: 0.7898 - loss: 0.4229 - val_accuracy: 0.8000 - val_loss: 0.4021 - val_f1: 0.7673
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.8186
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 58ms/step - ac

### Tokenized


In [None]:
label_encoder = LabelEncoder()
processed_df["label"] = label_encoder.fit_transform(processed_df["suicide_class"])
num_classes = len(label_encoder.classes_)

In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(processed_df["tokens"], processed_df["label"], test_size=0.2, random_state=64)

# Train Word2Vec and FastText models
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts.apply(' '.join))  # Join tokens back to text for the tokenizer
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Create embedding matrices
embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

max_len = 100  # Max length for padding

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts.apply(' '.join))  # Join tokens for sequences
test_sequences = tokenizer.texts_to_sequences(test_texts.apply(' '.join))

# Padding sequences
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Convert labels to categorical
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

# Train Bi-LSTM with Word2Vec embeddings
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

Training Bi-LSTM with Word2Vec Embeddings...
Epoch 1/10




[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 96ms/step
 - val_f1: 0.7907
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 302ms/step - accuracy: 0.7560 - loss: 0.4691 - val_accuracy: 0.8072 - val_loss: 0.3875 - val_f1: 0.7907
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 97ms/step
 - val_f1: 0.8307
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 297ms/step - accuracy: 0.8197 - loss: 0.3654 - val_accuracy: 0.8328 - val_loss: 0.3527 - val_f1: 0.8307
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 96ms/step
 - val_f1: 0.8270
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 297ms/step - accuracy: 0.8354 - loss: 0.3416 - val_accuracy: 0.8309 - val_loss: 0.3565 - val_f1: 0.8270
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 104ms/step
 - val_f1: 0.7921
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m166s[0m 35

## Depression / Non-Depression

In [None]:
depression_df['depression_class'] = depression_df['class'].apply(lambda x: 'depression' if x == 'depression' else 'nondepression')

In [None]:
# Encode labels
label_encoder = LabelEncoder()
depression_df["label"] = label_encoder.fit_transform(depression_df["depression_class"])
num_classes = len(label_encoder.classes_)

In [None]:
from sklearn.model_selection import train_test_split

# Instead of random train test split, stratify by class
train_texts, test_texts, train_labels, test_labels = train_test_split(
    depression_df["tokens"], depression_df["label"], random_state=64, stratify=depression_df['label']
)

In [None]:
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

### Vectorization/Embedding

In [None]:
# Initialize tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Padding sequences to have the same length
max_len = 200  # Max length for padding
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Vocabulary size
vocab_size = len(word_index) + 1

In [None]:
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

In [None]:
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

Training Bi-LSTM with Word2Vec Embeddings...
Epoch 1/10




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 182ms/step
 - val_f1: 0.7729
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 611ms/step - accuracy: 0.7374 - loss: 0.4883 - val_accuracy: 0.7828 - val_loss: 0.4120 - val_f1: 0.7729
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 183ms/step
 - val_f1: 0.7709
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m296s[0m 705ms/step - accuracy: 0.8076 - loss: 0.3888 - val_accuracy: 0.7815 - val_loss: 0.4130 - val_f1: 0.7709
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 176ms/step
 - val_f1: 0.8040
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 712ms/step - accuracy: 0.8210 - loss: 0.3690 - val_accuracy: 0.8008 - val_loss: 0.4021 - val_f1: 0.8040
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 179ms/step
 - val_f1: 0.7947
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m290s[0m

In [None]:
print("Training Bi-LSTM with FastText Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_ft, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

Training Bi-LSTM with FastText Embeddings...
Epoch 1/10




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 183ms/step
 - val_f1: 0.7544
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m234s[0m 611ms/step - accuracy: 0.7471 - loss: 0.4862 - val_accuracy: 0.7720 - val_loss: 0.4271 - val_f1: 0.7544
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 181ms/step
 - val_f1: 0.7745
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m283s[0m 667ms/step - accuracy: 0.7752 - loss: 0.4164 - val_accuracy: 0.7720 - val_loss: 0.4150 - val_f1: 0.7745
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 186ms/step
 - val_f1: 0.7767
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 669ms/step - accuracy: 0.7956 - loss: 0.3904 - val_accuracy: 0.7872 - val_loss: 0.4048 - val_f1: 0.7767
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 173ms/step
 - val_f1: 0.7773
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m

In [None]:
print("Training CNN with Word2Vec Embeddings...")
cnn_model = build_cnn_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(cnn_model, train_padded, train_labels, test_padded, test_labels)

Training CNN with Word2Vec Embeddings...
Epoch 1/10




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.7455
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 62ms/step - accuracy: 0.7287 - loss: 0.5791 - val_accuracy: 0.7897 - val_loss: 0.4091 - val_f1: 0.7455
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.8069
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 58ms/step - accuracy: 0.7999 - loss: 0.4054 - val_accuracy: 0.8160 - val_loss: 0.3857 - val_f1: 0.8069
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step
 - val_f1: 0.8186
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 66ms/step - accuracy: 0.8152 - loss: 0.3702 - val_accuracy: 0.8195 - val_loss: 0.3854 - val_f1: 0.8186
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.8092
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 62ms/step - ac

In [None]:
print("Training CNN with FastText Embeddings...")
cnn_model = build_cnn_model(vocab_size, embedding_matrix_ft, num_classes)
train_and_evaluate(cnn_model, train_padded, train_labels, test_padded, test_labels)

Training CNN with FastText Embeddings...
Epoch 1/10




[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step
 - val_f1: 0.6427
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 62ms/step - accuracy: 0.7141 - loss: 0.6985 - val_accuracy: 0.7498 - val_loss: 0.4588 - val_f1: 0.6427
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.6440
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 57ms/step - accuracy: 0.7494 - loss: 0.4668 - val_accuracy: 0.7505 - val_loss: 0.4421 - val_f1: 0.6440
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.7338
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 61ms/step - accuracy: 0.7609 - loss: 0.4383 - val_accuracy: 0.7695 - val_loss: 0.4333 - val_f1: 0.7338
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step
 - val_f1: 0.6432
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 58ms/step - ac

### Tokenized

In [None]:
label_encoder = LabelEncoder()
depression_df["label"] = label_encoder.fit_transform(processed_df["depression_class"])
num_classes = len(label_encoder.classes_)

In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(depression_df["tokens"], depression_df["label"], test_size=0.2, random_state=64)

# Train Word2Vec and FastText models
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts.apply(' '.join))  # Join tokens back to text for the tokenizer
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Create embedding matrices
embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

max_len = 100  # Max length for padding

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts.apply(' '.join))  # Join tokens for sequences
test_sequences = tokenizer.texts_to_sequences(test_texts.apply(' '.join))

# Padding sequences
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Convert labels to categorical
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

# Train Bi-LSTM with Word2Vec embeddings
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

Training Bi-LSTM with Word2Vec Embeddings...
Epoch 1/10




[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 84ms/step
 - val_f1: 0.7724
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 314ms/step - accuracy: 0.7491 - loss: 0.4769 - val_accuracy: 0.7653 - val_loss: 0.4281 - val_f1: 0.7724
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 84ms/step
 - val_f1: 0.7987
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 313ms/step - accuracy: 0.7780 - loss: 0.4067 - val_accuracy: 0.7987 - val_loss: 0.4005 - val_f1: 0.7987
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 96ms/step
 - val_f1: 0.8063
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 304ms/step - accuracy: 0.8057 - loss: 0.3854 - val_accuracy: 0.8169 - val_loss: 0.3807 - val_f1: 0.8063
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 85ms/step
 - val_f1: 0.7890
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 305ms/

## Teenager / Non-Teenager


In [None]:
teenager_df['teenager_class'] = teenager_df['class'].apply(lambda x: 'teenager' if x == 'teenagers' else 'nonteenager')

In [None]:
# Encode labels
label_encoder = LabelEncoder()
teenager_df["label"] = label_encoder.fit_transform(teenager_df["teenager_class"])
num_classes = len(label_encoder.classes_)

### Vectorization/Embedding

In [None]:
# Initialize tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Padding sequences to have the same length
max_len = 200  # Max length for padding
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Vocabulary size
vocab_size = len(word_index) + 1

In [None]:
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

In [None]:
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

Training Bi-LSTM with Word2Vec Embeddings...
Epoch 1/10




[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 191ms/step
 - val_f1: 0.7761
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 604ms/step - accuracy: 0.7602 - loss: 0.4726 - val_accuracy: 0.7922 - val_loss: 0.4083 - val_f1: 0.7761
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 175ms/step
 - val_f1: 0.7686
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 595ms/step - accuracy: 0.7960 - loss: 0.4024 - val_accuracy: 0.7891 - val_loss: 0.3990 - val_f1: 0.7686
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 189ms/step
 - val_f1: 0.7953
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 592ms/step - accuracy: 0.8080 - loss: 0.3873 - val_accuracy: 0.8034 - val_loss: 0.3856 - val_f1: 0.7953
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 176ms/step
 - val_f1: 0.7965
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m

In [None]:
print("Training Bi-LSTM with FastText Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_ft, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

Training Bi-LSTM with FastText Embeddings...
Epoch 1/10




[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 177ms/step
 - val_f1: 0.7654
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 596ms/step - accuracy: 0.7525 - loss: 0.4873 - val_accuracy: 0.7722 - val_loss: 0.4323 - val_f1: 0.7654
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 177ms/step
 - val_f1: 0.7682
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 605ms/step - accuracy: 0.7839 - loss: 0.4079 - val_accuracy: 0.7750 - val_loss: 0.4535 - val_f1: 0.7682
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 191ms/step
 - val_f1: 0.7843
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m278s[0m 646ms/step - accuracy: 0.7914 - loss: 0.4023 - val_accuracy: 0.8016 - val_loss: 0.4325 - val_f1: 0.7843
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 176ms/step
 - val_f1: 0.7626
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m

In [None]:
print("Training CNN with Word2Vec Embeddings...")
cnn_model = build_cnn_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(cnn_model, train_padded, train_labels, test_padded, test_labels)

Training CNN with Word2Vec Embeddings...
Epoch 1/10




[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.7940
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 57ms/step - accuracy: 0.7352 - loss: 0.5898 - val_accuracy: 0.7912 - val_loss: 0.4059 - val_f1: 0.7940
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step
 - val_f1: 0.8070
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 57ms/step - accuracy: 0.8074 - loss: 0.3904 - val_accuracy: 0.8103 - val_loss: 0.3935 - val_f1: 0.8070
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.8138
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 58ms/step - accuracy: 0.8265 - loss: 0.3597 - val_accuracy: 0.8234 - val_loss: 0.3824 - val_f1: 0.8138
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.8082
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 62ms/step - ac

In [None]:
print("Training CNN with FastText Embeddings...")
cnn_model = build_cnn_model(vocab_size, embedding_matrix_ft, num_classes)
train_and_evaluate(cnn_model, train_padded, train_labels, test_padded, test_labels)

Training CNN with FastText Embeddings...
Epoch 1/10




[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.6408
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 57ms/step - accuracy: 0.7052 - loss: 0.8155 - val_accuracy: 0.7484 - val_loss: 0.4657 - val_f1: 0.6408
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step
 - val_f1: 0.6408
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 54ms/step - accuracy: 0.7511 - loss: 0.4715 - val_accuracy: 0.7484 - val_loss: 0.4603 - val_f1: 0.6408
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step
 - val_f1: 0.7165
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 60ms/step - accuracy: 0.7545 - loss: 0.4477 - val_accuracy: 0.7600 - val_loss: 0.4434 - val_f1: 0.7165
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step
 - val_f1: 0.7633
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 59ms/step - ac

In [None]:
print("Training RNN with Word2Vec Embeddings...")
rnn_model = build_rnn_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(rnn_model, train_padded, train_labels, test_padded, test_labels)

Training RNN with Word2Vec Embeddings...
Epoch 1/10




[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 107ms/step
 - val_f1: 0.6494
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 339ms/step - accuracy: 0.7350 - loss: 0.5581 - val_accuracy: 0.7509 - val_loss: 0.5191 - val_f1: 0.6494
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 100ms/step
 - val_f1: 0.7195
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 339ms/step - accuracy: 0.7595 - loss: 0.5219 - val_accuracy: 0.7525 - val_loss: 0.4787 - val_f1: 0.7195
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 101ms/step
 - val_f1: 0.7231
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 338ms/step - accuracy: 0.7638 - loss: 0.4963 - val_accuracy: 0.7681 - val_loss: 0.4679 - val_f1: 0.7231
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 95ms/step
 - val_f1: 0.8087
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 

### Tokenized

In [None]:
label_encoder = LabelEncoder()
processed_df["label"] = label_encoder.fit_transform(processed_df["teenager_class"])
num_classes = len(label_encoder.classes_)

In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(depression_df["tokens"], depression_df["label"], test_size=0.2, random_state=64)

# Train Word2Vec and FastText models
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts.apply(' '.join))  # Join tokens back to text for the tokenizer
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Create embedding matrices
embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

max_len = 100  # Max length for padding

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts.apply(' '.join))  # Join tokens for sequences
test_sequences = tokenizer.texts_to_sequences(test_texts.apply(' '.join))

# Padding sequences
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Convert labels to categorical
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

# Train Bi-LSTM with Word2Vec embeddings
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

Training Bi-LSTM with Word2Vec Embeddings...
Epoch 1/10




[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 83ms/step
 - val_f1: 0.7885
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 309ms/step - accuracy: 0.7518 - loss: 0.4708 - val_accuracy: 0.7841 - val_loss: 0.4099 - val_f1: 0.7885
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 98ms/step
 - val_f1: 0.7844
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 303ms/step - accuracy: 0.7957 - loss: 0.3935 - val_accuracy: 0.7741 - val_loss: 0.4236 - val_f1: 0.7844
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 97ms/step
 - val_f1: 0.7760
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 305ms/step - accuracy: 0.8237 - loss: 0.3580 - val_accuracy: 0.8019 - val_loss: 0.4104 - val_f1: 0.7760
Epoch 4/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 98ms/step
 - val_f1: 0.8206
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 308m