<a href="https://colab.research.google.com/github/Ph1lipXu/Machine-Learning-on-Suicide-and-Depression-Detection/blob/main/Modeling_16000.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [91]:
!pip install tensorflow nltk scikit-learn
!pip install gensim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, SimpleRNN, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import gensim
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

pd.options.display.max_columns = 20
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.set_printoptions(precision = 4, suppress = True)



In [92]:
!wget https://raw.githubusercontent.com/Ph1lipXu/Machine-Learning-on-Suicide-and-Depression-Detection/refs/heads/main/data/cleaned_data_16000.csv

--2025-04-11 20:54:30--  https://raw.githubusercontent.com/Ph1lipXu/Machine-Learning-on-Suicide-and-Depression-Detection/refs/heads/main/data/cleaned_data_16000.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36085156 (34M) [text/plain]
Saving to: ‘cleaned_data_16000.csv.2’


2025-04-11 20:54:31 (188 MB/s) - ‘cleaned_data_16000.csv.2’ saved [36085156/36085156]



In [93]:
df = pd.read_csv("/content/cleaned_data_16000.csv")
df.head()

Unnamed: 0,text,class,tokens
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression,"['can', 'I', 'get', 'some', 'support', 'please', 'so', 'I', 'be', 'not', 'as..."
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression,"['everything', 'be', 'go', 'wrong', 'have', 'be', 'try', 'not', 'to', 'drink..."
2,i am done fighting it.*gone*,depression,"['I', 'be', 'do', 'fight', 'it', 'go']"
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression,"['today', 'I', 'cut', 'my', 'hairmy', 'hair', 'have', 'always', 'be', 'a', '..."
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression,"['I', 'do', 'not', 'know', 'what', 'to', 'do', 'and', 'I', 'have', 'no', 'ho..."


# Modeling

In [94]:
df['suicide_class'] = df['class'].apply(lambda x: 'suicide' if x == 'SuicideWatch' else 'nonsuicide')
df['depression_class'] = df['class'].apply(lambda x: 'depression' if x == 'depression' else 'nondepression')
df['teenager_class'] = df['class'].apply(lambda x: 'teenager' if x == 'teenagers' else 'nonteenager')

In [95]:
df.head(10)

Unnamed: 0,text,class,tokens,suicide_class,depression_class,teenager_class
0,can i get some support please...so i am not as depressed as i used to be (i ...,depression,"['can', 'I', 'get', 'some', 'support', 'please', 'so', 'I', 'be', 'not', 'as...",nonsuicide,depression,nonteenager
1,"everything is going wrong .i have been trying not to drink, but everyone is ...",depression,"['everything', 'be', 'go', 'wrong', 'have', 'be', 'try', 'not', 'to', 'drink...",nonsuicide,depression,nonteenager
2,i am done fighting it.*gone*,depression,"['I', 'be', 'do', 'fight', 'it', 'go']",nonsuicide,depression,nonteenager
3,today i cut my hairmy hair has always been a thick mess of curls that went a...,depression,"['today', 'I', 'cut', 'my', 'hairmy', 'hair', 'have', 'always', 'be', 'a', '...",nonsuicide,depression,nonteenager
4,i do not know what to do and i have no hopes for the future.it is kinda toug...,depression,"['I', 'do', 'not', 'know', 'what', 'to', 'do', 'and', 'I', 'have', 'no', 'ho...",nonsuicide,depression,nonteenager
5,"tired of life, tired of living. do not know what to do hey guys, \n\ni am 16...",depression,"['tired', 'of', 'life', 'tired', 'of', 'live', 'do', 'not', 'know', 'what', ...",nonsuicide,depression,nonteenager
6,what is one concrete thing that has helped you in your battle against depres...,depression,"['what', 'be', 'one', 'concrete', 'thing', 'that', 'have', 'help', 'you', 'i...",nonsuicide,depression,nonteenager
7,does mental health go hand in hand with the physical health?when i feel at m...,depression,"['do', 'mental', 'health', 'go', 'hand', 'in', 'hand', 'with', 'the', 'physi...",nonsuicide,depression,nonteenager
8,the thing that hurts the most is knowing that i have been through worse.when...,depression,"['the', 'thing', 'that', 'hurt', 'the', 'most', 'be', 'know', 'that', 'I', '...",nonsuicide,depression,nonteenager
9,need someone to talk toi am a guy in high school and i just need to talk to ...,depression,"['need', 'someone', 'to', 'talk', 'toi', 'be', 'a', 'guy', 'in', 'high', 'sc...",nonsuicide,depression,nonteenager


## Suicide / Non-Suicide

In [96]:
# Encode labels
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["suicide_class"])
num_classes = len(label_encoder.classes_)

In [97]:
from sklearn.model_selection import train_test_split

# Instead of random train test split, stratify by class
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["tokens"], df["label"], random_state=64, stratify=df['label']
)
print('Training data: ',len(train_texts))
print('Testing data: ',len(test_texts))

Training data:  12000
Testing data:  4000


In [98]:
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

In [99]:
print(train_texts.head(10))

1435     ['anyone', 'else', 'feel', 'like', 'this', 'I', 'have', 'be', 'cope', 'with'...
1368     ['lose', 'my', 'sense', 'of', 'realityit', 'be', 'another', 'one', 'of', 'th...
5863                                          ['good', 'way', 'to', 'commit', 'suicide']
8929     ['I', 'be', 'bakk', 'you', 'lousy', 'son', 'of', 'bitch', 'thoughught', 'I',...
15448    ['firstly', 'what', 'be', 'your', 'thoughught', 'on', 'the', 'titular', 'cha...
11663    ['just', 'realize', 'how', 'close', 'my', 'teenage', 'year', 'be', 'to', 'fi...
3897     ['just', 'a', 'thoughught', 'that', 'enter', 'my', 'mind', 'lie', 'in', 'bed...
4635     ['why', 'when', 'I', 'die', 'the', 'world', 'will', 'not', 'stop', 'spin', '...
14158    ['we', 'just', 'have', 'our', 'concrete', 'foundation', 'reinforce', 'so', '...
13454    ['my', 'wife', 'be', 'italian', 'and', 'my', 'do', 'not', 'speak', 'very', '...
Name: tokens, dtype: object


### Vectorization/Embedding

In [100]:
# Initialize tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Padding sequences to have the same length
max_len = 200  # Max length for padding
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Vocabulary size
vocab_size = len(word_index) + 1

In [None]:
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]



In [None]:
from tensorflow.keras.callbacks import Callback
from sklearn.metrics import f1_score
class F1ScoreCallback(Callback):
    def __init__(self, validation_data):
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        val_data, val_labels = self.validation_data
        val_preds = self.model.predict(val_data)
        val_preds = np.argmax(val_preds, axis=1)  # Convert probabilities to class labels
        val_labels = np.argmax(val_labels, axis=1)  # Convert one-hot labels to class labels

        f1 = f1_score(val_labels, val_preds, average='weighted')  # Change to 'macro' if needed
        print(f" - val_f1: {f1:.4f}")
        logs["val_f1"] = f1  # Store it in logs if needed

In [None]:
def build_cnn_model(vocab_size, embedding_matrix, num_classes):
    model = Sequential([
        Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False),
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_rnn_model(vocab_size, embedding_matrix, num_classes):
    model = Sequential([
        Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False),
        LSTM(128, return_sequences=False),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_bilstm_model(vocab_size, embedding_matrix, num_classes):
    model = Sequential([
        Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_len, trainable=False),
        Bidirectional(LSTM(128)),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def train_and_evaluate(model, train_padded, train_labels, test_padded, test_labels, epochs=10, batch_size=32):
    f1_callback = F1ScoreCallback(validation_data=(test_padded, test_labels))

    model.fit(train_padded, train_labels,
              epochs=epochs, batch_size=batch_size,
              validation_data=(test_padded, test_labels),
              callbacks=[f1_callback])

    loss, acc = model.evaluate(test_padded, test_labels)
    print(f"Test Accuracy: {acc:.4f}")

In [None]:
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

In [None]:
print("Training Bi-LSTM with FastText Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_ft, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

In [None]:
print("Training CNN with Word2Vec Embeddings...")
cnn_model = build_cnn_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(cnn_model, train_padded, train_labels, test_padded, test_labels)

In [None]:
print("Training CNN with FastText Embeddings...")
cnn_model = build_cnn_model(vocab_size, embedding_matrix_ft, num_classes)
train_and_evaluate(cnn_model, train_padded, train_labels, test_padded, test_labels)

### Tokenized


In [None]:
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["suicide_class"])
num_classes = len(label_encoder.classes_)

In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df["tokens"], df["label"], test_size=0.2, random_state=64)

# Train Word2Vec and FastText models
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts.apply(' '.join))  # Join tokens back to text for the tokenizer
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Create embedding matrices
embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

max_len = 100  # Max length for padding

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts.apply(' '.join))  # Join tokens for sequences
test_sequences = tokenizer.texts_to_sequences(test_texts.apply(' '.join))

# Padding sequences
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Convert labels to categorical
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

# Train Bi-LSTM with Word2Vec embeddings
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df["tokens"], df["label"], test_size=0.2, random_state=64)

# Train Word2Vec and FastText models
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts.apply(' '.join))  # Join tokens back to text for the tokenizer
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Create embedding matrices
embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

max_len = 100  # Max length for padding

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts.apply(' '.join))  # Join tokens for sequences
test_sequences = tokenizer.texts_to_sequences(test_texts.apply(' '.join))

# Padding sequences
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Convert labels to categorical
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

# Train Bi-LSTM with Word2Vec embeddings
print("Training RNN with Word2Vec Embeddings...")
bilstm_model = build_rnn_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

## Depression / Non-Depression

In [None]:
# df['depression_class'] = depression_df['class'].apply(lambda x: 'depression' if x == 'depression' else 'nondepression')

In [None]:
# Encode labels
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["depression_class"])
num_classes = len(label_encoder.classes_)

In [None]:
from sklearn.model_selection import train_test_split

# Instead of random train test split, stratify by class
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["tokens"], df["label"], random_state=64, stratify=df['label']
)

In [None]:
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

### Vectorization/Embedding

In [None]:
# Initialize tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Padding sequences to have the same length
max_len = 200  # Max length for padding
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Vocabulary size
vocab_size = len(word_index) + 1

In [None]:
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

In [None]:
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

In [None]:
print("Training Bi-LSTM with FastText Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_ft, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

In [None]:
print("Training CNN with Word2Vec Embeddings...")
cnn_model = build_cnn_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(cnn_model, train_padded, train_labels, test_padded, test_labels)

In [None]:
print("Training CNN with FastText Embeddings...")
cnn_model = build_cnn_model(vocab_size, embedding_matrix_ft, num_classes)
train_and_evaluate(cnn_model, train_padded, train_labels, test_padded, test_labels)

### Tokenized

In [None]:
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["depression_class"])
num_classes = len(label_encoder.classes_)

In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df["tokens"], df["label"], test_size=0.2, random_state=64)

# Train Word2Vec and FastText models
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts.apply(' '.join))  # Join tokens back to text for the tokenizer
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Create embedding matrices
embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

max_len = 100  # Max length for padding

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts.apply(' '.join))  # Join tokens for sequences
test_sequences = tokenizer.texts_to_sequences(test_texts.apply(' '.join))

# Padding sequences
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Convert labels to categorical
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

# Train Bi-LSTM with Word2Vec embeddings
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df["tokens"], df["label"], test_size=0.2, random_state=64)

# Train Word2Vec and FastText models
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts.apply(' '.join))  # Join tokens back to text for the tokenizer
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Create embedding matrices
embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

max_len = 100  # Max length for padding

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts.apply(' '.join))  # Join tokens for sequences
test_sequences = tokenizer.texts_to_sequences(test_texts.apply(' '.join))

# Padding sequences
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Convert labels to categorical
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

# Train Bi-LSTM with Word2Vec embeddings
print("Training RNN with Word2Vec Embeddings...")
bilstm_model = build_rnn_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

## Teenager / Non-Teenager


In [None]:
df['teenager_class'] = df['class'].apply(lambda x: 'teenager' if x == 'teenagers' else 'nonteenager')

In [None]:
# Encode labels
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["teenager_class"])
num_classes = len(label_encoder.classes_)

### Vectorization/Embedding

In [None]:
# Initialize tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Padding sequences to have the same length
max_len = 200  # Max length for padding
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Vocabulary size
vocab_size = len(word_index) + 1

In [None]:
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

In [None]:
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

In [None]:
print("Training Bi-LSTM with FastText Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_ft, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)

In [None]:
print("Training CNN with Word2Vec Embeddings...")
cnn_model = build_cnn_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(cnn_model, train_padded, train_labels, test_padded, test_labels)

In [None]:
print("Training CNN with FastText Embeddings...")
cnn_model = build_cnn_model(vocab_size, embedding_matrix_ft, num_classes)
train_and_evaluate(cnn_model, train_padded, train_labels, test_padded, test_labels)

In [None]:
print("Training RNN with Word2Vec Embeddings...")
rnn_model = build_rnn_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(rnn_model, train_padded, train_labels, test_padded, test_labels)

### Tokenized

In [None]:
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["teenager_class"])
num_classes = len(label_encoder.classes_)

In [None]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df["tokens"], df["label"], test_size=0.2, random_state=64)

# Train Word2Vec and FastText models
word2vec_model = gensim.models.Word2Vec(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)
fasttext_model = gensim.models.FastText(sentences=train_texts.tolist(), vector_size=100, window=5, min_count=1, workers=4)

# Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts.apply(' '.join))  # Join tokens back to text for the tokenizer
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Create embedding matrices
embedding_matrix_w2v = np.zeros((vocab_size, 100))
embedding_matrix_ft = np.zeros((vocab_size, 100))

for word, i in word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix_w2v[i] = word2vec_model.wv[word]
    if word in fasttext_model.wv:
        embedding_matrix_ft[i] = fasttext_model.wv[word]

max_len = 100  # Max length for padding

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts.apply(' '.join))  # Join tokens for sequences
test_sequences = tokenizer.texts_to_sequences(test_texts.apply(' '.join))

# Padding sequences
train_padded = pad_sequences(train_sequences, maxlen=max_len, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Convert labels to categorical
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

# Train Bi-LSTM with Word2Vec embeddings
print("Training Bi-LSTM with Word2Vec Embeddings...")
bilstm_model = build_bilstm_model(vocab_size, embedding_matrix_w2v, num_classes)
train_and_evaluate(bilstm_model, train_padded, train_labels, test_padded, test_labels)