In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/suicidal-mental-health-dataset/mental-health.csv
/kaggle/input/f_net/keras/f_net_base_en/2/config.json
/kaggle/input/f_net/keras/f_net_base_en/2/tokenizer.json
/kaggle/input/f_net/keras/f_net_base_en/2/metadata.json
/kaggle/input/f_net/keras/f_net_base_en/2/model.weights.h5
/kaggle/input/f_net/keras/f_net_base_en/2/assets/tokenizer/vocabulary.spm


In [100]:
import tensorflow as tf
import keras
from keras import layers, optimizers
from keras_hub.models import FNetTokenizer
from keras_hub.layers import TokenAndPositionEmbedding
from keras_hub.layers import FNetEncoder

## Dataset: EDA and Cleaning

In [3]:
df = pd.read_csv('/kaggle/input/suicidal-mental-health-dataset/mental-health.csv')
df.head()

Unnamed: 0,text,label
0,I recently went through a breakup and she said...,depression
1,"I do not know how to navigate these feelings, ...",depression
2,"So I have been with my bf for 5 months , and h...",depression
3,I am so exhausted of this. Just when I think I...,SuicideWatch
4,I have been severly bullied since i was 5 till...,depression


In [4]:
df['label'] = df['label'].str.replace('SuicideWatch', 'suicidal')
df['label'] = df['label'].str.lower()

In [5]:
binarized = []
for label in df['label']:
    if label == 'suicidal':
        binarized.append(1)
    else:
        binarized.append(0)

binarized = pd.Series(binarized, name='label')
df = df.drop('label', axis=1)
df = pd.concat([df, binarized], axis=1)

In [6]:
df.isna().any().any()

False

In [7]:
df.head()

Unnamed: 0,text,label
0,I recently went through a breakup and she said...,0
1,"I do not know how to navigate these feelings, ...",0
2,"So I have been with my bf for 5 months , and h...",0
3,I am so exhausted of this. Just when I think I...,1
4,I have been severly bullied since i was 5 till...,0


### Split into Train/Val/Test Datasets

In [8]:
from sklearn.model_selection import train_test_split


In [69]:
train_df, other = train_test_split(
    df,
    test_size = 0.1,
    random_state = 9500,
    shuffle = True,
    stratify = df['label'],
)

In [70]:
val_df, test_df = train_test_split(
    other,
    test_size = 0.5,
    random_state = 9500,
    shuffle = True,
    stratify = other['label'],
)

In [71]:
train_df['text'].apply(lambda x: len(x.split(" "))).describe()


count    18327.000000
mean       166.967043
std        192.903055
min          1.000000
25%         56.000000
50%        110.000000
75%        211.000000
max       6300.000000
Name: text, dtype: float64

Convert to TensorFlow Dataset

In [72]:
train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_df['text'].values, train_df['label'].values)
)

In [73]:
val_dataset = tf.data.Dataset.from_tensor_slices(
    (val_df['text'].values, val_df['label'].values)
)

In [74]:
test_dataset = tf.data.Dataset.from_tensor_slices(
    (test_df['text'].values, test_df['label'].values)
)

In [75]:
train_dataset = train_dataset.map(lambda text, label: (tf.strings.lower(text), label))
val_dataset = val_dataset.map(lambda text, label: (tf.strings.lower(text), label))
test_dataset = test_dataset.map(lambda text, label: (tf.strings.lower(text), label))

In [76]:
BATCH_SIZE = 64
auto = tf.data.AUTOTUNE

In [77]:
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(auto)
val_dataset = val_dataset.batch(BATCH_SIZE).prefetch(auto)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(auto)

In [78]:
for text_batch, label_batch in train_dataset.take(1):
    for i in range(3):
        print(f"{text_batch.numpy()[i]}\n{label_batch.numpy()[i]}\n")

b'when you are talking to someone and know what they are saying is bs but you can not do anything about it because you are getting so confused by what they are saying so your mind goes in every direction and you are just standing there like "are you kidding" and then slowly drift into self-doubt and just give up.anyone knows that feeling? gaslighting seems worse when when you know it is happening'
0

b'my future looks bleak and my entire life has been a sick joke and right now i am in so much pain. all i am good at is self harming and pushing people away and suppressing my emotions. i need help but i am irreparably fucked up. i only feel good when i hurt myself really badly self destruct until i die'
1

b'i have been contemplating suicide more and more every day for years, and specifically over the last few months those thoughts have been escalating to where its not a matter of if but when. my method in which i plan to go out death by gun. i have a 9mm pistol already. i went out into a

## Tokenize the Text Dataset

In [79]:
tokenizer = FNetTokenizer.from_preset('f_net_base_en')

Test the tokenizer on some random sentences

In [80]:
input_sentence_ex = train_dataset.take(1).get_single_element()[0][0]
input_tokens_ex = tokenizer(input_sentence_ex)

print("Sentence: ", input_sentence_ex)
print("Tokens: ", input_tokens_ex)
print("Recovered text after detokenizing: ", tokenizer.detokenize(input_tokens_ex))

Sentence:  tf.Tensor(b'when you are talking to someone and know what they are saying is bs but you can not do anything about it because you are getting so confused by what they are saying so your mind goes in every direction and you are just standing there like "are you kidding" and then slowly drift into self-doubt and just give up.anyone knows that feeling? gaslighting seems worse when when you know it is happening', shape=(), dtype=string)
Tokens:  tf.Tensor(
[  381    60   108  3998    33  1874    36   465   395   269   108  3674
    65    28 16664   243    60   161   185   202  2038   292    96   661
    60   108  1503   215 11039   181   395   269   108  3674   215   135
  1517  2841    38   488  4086    36    60   108   408  5676   375   347
   369   250    60  5024  1412 16719    36   579  6909  1679  1566   416
  2030 16688 16668 11913 16659    36   408  1061   249 16678   549   258
  4599    88  3267 16724  3214  1729    29  2491  6144   381   381    60
   465    96    65  62

Tokenizer the Train/Val/test Datasets

In [81]:
def format_dataset(sentence, label):
    sentence = tokenizer(sentence)
    return ({"input_ids": sentence}, label)

In [82]:
def make_dataset(dataset):
    dataset = dataset.map(format_dataset, num_parallel_calls=auto)
    return dataset.shuffle(BATCH_SIZE * 8).prefetch(16).cache()

In [83]:
train_dataset = make_dataset(train_dataset)

In [84]:
val_dataset = make_dataset(val_dataset)

In [85]:
test_dataset = make_dataset(test_dataset)

In [86]:
print(f"Vocabulary size, {tokenizer.vocabulary_size()}")

Vocabulary size, 32000


## 12-Layer FNet Model

In [109]:
def FentSmoker():
    inputs = layers.Input(shape=(None,), dtype="int64", name="input_ids")
    x = TokenAndPositionEmbedding(
        vocabulary_size=tokenizer.vocabulary_size(),
        sequence_length=256,
        embedding_dim=128,
        mask_zero=True,
    )(inputs)
    x = FNetEncoder(512)(x)
    x = FNetEncoder(512)(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(0.1)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)

    return keras.Model(inputs, outputs, name="FentSmoker9500")

In [110]:
FS9500 = FentSmoker()



In [111]:
FS9500.summary()

In [112]:
FS9500.compile(
    optimizer = optimizers.Adam(learning_rate=5e-3),
    loss = "binary_crossentropy",
    metrics = ["accuracy"]
)

In [113]:
callbacks = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    verbose=1,
    mode="auto",
    restore_best_weights=True,
    start_from_epoch=2,
)

In [114]:
history = FS9500.fit(train_dataset, epochs=10, validation_data=val_dataset, callbacks=[callbacks])


Epoch 1/10


TypeError: Exception encountered when calling PositionEmbedding.call().

[1mExpected int32 passed to parameter 'size' of op 'Slice', got (None, 128) of type 'tuple' instead. Error: Expected int32, but got None of type 'NoneType'.[0m

Arguments received by PositionEmbedding.call():
  • inputs=tf.Tensor(shape=(None, None, 128), dtype=float32)
  • start_index=0

In [123]:
train_dataset.element_spec

({'input_ids': RaggedTensorSpec(TensorShape([None, None]), tf.int32, 1, tf.int64)},
 TensorSpec(shape=(None,), dtype=tf.int64, name=None))