## **Mount Drive**

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from, and the trained
# model is saved to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers



In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

In [None]:
# Load the labelled posts from Drive and preview the first rows.
df = pd.read_csv('/content/drive/MyDrive/Suicide_detection/Suicide_level_detection.csv')
df.head()

Unnamed: 0,text,class,age,level
0,Ex Wife Threatening SuicideRecently I left my ...,suicide,25,level2
1,i need helpjust help me im crying so hard,suicide,42,level2
2,"I’m so lostHello, my name is Adam (16) and I’v...",suicide,20,level1
3,Honetly idkI dont know what im even doing here...,suicide,36,level3
4,[Trigger warning] Excuse for self inflicted bu...,suicide,39,level3


In [None]:
level_map = {'level1': 0, 'level2': 1, 'level3': 2}

# Map the string labels in the 'level' column to integer class ids (0-2).
# The guard makes this cell safe to re-run: pushing already-converted integers
# through level_map again would silently turn every label into NaN.
if not pd.api.types.is_integer_dtype(df['level']):
    mapped_levels = df['level'].map(level_map)
    unmapped = df.loc[mapped_levels.isna(), 'level'].unique()
    if len(unmapped) > 0:
        # Fail here with a clear message instead of later, when the labels
        # are used as array indices for one-hot encoding.
        raise ValueError(f"Unexpected values in 'level' column: {list(unmapped)}")
    df['level'] = mapped_levels.astype('int64')

df.sample(5)

Unnamed: 0,text,class,age,level
80660,"If something happens to my cat, I’m killing my...",suicide,48,0
51892,I have been having some thoughts...Is life rea...,suicide,17,1
65380,Everything is getting to be too much and I'm g...,suicide,23,0
53540,This point of no returnI just refused my new j...,suicide,35,2
8463,Scared of covid 19.I am surely not the only on...,suicide,39,2


In [None]:
# Column dtypes and non-null counts.
# NOTE(review): the recorded output lists 'level' as object dtype, which suggests this
# cell was last executed before the label mapping cell — re-run top to bottom to confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116037 entries, 0 to 116036
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    116037 non-null  object
 1   class   116037 non-null  object
 2   age     116037 non-null  int64 
 3   level   116037 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.5+ MB


### **Data Preparation**

In [None]:
# Class balance check: number of rows per level id.
df['level'].value_counts()

0    38902
1    38614
2    38521
Name: level, dtype: int64

In [None]:
# Load the pretrained tokenizer for the 'bert-base-cased' checkpoint
# (the same checkpoint name is used for the BERT model in the Model section).
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Sanity check on the first post: tokenize it with the same settings used for the
# full dataset (pad/truncate to 256 tokens, special tokens added, TensorFlow tensors).
token = tokenizer.encode_plus(
    df['text'].iloc[0],
    max_length=256,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors='tf'
)

In [None]:
# Token ids of the sample post: a (1, 256) int32 tensor.
token.input_ids

<tf.Tensor: shape=(1, 256), dtype=int32, numpy=
array([[  101, 16409, 15256,   157,  8167, 13448,  4777, 25067,  2069,
        10294, 23680,   146,  1286,  1139,  1676,  1111,  1363,  1272,
         1131,  1144, 25695,  1113,  1143,  3059,  1105, 10118,  1106,
         1143,  1177,  1277,  1115,   146,  1138,  1879,  1106, 10250,
         1106,  1301,  1171,  1106,  1123,   119,  1249,  1104,   170,
         1374,  1552,  2403,   117,  1131,  1310,  8400,  5680,   119,
          146,  1138, 15269,  8709,  2097,  1292,   185, 22118,  1204,
         1374,  1552,  2520,  1123,  1149,  1104,  1122,  1105,  1131,
         7634,  1119,  5053, 24558,  1272,  1131,  3349,  1106,  2059,
          146,   112,  1325,  1435,  1171,   119,   146,  1221,   170,
         1974,  1104,  1234,  1209, 16757,  1142,  1107,  1546,  1106,
         1243,  1147,  1236,   117,  1133,  1184,  5940,  1191,  1131,
         1541,  1674,   136,  1327,  1202,   146,  1202,  1105,  1293,
         1821,   146,  3155, 

In [None]:
# Pre-allocate one row of 256 slots per post for the token ids and the attention mask;
# the rows are filled in by generate_training_data.
# np.zeros defaults to float64, which is the dtype tf.data reports for these arrays later on.
X_input_ids = np.zeros((len(df), 256))
X_attn_masks = np.zeros((len(df), 256))

In [None]:
def generate_training_data(df, ids, masks, tokenizer):
    """Tokenize every post in ``df['text']`` and write the result into ``ids`` and ``masks``.

    Args:
        df: DataFrame with a ``text`` column; the i-th text (by position) fills row ``i``.
        ids: Pre-allocated 2-D array that receives the token ids. Modified in place.
        masks: Pre-allocated 2-D array (same shape as ``ids``) that receives the
            attention masks. Modified in place.
        tokenizer: Tokenizer exposing ``encode_plus`` (a ``BertTokenizer`` in this notebook).

    Returns:
        The same ``ids`` and ``masks`` objects, now filled.
    """
    # Pad/truncate to the width of the target array so the two can never disagree.
    max_length = ids.shape[1]
    texts = df['text']
    # Wrap the column itself (not the enumerate generator) and pass the total,
    # otherwise tqdm cannot know the length and shows neither percentage nor ETA.
    for i, text in enumerate(tqdm(texts, total=len(texts))):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        # encode_plus returns (1, max_length) tensors; each is stored as one row.
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [None]:
# Tokenize the whole dataset; the pre-allocated arrays are filled in place and returned.
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [None]:
# Target matrix: one row per post, one column per level class (3 classes), all zeros for now.
labels = np.zeros((len(df), 3))
labels.shape

(116037, 3)

In [None]:
# For each row, set the column given by that row's integer level id to 1
# (df['level'] must already hold the integer ids 0-2 for this indexing to work).
labels[np.arange(len(df)), df['level'].values] = 1 # one-hot encoded target tensor

In [None]:
# Inspect the one-hot encoded targets.
labels

array([[0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [None]:
# Build a tf.data pipeline: from_tensor_slices yields one (input_ids, attention_mask, label)
# triple per row of the arrays; batching is applied in a later step.
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1) # one sample data

<_TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(3,), dtype=tf.float64, name=None))>

In [None]:
def levelDatasetMapFunction(input_ids, attn_masks, labels):
    """Repack a flat (ids, masks, labels) element as a (features, labels) pair.

    The feature dict is keyed by 'input_ids' and 'attention_mask', the names
    given to the Keras input layers of the model defined in this notebook.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [None]:
# Repack every element as (features dict, labels), the layout the datasets passed to fit() use.
dataset = dataset.map(levelDatasetMapFunction)

In [None]:
# Inspect the element spec after the mapping step.
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(3,), dtype=tf.float64, name=None))>

In [None]:
# Shuffle individual samples (buffer of 10000), then group them into batches of 16;
# drop_remainder discards the trailing partial batch.
# reshuffle_each_iteration=False keeps the shuffled order identical on every pass over the
# dataset. The train/validation split is made with take()/skip() on these batches, so with
# the default (reshuffling on every iteration) samples would drift between the two splits
# and leak training data into validation.
# Trade-off: training batches are no longer reshuffled between epochs.
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(16, drop_remainder=True)

In [None]:
# Inspect the element spec after batching: every tensor gains a leading batch dimension of 16.
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 256), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(16, 256), dtype=tf.float64, name=None)}, TensorSpec(shape=(16, 3), dtype=tf.float64, name=None))>

In [None]:
# Fraction of the batches used for training.
p = 0.8
# len(df)//16 is the number of full batches of 16 samples; p of those batches go to training.
train_size = int((len(df)//16)*p)

In [None]:
# Number of batches in the training split.
train_size

5801

In [None]:
# The first train_size batches form the training split; the remaining batches form validation.
# NOTE(review): both splits re-iterate the same upstream shuffle, so they only stay disjoint
# if that shuffle yields the same order on every iteration
# (reshuffle_each_iteration=False) — verify.
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

### **Model**

In [None]:
from transformers import TFBertModel

In [None]:
# Load the 'bert-base-cased' checkpoint, the same one the tokenizer was loaded from.
model = TFBertModel.from_pretrained('bert-base-cased') # bert base model with pretrained weights

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# Two int32 input layers of length 256; their names match the keys of the dataset's feature dict.
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

# Classification head on top of BERT: pooled output -> Dense(512, relu) -> Dense(3, softmax).
bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # index 0 -> last hidden state (3D), index 1 -> pooled output (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(3, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> one probability per level class

level_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
level_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 256)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 256)]                0         []                            
 )                                                                                                
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1083102   ['input_ids[0][0]',           
                             ngAndCrossAttentions(last_   72         'attention_mask[0][0]']      
                             hidden_state=(None, 256, 7                                     

In [None]:
# Adam with a small learning rate and decay; the categorical cross-entropy loss and
# categorical accuracy metric pair with the one-hot targets and the softmax output layer.
optim = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [None]:
# Attach the optimizer, loss and metric configured above.
level_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [47]:
# Train for 2 epochs on the training batches, evaluating on the validation batches;
# the returned History object is kept in `hist`.
hist = level_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2
)

Epoch 1/2
Epoch 2/2


In [48]:
# Save the trained model to Drive; the Prediction section reloads it from this path.
level_model.save('/content/drive/MyDrive/Suicide_detection/level_model')

### **Prediction**

In [49]:
# Reload the saved model from Drive and recreate the tokenizer used for inference.
level_model = tf.keras.models.load_model('/content/drive/MyDrive/Suicide_detection/level_model')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    """Tokenize one text and return the model's feature dict.

    The text is encoded with the same settings used for the training data
    (256 tokens, padded/truncated, special tokens added, TensorFlow tensors).

    Returns:
        Dict with 'input_ids' and 'attention_mask', both cast to tf.float64.
    """
    encoding = tokenizer.encode_plus(
        input_text,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors='tf',
    )
    # NOTE(review): float64 matches the dtype of the arrays the model was trained on;
    # confirm the reloaded model actually requires it.
    model_inputs = {}
    for key in ('input_ids', 'attention_mask'):
        model_inputs[key] = tf.cast(getattr(encoding, key), tf.float64)
    return model_inputs

def make_prediction(model, processed_data, classes=('level1', 'level2', 'level3')):
    """Return the label of the most probable class for a single prepared input.

    Args:
        model: Object whose ``predict`` method returns one row of class scores per input.
        processed_data: Model inputs, e.g. the dict built by ``prepare_data``.
        classes: Labels ordered by class index. The default is an immutable tuple so
            the shared default object can never be mutated between calls; any
            indexable sequence (list, tuple, ...) is accepted.

    Returns:
        The entry of ``classes`` at the argmax of the first prediction row.
    """
    # predict returns one row per input; only the first row is scored.
    probs = model.predict(processed_data)[0]
    return classes[int(np.argmax(probs))]

In [50]:
# Interactive check: read a post from the prompt, tokenize it and print the predicted level.
input_text = input('Enter comments here: ')
processed_data = prepare_data(input_text, tokenizer)
result = make_prediction(level_model, processed_data=processed_data)
print(f"Predicted Level: {result}")

Enter comments here: Ex Wife Threatening SuicideRecently I left my wife for good because she has cheated on me twice and lied to me so much that I have decided to refuse to go back to her. As of a few days ago, she began threatening suicide. I have tirelessly spent these paat few days talking her out of it and she keeps hesitating because she wants to believe I'll come back. I know a lot of people will threaten this in order to get their way, but what happens if she really does? What do I do and how am I supposed to handle her death on my hands? I still love my wife but I cannot deal with getting cheated on again and constantly feeling insecure. I'm worried today may be the day she does it and I hope so much it doesn't happen.
Predicted Level: level3
