In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
import neattext.functions as nfx
import plotly.express as plx
from sklearn.metrics import classification_report
import keras
from keras.layers import Embedding,Dense,LSTM,GlobalMaxPooling1D,Input
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from keras.models import Sequential
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm




In [2]:
data=pd.read_csv('Suicide_Detection1.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [3]:
data = data.dropna(subset=['class'])

In [4]:
data['class'].value_counts()

class
suicide        116033
non-suicide    116012
Name: count, dtype: int64

In [5]:
data['class'].value_counts().index.values

array(['suicide', 'non-suicide'], dtype=object)

In [6]:
train_data,test_data=train_test_split(data,test_size=0.2,random_state=10)

In [7]:
train_data['class'].value_counts().index.values

array(['suicide', 'non-suicide'], dtype=object)

# Data Visualisation

In [8]:
plx.bar(x=train_data['class'].value_counts().index.values,
        y=train_data['class'].value_counts(),color=['Suicide','Not Suicide'],
        labels={'x': 'Class', 'y': 'Count'},
        title='Distribution of Classes in Training Data')

# Data Cleaning

In [9]:
def clean_text(text):
    text_length=[]
    cleaned_text=[]
    for sent in tqdm(text):
        if isinstance(sent, str):
            sent=sent.lower()
            sent=nfx.remove_special_characters(sent)
            sent=nfx.remove_stopwords(sent)
            text_length.append(len(sent.split()))
            cleaned_text.append(sent)
        else:
            text_length.append(0)
            cleaned_text.append("")
    return cleaned_text,text_length


In [10]:
cleaned_train_text,train_text_length=clean_text(train_data.text)
cleaned_test_text,test_text_length=clean_text(test_data.text)

100%|██████████| 185636/185636 [00:10<00:00, 18270.35it/s]
100%|██████████| 46409/46409 [00:02<00:00, 23090.85it/s]


In [11]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(cleaned_train_text)

In [12]:
cleaned_train_text

['probably going jump bridge sooni dont know read post history cant longer feel angry helpless',
 'ate 1 hour stopping 1 hour ago hungry toast kept making hungry ate half thing looked food found half pack cereal',
 'saw counselor 4 june 2018 lot interesting things happenedi went counselor 4 june 2018 entered office able switch earphones didnt feel frightened yeahi told counselor medications changed caused tired easily breathless dont strength normal things daily routine dizzy sleepy tried concentrate drivingi told counselor recently tried blogging love blogging yeah hope small income blogging pi met stranger online reddit happened live singapore exchanged handphone numbers decided hiking coming thursday 7 june 2018 ask counselor help find topics talk yeah mild autism good socialrelated thingsin counseling session called case manager institute mental health imh singapore told effects new medication taking decided come outpatient clinic imh day inquire changing medicationin counseling se

In [13]:
train_text_seq=tokenizer.texts_to_sequences(cleaned_train_text)
train_text_pad=pad_sequences(train_text_seq,maxlen=40)


test_text_seq=tokenizer.texts_to_sequences(cleaned_test_text)
test_text_pad=pad_sequences(test_text_seq,maxlen=40)

In [14]:
train_text_pad

array([[    0,     0,     0, ...,     6,   368,  1975],
       [    0,     0,     0, ...,   285,  2385,  5575],
       [   10,    67,  2110, ...,   143,   123,    16],
       ...,
       [    7,    42,  1367, ..., 58112,   214, 73686],
       [  503,     8,  1559, ...,   686,  1511,   265],
       [    0,     0,     0, ...,  3336,  4487,   550]])

# Glove Embeddings

In [15]:
lbl_target=LabelEncoder()
train_output=lbl_target.fit_transform(train_data['class'])
test_output=lbl_target.transform(test_data['class'])

In [16]:
import pickle
with open('glove.840B.300d.pkl', 'rb') as fp:
    glove_embedding = pickle.load(fp)

In [17]:
v=len(tokenizer.word_index)

embedding_matrix=np.zeros((v+1,300), dtype=float)
for word,idx in tokenizer.word_index.items():
    embedding_vector=glove_embedding.get(word)
    if embedding_vector is not None:
        embedding_matrix[idx]=embedding_vector

In [18]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.074482  ,  0.58293003, -0.78233999, ..., -0.24984001,
        -0.096953  ,  0.66692001],
       [-0.35394999,  0.23051   , -0.62689   , ..., -0.20720001,
         0.52003002,  0.51129001],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [19]:
early_stop=EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reducelr=ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001)

# Keras Sequential Model Construction

In [20]:
model=Sequential()
model.add(Input(shape=(40,)))
model.add(Embedding(v+1,300,weights=[embedding_matrix],trainable=False))
model.add(LSTM(20,return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dense(256,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer=keras.optimizers.SGD(0.1,momentum=0.09),loss='binary_crossentropy',metrics=['accuracy'])




In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 300)           80627100  
                                                                 
 lstm (LSTM)                 (None, 40, 20)            25680     
                                                                 
 global_max_pooling1d (Glob  (None, 20)                0         
 alMaxPooling1D)                                                 
                                                                 
 dense (Dense)               (None, 256)               5376      
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 80658413 (307.69 MB)
Trainable params: 31313 (122.32 KB)
Non-trainable params: 80627100 (307.57 MB)
________

# Model Training and Evaluation

In [22]:
r=model.fit(train_text_pad,train_output,validation_data=(test_text_pad,test_output),
            epochs=20,batch_size=256,callbacks=[early_stop,reducelr])

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20


In [23]:
print('TESTING DATA CLASSIFICATION REPORT \n \n')
y_pred = (model.predict(test_text_pad) > 0.5).astype("int32")
print(classification_report(test_output,y_pred,
                            target_names=lbl_target.inverse_transform([0,1])))

print('TRAINING DATA CLASSIFICATION REPORT \n \n')
print(classification_report(train_output,(model.predict(train_text_pad)>0.5).astype("int32"),
                           target_names=lbl_target.inverse_transform([0,1])))

TESTING DATA CLASSIFICATION REPORT 
 

              precision    recall  f1-score   support

 non-suicide       0.92      0.93      0.92     23198
     suicide       0.93      0.92      0.92     23211

    accuracy                           0.92     46409
   macro avg       0.92      0.92      0.92     46409
weighted avg       0.92      0.92      0.92     46409

TRAINING DATA CLASSIFICATION REPORT 
 

              precision    recall  f1-score   support

 non-suicide       0.93      0.92      0.93     92814
     suicide       0.93      0.94      0.93     92822

    accuracy                           0.93    185636
   macro avg       0.93      0.93      0.93    185636
weighted avg       0.93      0.93      0.93    185636



In [32]:
twt = ['i am not feeling well and i want to be left alone and all my life is a waste']
twt = tokenizer.texts_to_sequences(twt)
twt = pad_sequences(twt, maxlen=50)

prediction = model.predict(twt)[0][0]
print(prediction)

if(prediction > 0.5):
    print("Potential Suicide Post")
else:
    print("Non Suicide Post")

0.8569623
Potential Suicide Post


In [25]:
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [26]:
model.save("model.h5")


You are saving your model as an HDF5 file via `model.save()`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')`.



In [27]:
token_form = pickle.load(open('tokenizer.pkl', 'rb'))

In [28]:
from keras.models import load_model

In [29]:
model_form = load_model("model.h5")

In [37]:
twt = ["i am an engineering student but have no job and certainly no money and i think i am not worth of anything in my life as i am a drug addict and i dont have money to buy drugs but i wish that my life was any better "]
twt = token_form.texts_to_sequences(twt)
twt = pad_sequences(twt, maxlen=40)


prediction = model_form.predict(twt)[0][0]
print(f"Prediction score: {prediction:.4f}")

if(prediction > 0.5):
    print(f"Potential Suicide Post (Confidence: {prediction:.4f})")
else:
    print(f"Non Suicide Post (Confidence: {prediction:.4f})")

Prediction score: 0.8021
Potential Suicide Post (Confidence: 0.8021)
