## Paper:
2021, Science Direct,
An ensemble deep learning technique for detecting suicidal ideation from posts in social media platforms;

Dataset: Twitter

In [None]:
# Install TensorFlow into the notebook runtime (no version is pinned here).
!pip install tensorflow

In [None]:
# Install the standalone Keras package into the notebook runtime (no version is pinned here).
!pip install keras

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount the user's Google Drive at /content/drive (relies on the google.colab helper package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.layers import *
from keras.models import *
from nltk.tokenize import word_tokenize
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import keras.backend as K
from keras.layers import Layer
import gensim

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings(action = 'ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Preprocessing function
def preprocessing(data_frame, remove_stopwords=True):
    """Clean the ``Text`` column and add ``Tokens`` / ``Length`` columns.

    Steps, in order: strip URLs, strip e-mail addresses / @-mentions, strip
    hashtags, replace remaining punctuation with spaces, lowercase, optionally
    drop English stop words, then tokenize with ``word_tokenize``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain a ``Text`` column. The frame is modified in place.
    remove_stopwords : bool, default True
        When True, whitespace-separated words found in NLTK's English
        stop-word list are removed from ``Text`` before tokenization.
        Pass False to keep every word.

    Returns
    -------
    pandas.DataFrame
        The same ``data_frame`` object with ``Text`` cleaned and two new
        columns: ``Tokens`` (list of tokens) and ``Length`` (token count).
    """
    text = data_frame["Text"]

    # The patterns below are regular expressions, so regex=True must be passed
    # explicitly: without it newer pandas treats them as literal strings and
    # nothing is removed.
    # Removing URLs within the tweets
    text = text.str.replace(r'\s*https?://\S+(\s+|$)', ' ', regex=True).str.strip()
    # Removing e-mails / mentions, hashtags and punctuation
    text = text.str.replace(r'\S*@\S*\s?', ' ', regex=True).str.strip()
    text = text.str.replace(r'#\S*\s?', ' ', regex=True).str.strip()
    text = text.str.replace(r'[^\w\s]+', ' ', regex=True).str.strip()

    # Lowercase text (also required for matching the lowercase stop-word list)
    text = text.str.lower()

    # Removing stop words: compare whole words (not characters) against a set
    # and keep the filtered result.
    if remove_stopwords:
        stop = set(stopwords.words('english'))
        text = text.map(
            lambda post: ' '.join(word for word in post.split() if word not in stop)
            if isinstance(post, str) else post  # leave missing values untouched
        )

    # Removing trailing whitespace / newline characters
    data_frame['Text'] = text.str.rstrip()

    # Tokenizing posts and counting the length of each post
    data_frame['Tokens'] = data_frame['Text'].map(lambda post: word_tokenize(str(post)))
    data_frame['Length'] = data_frame['Tokens'].map(len)

    return data_frame

In [None]:
# A custom attention layer
class Attention(Layer):
  """Additive attention over axis 1 of the input.

  For an input ``x`` the layer computes ``e = tanh(x . W + b)``, normalises
  ``e`` with a softmax along axis 1 and multiplies ``x`` by the resulting
  weights.

  Args:
    return_sequences: if True the weighted input is returned unchanged in
      shape; if False it is summed over axis 1.
    **kwargs: standard ``Layer`` keyword arguments (``name``, ``dtype``, ...).

  Note: the bias is created with shape ``(input_shape[1], 1)``, so the size of
  axis 1 must be known when the layer is built.
  # assumes x is (batch, timesteps, features) — TODO confirm against callers
  """

  def __init__(self, return_sequences=True, **kwargs):
    # Initialise the base Layer first so its attribute tracking is in place,
    # and forward kwargs so the usual Layer options keep working.
    super(Attention, self).__init__(**kwargs)
    self.return_sequences = return_sequences

  def build(self, input_shape):
    # One weight per input feature, one bias per position along axis 1.
    self.W = self.add_weight(name="att_weight", shape=(input_shape[-1], 1), initializer="normal")
    self.b = self.add_weight(name="att_bias", shape=(input_shape[1], 1), initializer="zeros")
    super(Attention, self).build(input_shape)

  def call(self, x):
    e = K.tanh(K.dot(x, self.W) + self.b)  # unnormalised scores
    a = K.softmax(e, axis=1)               # attention weights along axis 1
    output = x * a
    if self.return_sequences:
      return output
    return K.sum(output, axis=1)

  def get_config(self):
    # Expose the constructor argument so the layer can be re-created when a
    # model containing it is saved, loaded or cloned.
    config = super(Attention, self).get_config()
    config.update({"return_sequences": self.return_sequences})
    return config


In [None]:
## Preparing the data
# Positive class: suicidal tweets. Negative class: an equal number of Facebook messages.
Twitter_path = "/Datasets/Twitter/suicideTweetData_56000.csv"  ## positive samples
facebook_samples = "/Datasets/Facebook/100_000_samples.csv"  ## negative samples

## Read both sources; only as many Facebook rows as there are tweets are loaded
# NOTE(review): the displayed output contains sequences such as "shouldâ ve", which
# suggests the files may be UTF-8 rather than latin-1 — confirm the encoding.
df_t = pd.read_csv(Twitter_path, encoding='latin-1')
df_f = pd.read_csv(facebook_samples, encoding='latin-1', usecols=['message'], nrows=len(df_t['Tweet']))

## Build [text, label] records: label 1 for tweets, label 0 for Facebook messages
messages_list = [[tweet, 1] for tweet in df_t['Tweet']]
messages_list.extend([message, 0] for message in df_f['message'])
del df_f, df_t  # the raw frames are no longer needed

df = pd.DataFrame(messages_list, columns=['Text', 'Label'])
df = preprocessing(df)
df

Unnamed: 0,Text,Label,Tokens,Length
0,always made me feel worthless but all of a sud...,1,"[always, made, me, feel, worthless, but, all, ...",14
1,hope i die in my sleep tonite,1,"[hope, i, die, in, my, sleep, tonite]",7
2,my life sucks just want to give up and cry,1,"[my, life, sucks, just, want, to, give, up, an...",10
3,breed of queensland male who shouldâ ve alread...,1,"[breed, of, queensland, male, who, shouldâ, ve...",49
4,closer to killing myself everyday famsquad ð ...,1,"[closer, to, killing, myself, everyday, famsqu...",16
...,...,...,...,...
105227,once in a lifetime experience,0,"[once, in, a, lifetime, experience]",5
105228,love coldplay and chris martin,0,"[love, coldplay, and, chris, martin]",5
105229,chris martin is the bomb,0,"[chris, martin, is, the, bomb]",5
105230,love coldplay,0,"[love, coldplay]",2


In [None]:
# Class balance check: number of positive (1) then negative (0) samples
print(*(list(df['Label']).count(label) for label in (1, 0)))

52616 52616


# Word2Vec model

In [None]:
model_path = "/w2v_models/twitter10000/w2v_model_T10"

# Word2Vec embeddings learned from the tokenized posts (300-dimensional, context window of 5,
# every token kept because min_count is 1).
wd2vc_model = gensim.models.Word2Vec(
    sentences=df['Tokens'],
    size=300,
    window=5,
    min_count=1,
)
# Continue training on the same corpus for 10 more epochs, then persist the model.
wd2vc_model.train(
    df['Tokens'],
    total_examples=len(df['Tokens']),
    epochs=10,
)
wd2vc_model.save(model_path)

In [None]:
# model_path = "/w2v_models/twitter_facebook_CBOW/w2v_models_TF"
# wd2vc_model = Word2Vec.load(model_path)

In [None]:
# words most similar to a given word.
similar_words = wd2vc_model.wv.most_similar('kill')
print(similar_words)

In [None]:
## Build a {word: vector} lookup from the Word2Vec vocabulary.
# It is used further down to fill the embedding matrix fed to the Keras Embedding layer.

vocab = wd2vc_model.wv.vocab
print("The total number of words are : ", len(vocab))
vocab = list(vocab.keys())

word_vec_dict = {word: wd2vc_model.wv.get_vector(word) for word in vocab}
print("The no of key-value pairs : ", len(word_vec_dict))  # should equal the vocab size

# The longest post determines the padded sequence length.
max_length = max(df['Length'])
print("maximum length = ", max_length)

tok = Tokenizer()
tok.fit_on_texts(df['Tokens'])
vocab_size = len(tok.word_index) + 1  # +1 because Tokenizer ids start at 1; id 0 is left for padding
# Keras padding does not work on strings: every token is first mapped to its integer id
encd_rev = tok.texts_to_sequences(df['Tokens'])

embed_dim = 300  # must match the vector size the Word2Vec model was trained with
# Pad (at the end) every sequence to max_length ids
pad_rev = pad_sequences(encd_rev, maxlen=max_length, padding='post')
print("pad_shape = ", pad_rev.shape)
# pad_rev now has one row per post and max_length columns.

# Embedding matrix: row i holds the Word2Vec vector of the word whose Tokenizer id is i
embed_matrix = np.zeros(shape=(vocab_size, embed_dim))
print(embed_matrix.shape)
for word, i in tok.word_index.items():
    embed_vector = word_vec_dict.get(word)
    if embed_vector is not None:  # word is in the vocabulary learned by the w2v model
        embed_matrix[i] = embed_vector
        # words without a Word2Vec vector keep their all-zero row


In [None]:
# from sklearn import preprocessing
# pad_rev = preprocessing.normalize(pad_rev)
# pad_rev

In [None]:
## Splitting the data: 20% held out for testing, then 20% of the remainder for validation
X, X_test, y, y_test = train_test_split(
    pad_rev,
    df['Label'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=33, test_size=0.2)

# Report the feature / label shapes of each partition
for _split_name, _features, _labels in (
    ("Train = ", X_train, y_train),
    ("Test = ", X_test, y_test),
    ("Validation = ", X_val, y_val),
):
    print(_split_name, _features.shape, _labels.shape)


Train =  (67348, 1355) (67348,)
Test =  (21047, 1355) (21047,)
Validation =  (16837, 1355) (16837,)


# Implemented model

In [None]:
from keras.initializers import Constant

# Embedding (initialised from the Word2Vec matrix) -> Dropout -> LSTM -> Attention
# -> Conv1D -> MaxPooling -> Flatten -> 2-way softmax
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embed_dim,
        input_length=max_length,
        embeddings_initializer=Constant(embed_matrix),
    ),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.LSTM(units=100, return_sequences=True),
    Attention(return_sequences=True),
    tf.keras.layers.Conv1D(3, (8,), padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(2, activation='softmax'),  # 2 labels
])

model.summary()

In [None]:
# Training hyper-parameters
epochs = 10
batch_size = 32

# Integer class labels are used, hence the sparse categorical cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
r = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=batch_size,
    epochs=epochs,
)

## Plotting the performance

In [None]:
# Training vs. validation loss recorded in the fit history
for _curve in ('loss', 'val_loss'):
    plt.plot(r.history[_curve], label=_curve)
plt.legend()

In [None]:
# Training vs. validation accuracy recorded in the fit history
for _curve in ('accuracy', 'val_accuracy'):
    plt.plot(r.history[_curve], label=_curve)
plt.legend()

## Prediction

In [None]:
# Run the trained model on the held-out test split; each row of `pred` is the
# softmax output over the two classes.
pred = model.predict(x=X_test)
print(pred)

In [None]:
# Turn each row of class probabilities into a hard label: the index of the
# largest probability (the first one on ties). The result is kept as floats so
# it has the same dtype as the array previously filled row by row.
y_pred = np.argmax(pred, axis=1).astype(float)

print(y_pred)

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not by itself
# make `sklearn.metrics` available.
import sklearn.metrics

# Per-class precision / recall / F1 table
report = sklearn.metrics.classification_report(y_test, y_pred)
print(report)

# Headline scores on the test split
accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
precision = sklearn.metrics.precision_score(y_test, y_pred)
recall = sklearn.metrics.recall_score(y_test, y_pred)
f1score = sklearn.metrics.f1_score(y_test, y_pred)
print("-----------------------")
print(accuracy)
print(precision)
print(recall)
print(f1score)