In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tqdm import tqdm
import tensorflow_hub as hub
import tensorflow_text

# A single fixed seed is pushed into NumPy's and TensorFlow's global random
# generators. RANDOM_SEED is also reused further down as the `random_state`
# of the train/test split.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [2]:
# Load the raw posts. Later cells read the `title`, `usertext` and `y`
# columns of this frame.
df = pd.read_csv('combined-selftext.csv')
#df.head()

In [3]:
def str_join(df, sep, *cols):
    """Concatenate several columns of ``df`` row-wise into one string Series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to combine.
    sep : str
        Separator placed between consecutive column values.
    *cols : str
        Names of the columns to join, in output order. At least one is
        required.

    Returns
    -------
    pandas.Series
        One joined value per row. Every column is cast with ``astype(str)``
        before joining so that non-string columns can be concatenated; this
        also applies when only a single column is given.

    Raises
    ------
    TypeError
        If no column name is supplied.
    """
    if not cols:
        raise TypeError("str_join() requires at least one column name")

    # Start from the first column and append the remaining ones left to right.
    joined = df[cols[0]].astype(str)
    for col in cols[1:]:
        joined = joined.str.cat(df[col].astype(str), sep=sep)
    return joined

In [4]:
# Merge each post's title and body into one space-separated `text` column.
# This reads `title` and `usertext`, so it has to run before the cell that
# deletes those two columns.
df['text'] = str_join(df," ", 'title', 'usertext')

In [5]:
# `title` and `usertext` have been folded into `text`, so the originals are
# removed. The deletion is in place: running this cell a second time raises
# KeyError because the columns are already gone.
del df['title']
del df['usertext']

In [6]:
import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS

# gensim's stock stopword set, extended with extra tokens specific to this
# corpus. The imported name is rebound to the enlarged set.
STOPWORDS = STOPWORDS | {
    'im', 'ive', 'ill', 'wa', 'ha', 'aint', 'thats', 'la', 'le',
    'please', 'feel', 'rly', 'u', 'nan', 'emptypost',
}
stop = STOPWORDS

# Split each post on whitespace, discard every token that is an exact
# (case-sensitive) member of the stopword set, and glue the survivors back
# together with single spaces.
df['text'] = df['text'].map(
    lambda post: ' '.join(token for token in post.split() if token not in stop)
)

In [8]:
# Text class label derived from `y`: values below 1 become "depressed",
# everything else becomes "suicidal".
df["is_suicide"] = np.where(df["y"] < 1, "depressed", "suicidal")

In [9]:
# One frame per class, selected by the text label.
suicidal_reddits = df.loc[df["is_suicide"] == "suicidal"]
depressed_reddits = df.loc[df["is_suicide"] == "depressed"]

In [10]:
# Down-sampling the suicidal posts to the size of the depressed class is
# currently switched off (the line is kept for reference); both classes are
# passed on at their full size.
#suicidal_df = suicidal_reddits.sample(n=len(depressed_reddits), random_state=RANDOM_SEED)
suicidal_df = suicidal_reddits
depressed_df = depressed_reddits

In [11]:
# Stack the two class frames into the modelling set: suicidal rows first,
# then depressed rows. The original row index is kept unchanged.
reddits_df = (pd.concat([suicidal_df, depressed_df]))

In [None]:
# One-off setup: unpack the downloaded Universal Sentence Encoder archive into
# the directory that is later handed to `hub.load`.
import tarfile

# The `with` block closes the archive even if extraction fails part-way.
# Every backslash in the target path is escaped, so the literal no longer
# relies on the invalid `\G` escape sequence being passed through unchanged.
# NOTE(review): extractall() trusts the member paths stored inside the
# archive; only run this on an archive obtained from a trusted source.
with tarfile.open('universal-sentence-encoder-multilingual-large_3.tar.gz') as archive:
    archive.extractall("C:\\Users\\user\\SD\\GUSE\\GUSE3")

In [12]:
# Load the sentence encoder from the local extraction directory. `use` is
# called on individual posts further down to obtain their embeddings.
use = hub.load("C:\\Users\\user\\SD\\GUSE\\GUSE3")

In [13]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the string class labels into a dense array with one column
# per class (the encoder orders automatically detected categories by sorted
# value).
#
# The encoder's default sparse result is densified with .toarray() instead of
# requesting dense output through `sparse=False`: that keyword was renamed to
# `sparse_output` and later removed from scikit-learn, so passing it ties the
# notebook to old releases.
type_one_hot = OneHotEncoder().fit_transform(
    reddits_df.is_suicide.to_numpy().reshape(-1, 1)
).toarray()

In [14]:
# `train_test_split` was used without ever being imported, which raises
# NameError on a fresh kernel; the import lives with its only call site.
from sklearn.model_selection import train_test_split

# Hold out 20% of the posts (and their one-hot labels) for testing. The
# split is seeded with RANDOM_SEED.
train_reddits, test_reddits, y_train, y_test = train_test_split(
    reddits_df.text,
    type_one_hot,
    test_size=.2,
    random_state=RANDOM_SEED,
)

In [15]:
# Embed the training posts one at a time: each encoder result is flattened to
# a 1-D vector, and the vectors are stacked into an array with one row per
# post.
X_train = np.array(
    [tf.reshape(use(post), [-1]).numpy() for post in tqdm(train_reddits)]
)

100%|██████████| 1498/1498 [01:33<00:00, 16.00it/s]


In [16]:
# Embed the held-out posts exactly like the training posts: flatten each
# encoder result to a 1-D vector and stack one row per post.
X_test = np.array(
    [tf.reshape(use(post), [-1]).numpy() for post in tqdm(test_reddits)]
)

100%|██████████| 375/375 [00:23<00:00, 16.00it/s]


In [17]:
# Sanity check: each split should have one embedding row per post.
print(X_train.shape, X_test.shape)

(1498, 512) (375, 512)


In [18]:
# Sanity check: label rows must match the feature rows, one column per class.
print(y_train.shape, y_test.shape)

(1498, 2) (375, 2)


In [19]:
from tensorflow.keras.layers import Dense, Flatten, MaxPooling1D, Dropout, Conv1D, Input, LSTM, Bidirectional

In [25]:
#BiLSTM 100 + CNN

# Each embedding is fed to the network as a sequence of X_train.shape[1]
# steps carrying a single feature per step.
model = keras.Sequential([
    keras.layers.Input(shape=(X_train.shape[1], 1)),
    keras.layers.Bidirectional(
        keras.layers.LSTM(100, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)
    ),
    keras.layers.Conv1D(5, (2,), padding='same', activation='relu'),
    keras.layers.MaxPooling1D(pool_size=2),
    keras.layers.Flatten(),
    keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'),
    keras.layers.Dense(2, activation='sigmoid'),
])

# NOTE(review): two independent sigmoid outputs are trained with binary
# cross-entropy against one-hot targets. For mutually exclusive classes the
# usual pairing is softmax + categorical cross-entropy; confirm this is
# intended.
model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(),
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.F1Score(num_classes=2),
    ],
)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_1 (Bidirectio  (None, 512, 200)         81600     
 nal)                                                            
                                                                 
 conv1d_1 (Conv1D)           (None, 512, 5)            2005      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 256, 5)           0         
 1D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 1280)              0         
                                                                 
 dense_4 (Dense)             (None, 10)                12810     
                                                                 
 dense_5 (Dense)             (None, 2)                

In [26]:
# Train and score the network built in the preceding cell. That network is
# bound to `model`; the previous `model5` name is not defined anywhere in the
# notebook and fails on a fresh kernel.
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,  # part of the training data is held out for validation
    verbose=1,
    shuffle=True
)
# Displays [loss, accuracy, precision, recall, per-class F1] on the test split,
# following the order of the metrics passed to compile().
model.evaluate(X_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6622888445854187,
 0.6426666378974915,
 0.5563636422157288,
 0.8159999847412109,
 array([0.51094896, 0.7184874 ], dtype=float32)]

In [29]:
#BiLSTM 100

# Each embedding is fed to the network as a sequence of X_train.shape[1]
# steps carrying a single feature per step.
model = keras.Sequential([
    keras.layers.Input(shape=(X_train.shape[1], 1)),
    keras.layers.Bidirectional(
        keras.layers.LSTM(100, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)
    ),
    keras.layers.MaxPooling1D(pool_size=2),
    keras.layers.Flatten(),
    keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'),
    keras.layers.Dense(2, activation='sigmoid'),
])

# NOTE(review): sigmoid outputs with binary cross-entropy on one-hot targets;
# confirm this is preferred over softmax + categorical cross-entropy.
model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(),
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.F1Score(num_classes=2),
    ],
)
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_3 (Bidirectio  (None, 512, 200)         81600     
 nal)                                                            
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 256, 200)         0         
 1D)                                                             
                                                                 
 flatten_3 (Flatten)         (None, 51200)             0         
                                                                 
 dense_8 (Dense)             (None, 10)                512010    
                                                                 
 dense_9 (Dense)             (None, 2)                 22        
                                                                 
Total params: 593,632
Trainable params: 593,632
Non-tr

In [30]:
# Train and score the network built in the preceding cell. That network is
# bound to `model`; the previous `model5` name is not defined anywhere in the
# notebook and fails on a fresh kernel.
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,  # part of the training data is held out for validation
    verbose=1,
    shuffle=True
)
# Displays [loss, accuracy, precision, recall, per-class F1] on the test split,
# following the order of the metrics passed to compile().
model.evaluate(X_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.5925047397613525,
 0.690666675567627,
 0.6900269389152527,
 0.6826666593551636,
 array([0.65680474, 0.7184467 ], dtype=float32)]

In [35]:
#LSTM 100 + CNN

from tensorflow.keras.layers import Dense, Flatten, MaxPooling1D, Dropout, Conv1D, LSTM, Input

# Each embedding is fed to the network as a sequence of X_train.shape[1]
# steps carrying a single feature per step.
model = keras.Sequential([
    keras.layers.Input(shape=(X_train.shape[1], 1)),
    # return_sequences=True keeps the per-step outputs so the Conv1D layer
    # that follows receives the 3-D input it requires. The stray closing
    # parenthesis that made this cell a SyntaxError is gone as well.
    keras.layers.LSTM(100, return_sequences=True, dropout=0.5, recurrent_dropout=0.5),
    keras.layers.Conv1D(5, (2,), padding='same', activation='relu'),
    keras.layers.MaxPooling1D(pool_size=2),
    keras.layers.Flatten(),
    keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'),
    keras.layers.Dense(2, activation='sigmoid'),
])

# NOTE(review): sigmoid outputs with binary cross-entropy on one-hot targets;
# confirm this is preferred over softmax + categorical cross-entropy.
model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(),
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.F1Score(num_classes=2),
    ],
)
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_8 (LSTM)               (None, 512, 100)          40800     
                                                                 
 conv1d_4 (Conv1D)           (None, 512, 5)            1005      
                                                                 
 max_pooling1d_5 (MaxPooling  (None, 256, 5)           0         
 1D)                                                             
                                                                 
 flatten_5 (Flatten)         (None, 1280)              0         
                                                                 
 dense_12 (Dense)            (None, 10)                12810     
                                                                 
 dense_13 (Dense)            (None, 2)                 22        
                                                      

In [36]:
# Train and score the network built in the preceding cell. That network is
# bound to `model`; the previous `model10` name is not defined anywhere in the
# notebook and fails on a fresh kernel.
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,  # part of the training data is held out for validation
    verbose=1,
    shuffle=True
)
# Displays [loss, accuracy, precision, recall, per-class F1] on the test split,
# following the order of the metrics passed to compile().
model.evaluate(X_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.650529146194458,
 0.6639999747276306,
 0.6777777671813965,
 0.6506666541099548,
 array([0.6337209, 0.6896551], dtype=float32)]

In [38]:
#LSTM 100

# Each embedding is fed to the network as a sequence of X_train.shape[1]
# steps carrying a single feature per step.
model = keras.Sequential([
    keras.layers.Input(shape=(X_train.shape[1], 1)),
    keras.layers.LSTM(100, return_sequences=True, dropout=0.5, recurrent_dropout=0.5),
    keras.layers.MaxPooling1D(pool_size=2),
    keras.layers.Flatten(),
    keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'),
    keras.layers.Dense(2, activation='sigmoid'),
])

# NOTE(review): sigmoid outputs with binary cross-entropy on one-hot targets;
# confirm this is preferred over softmax + categorical cross-entropy.
model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(),
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfa.metrics.F1Score(num_classes=2),
    ],
)
# Summarise the network that was just built. The earlier `model10.summary()`
# pointed at a name this cell never defines.
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_10 (LSTM)              (None, 512, 100)          40800     
                                                                 
 max_pooling1d_7 (MaxPooling  (None, 256, 100)         0         
 1D)                                                             
                                                                 
 flatten_6 (Flatten)         (None, 25600)             0         
                                                                 
 dense_14 (Dense)            (None, 10)                256010    
                                                                 
 dense_15 (Dense)            (None, 2)                 22        
                                                                 
Total params: 296,832
Trainable params: 296,832
Non-trainable params: 0
_______________________________________________

In [39]:
# Train and score the network built in the preceding cell. That network is
# bound to `model`; the previous `model10` name is not defined anywhere in the
# notebook and fails on a fresh kernel.
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,  # part of the training data is held out for validation
    verbose=1,
    shuffle=True
)
# Displays [loss, accuracy, precision, recall, per-class F1] on the test split,
# following the order of the metrics passed to compile().
model.evaluate(X_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.603188157081604,
 0.6693333387374878,
 0.6581632494926453,
 0.6880000233650208,
 array([0.63742685, 0.6960784 ], dtype=float32)]