In [1]:
import pandas as pd
import numpy as np
import gensim
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras.layers import Dense, Flatten, MaxPooling1D, Dropout, Conv1D, Input, LSTM, Bidirectional 

In [2]:
df = pd.read_csv('combined-selftext.csv')
#df.head()

In [3]:
def str_join(df, sep, *cols):
   ...:     from functools import reduce
   ...:     return reduce(lambda x, y: x.astype(str).str.cat(y.astype(str), sep=sep), 
   ...:                   [df[col] for col in cols])
   ...: 

In [4]:
df['text'] = str_join(df," ", 'title', 'usertext')

In [5]:
df['text'] = str_join(df," ", 'title', 'usertext')
del df['title']
del df['usertext']
df.rename(columns = {'y':'is_suicide'}, inplace = True)

In [6]:
df['text_clean'] = df['text'].apply(lambda x: gensim.utils.simple_preprocess(x))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['text_clean'], df['is_suicide'], test_size=0.2)

In [8]:
w2v_model = gensim.models.Word2Vec(df['text_clean'],
                                   vector_size=300,
                                   epochs=20,
                                   window=10,
                                   min_count=70)

In [9]:
words = set(w2v_model.wv.index_to_key)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


In [10]:
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(300, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(300, dtype=float))

In [11]:
from sklearn.preprocessing import LabelEncoder
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

In [12]:
X_train_vect_avg = np.array(X_train_vect_avg)
X_test_vect_avg = np.array(X_test_vect_avg)

In [13]:
print(X_train_vect_avg.shape, X_test_vect_avg.shape)

(1498, 300) (375, 300)


In [14]:
print(y_train.shape, y_test.shape)

(1498,) (375,)


In [15]:
#LSTM

model10 = keras.Sequential()
model10.add(keras.layers.Input(shape=(300, 1)))
model10.add(keras.layers.LSTM(100, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
model10.add(keras.layers.MaxPooling1D(pool_size = 2))
model10.add(keras.layers.Flatten())
model10.add(keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'))
model10.add(keras.layers.Dense(1, activation='sigmoid'))
model10.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['accuracy',
                              tf.keras.metrics.Precision(),
                              tf.keras.metrics.Recall(),
                              tfa.metrics.F1Score(num_classes=1)])
model10.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 300, 100)          40800     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 150, 100)         0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 15000)             0         
                                                                 
 dense (Dense)               (None, 64)                960064    
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 1,000,929
Trainable params: 1,000,929
Non-trainable params: 0
______________________________________________

In [18]:
history11 = model10.fit(
    X_train_vect_avg, y_train, 
    epochs=10, 
    batch_size=32, 
    validation_split=0.2, 
    verbose=1, 
    shuffle=True
)
model10.evaluate(X_test_vect_avg, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.5877171754837036,
 0.7066666483879089,
 0.7441860437393188,
 0.6597937941551208,
 array([0.6818981], dtype=float32)]

In [16]:
#BiLSTM

model3 = keras.Sequential()

model3.add(keras.layers.Input(shape=(300, 1)))
model3.add(keras.layers.Bidirectional(LSTM(100, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)))
model3.add(keras.layers.MaxPooling1D(pool_size = 2))
model3.add(keras.layers.Flatten())
model3.add(keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'))
model3.add(keras.layers.Dense(1, activation='sigmoid'))
model3.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['accuracy',
                              tf.keras.metrics.Precision(),
                              tf.keras.metrics.Recall(),
                              tfa.metrics.F1Score(num_classes=1)])
model3.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (None, 300, 200)         81600     
 l)                                                              
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 150, 200)         0         
 1D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 30000)             0         
                                                                 
 dense_2 (Dense)             (None, 64)                1920064   
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2,001,729
Trainable params: 2,001,729
No

In [29]:
history3 = model3.fit(
    X_train_vect_avg, y_train, 
    epochs=10, 
    batch_size=32, 
    validation_split=0.2, 
    verbose=1, 
    shuffle=True
)
model3.evaluate(X_test_vect_avg, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6050381064414978,
 0.6613333225250244,
 0.676616907119751,
 0.6868686676025391,
 array([0.69109946], dtype=float32)]

In [17]:
#LSTM + CNN

model10 = keras.Sequential()

model10.add(keras.layers.Input(shape=(300, 1)))
model10.add(keras.layers.LSTM(100, return_sequences=True, dropout=0.5, recurrent_dropout=0.5))
model10.add(keras.layers.Conv1D(5, (2,), padding='same', activation='relu'))
model10.add(keras.layers.MaxPooling1D(pool_size = 2))
model10.add(keras.layers.Flatten())
model10.add(keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'))
model10.add(keras.layers.Dense(1, activation='sigmoid'))
model10.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['accuracy',
                              tf.keras.metrics.Precision(),
                              tf.keras.metrics.Recall(),
                              tfa.metrics.F1Score(num_classes=1)])
model10.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 300, 100)          40800     
                                                                 
 conv1d (Conv1D)             (None, 300, 5)            1005      
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 150, 5)           0         
 1D)                                                             
                                                                 
 flatten_2 (Flatten)         (None, 750)               0         
                                                                 
 dense_4 (Dense)             (None, 64)                48064     
                                                                 
 dense_5 (Dense)             (None, 1)                 65        
                                                      

In [23]:
history10 = model10.fit(
    X_train_vect_avg, y_train, 
    epochs=10, 
    batch_size=32, 
    validation_split=0.2, 
    verbose=1, 
    shuffle=True
)
model10.evaluate(X_test_vect_avg, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6665661334991455,
 0.6240000128746033,
 0.6059479713439941,
 0.8232323527336121,
 array([0.69109946], dtype=float32)]

In [18]:
#BiLSTM + CNN

model6 = keras.Sequential()

model6.add(keras.layers.Input(shape=(300, 1)))
model6.add(keras.layers.Bidirectional(LSTM(100, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)))
model6.add(keras.layers.Conv1D(5, (2,), padding='same', activation='relu'))
model6.add(keras.layers.MaxPooling1D(pool_size = 2))
model6.add(keras.layers.Flatten())
model6.add(keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'))
model6.add(keras.layers.Dense(1, activation='sigmoid'))
model6.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(), metrics=['accuracy',
                              tf.keras.metrics.Precision(),
                              tf.keras.metrics.Recall(),
                              tfa.metrics.F1Score(num_classes=1)])
model6.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_1 (Bidirectio  (None, 300, 200)         81600     
 nal)                                                            
                                                                 
 conv1d_1 (Conv1D)           (None, 300, 5)            2005      
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 150, 5)           0         
 1D)                                                             
                                                                 
 flatten_3 (Flatten)         (None, 750)               0         
                                                                 
 dense_6 (Dense)             (None, 64)                48064     
                                                                 
 dense_7 (Dense)             (None, 1)                

In [31]:
history11 = model6.fit(
    X_train_vect_avg, y_train, 
    epochs=10, 
    batch_size=32, 
    validation_split=0.2, 
    verbose=1, 
    shuffle=True
)
model6.evaluate(X_test_vect_avg, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.6271308660507202,
 0.6639999747276306,
 0.6800000071525574,
 0.6868686676025391,
 array([0.69109946], dtype=float32)]