In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tqdm import tqdm
import tensorflow_hub as hub
import tensorflow_text

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

In [2]:
df = pd.read_csv('combined-selftext.csv')
#df.head()

In [3]:
def str_join(df, sep, *cols):
   ...:     from functools import reduce
   ...:     return reduce(lambda x, y: x.astype(str).str.cat(y.astype(str), sep=sep), 
   ...:                   [df[col] for col in cols])
   ...: 

In [4]:
df['text'] = str_join(df," ", 'title', 'usertext')

In [5]:
del df['title']
del df['usertext']

In [6]:
import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
STOPWORDS = STOPWORDS.union(set(['im', 'ive', 'ill', 'wa', 'ha', 'aint', 'thats', 'la', 'le', 'please', 'feel', 'rly', 'u', 'nan', 'emptypost']))

stop = STOPWORDS
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [11]:
df["is_suicide"] = df["y"].apply(lambda x: "depressed" if x < 1 else "suicidal")

In [13]:
suicidal_reddits = df[df.is_suicide == "suicidal"]
depressed_reddits = df[df.is_suicide == "depressed"]

In [19]:
#suicidal_df = suicidal_reddits.sample(n=len(depressed_reddits), random_state=RANDOM_SEED)
suicidal_df = suicidal_reddits
depressed_df = depressed_reddits

In [20]:
reddits_df = (pd.concat([suicidal_df, depressed_df]))

In [21]:
reddits_df.head()

Unnamed: 0,y,text,is_suicide
1,1,feeling overwhelmed hopeless depressed past co...,suicidal
3,1,tired hearing bullshit shit like better purpos...,suicidal
5,1,talk suicide weird question someplace talk wan...,suicidal
8,1,suicide note know pas know people know care wa...,suicidal
11,1,read talk year old girlfrend year talking gett...,suicidal


In [22]:
reddits_df.shape

(1873, 3)

In [24]:
# importing the "tarfile" module
#import tarfile

# open file
#file = tarfile.open('universal-sentence-encoder-multilingual-large_3.tar.gz')

# extracting file
#file.extractall("C:\\Users\\user\\SD\\GUSE\GUSE3")

#file.close()

In [25]:
use = hub.load("C:\\Users\\user\\SD\\GUSE\\GUSE3")

In [26]:
from sklearn.preprocessing import OneHotEncoder

type_one_hot = OneHotEncoder(sparse=False).fit_transform(
  reddits_df.is_suicide.to_numpy().reshape(-1, 1)
)

In [27]:
train_reddits, test_reddits, y_train, y_test =\
  train_test_split(
    reddits_df.text, 
    type_one_hot, 
    test_size=.2, 
    random_state=RANDOM_SEED
  )

In [28]:
X_train = []
for r in tqdm(train_reddits):
  emb = use(r)
  reddit_emb = tf.reshape(emb, [-1]).numpy()
  X_train.append(reddit_emb)

X_train = np.array(X_train)

100%|██████████| 1498/1498 [01:40<00:00, 14.94it/s]


In [29]:
X_test = []
for r in tqdm(test_reddits):
  emb = use(r)
  reddit_emb = tf.reshape(emb, [-1]).numpy()
  X_test.append(reddit_emb)

X_test = np.array(X_test)

100%|██████████| 375/375 [00:25<00:00, 14.76it/s]


In [30]:
print(X_train.shape, X_test.shape)

(1498, 512) (375, 512)


In [31]:
print(y_train.shape, y_test.shape)

(1498, 2) (375, 2)


In [32]:
X_train.shape[0]

1498

In [33]:
X_train.shape[1]

512

In [None]:
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv1D, Input

In [44]:
model = keras.Sequential()

model.add(keras.layers.Input(shape=(X_train.shape[1], 1)))
model.add(keras.layers.Conv1D(5, (2,), padding='same', activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(64, activation='relu', kernel_initializer='he_uniform'))
model.add(keras.layers.Dense(2, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(), 
                              metrics=['accuracy',
                              tf.keras.metrics.Precision(),
                              tf.keras.metrics.Recall(),
                              tfa.metrics.F1Score(num_classes=2)])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_3 (Conv1D)           (None, 512, 5)            15        
                                                                 
 dropout_3 (Dropout)         (None, 512, 5)            0         
                                                                 
 flatten_3 (Flatten)         (None, 2560)              0         
                                                                 
 dense_6 (Dense)             (None, 64)                163904    
                                                                 
 dense_7 (Dense)             (None, 2)                 130       
                                                                 
Total params: 164,049
Trainable params: 164,049
Non-trainable params: 0
_________________________________________________________________


In [45]:
history4 = model4.fit(
    X_train, y_train, 
    epochs=10, 
    batch_size=32, 
    validation_split=0.2, 
    verbose=1, 
    shuffle=True
)
model.evaluate(X_test, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.5421967506408691,
 0.746666669845581,
 0.75,
 0.7440000176429749,
 array([0.73684216, 0.7557841 ], dtype=float32)]