In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow_hub as hub

In [2]:
df = pd.read_csv('combined-selftext.csv')
#df.head()

In [3]:
def str_join(df, sep, *cols):
   ...:     from functools import reduce
   ...:     return reduce(lambda x, y: x.astype(str).str.cat(y.astype(str), sep=sep), 
   ...:                   [df[col] for col in cols])
   ...: 

In [4]:
df['text'] = str_join(df," ", 'title', 'usertext')

In [5]:
import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
STOPWORDS = STOPWORDS.union(set(['im', 'ive', 'ill', 'wa', 'ha', 'aint', 'thats', 'la', 'le', 'please', 'feel', 'rly', 'u', 'nan', 'emptypost']))

stop = STOPWORDS
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [6]:
del df['title']
del df['usertext']

In [7]:
df.head()

Unnamed: 0,y,text
0,0,need help hi know phrase situation try life go...
1,1,feeling overwhelmed hopeless depressed past co...
2,0,matter anymore getting worse hi know devastate...
3,1,tired hearing bullshit shit like better purpos...
4,0,wish wish prettier wish like burden wish broke...


In [7]:
df["is_suicide"] = df["y"].apply(lambda x: "depressed" if x < 1 else "suicidal")

In [8]:
suicidal_reddits = df[df.is_suicide == "suicidal"]
depressed_reddits = df[df.is_suicide == "depressed"]

In [9]:
#suicidal_df = suicidal_reddits.sample(n=len(depressed_reddits), random_state=RANDOM_SEED)
suicidal_df = suicidal_reddits
depressed_df = depressed_reddits

In [14]:
reddits_df = (pd.concat([suicidal_df, depressed_df]))

In [11]:
use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

In [16]:
train_reddits, test_reddits, y_train, y_test =\
  train_test_split(
    reddits_df.text, 
    reddits_df.is_suicide, 
    test_size=.2, 
    random_state=42
  )

In [17]:
X_train = []
for r in tqdm(train_reddits):
  emb = use(r)
  reddit_emb = tf.reshape(emb, [-1]).numpy()
  X_train.append(reddit_emb)

X_train = np.array(X_train)

100%|██████████| 1498/1498 [00:17<00:00, 83.49it/s] 


In [18]:
X_test = []
for r in tqdm(test_reddits):
  emb = use(r)
  reddit_emb = tf.reshape(emb, [-1]).numpy()
  X_test.append(reddit_emb)

X_test = np.array(X_test)

100%|██████████| 375/375 [00:04<00:00, 81.85it/s]


In [19]:
from sklearn.preprocessing import LabelEncoder
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

In [22]:
print(X_train.shape, X_test.shape)

(1498, 512) (375, 512)


In [21]:
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [22]:
#Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
MNB = Pipeline([('Normalizing',MinMaxScaler()),('MultinomialNB',MultinomialNB())])
MNB.fit(X_train, y_train)
predictions_MNB = MNB.predict(X_test)

print(classification_report(y_test, predictions_MNB))
print("MultinomialNB Accuracy Score:",accuracy_score(predictions_MNB, y_test)*100)
print("MultinomialNB Precision Score:",precision_score(predictions_MNB, y_test)*100)
print("MultinomialNB Recall Score:",recall_score(predictions_MNB, y_test)*100)
print("MultinomialNB F1-score Score:",f1_score(predictions_MNB, y_test)*100)

              precision    recall  f1-score   support

           0       0.67      0.71      0.69       182
           1       0.71      0.67      0.69       193

    accuracy                           0.69       375
   macro avg       0.69      0.69      0.69       375
weighted avg       0.69      0.69      0.69       375

MultinomialNB Accuracy Score: 68.8
MultinomialNB Precision Score: 66.83937823834198
MultinomialNB Recall Score: 70.87912087912088
MultinomialNB F1-score Score: 68.80000000000001


In [25]:
#Support Vector Machine

from sklearn import svm
SVM = svm.SVC()
SVM.fit(X_train, y_train)
predictions_SVM = SVM.predict(X_test)

#print("SVM Accuracy Score:",accuracy_score(predictions_SVM, y_test)*100)
print(classification_report(y_test, predictions_SVM))
print("SVM Accuracy Score:",accuracy_score(predictions_SVM, y_test)*100)
print("SVM Precision Score:",precision_score(predictions_SVM, y_test)*100)
print("SVM Recall Score:",recall_score(predictions_SVM, y_test)*100)
print("SVM F1-score Score:",f1_score(predictions_SVM, y_test)*100)

              precision    recall  f1-score   support

           0       0.71      0.65      0.68       182
           1       0.70      0.75      0.72       193

    accuracy                           0.70       375
   macro avg       0.70      0.70      0.70       375
weighted avg       0.70      0.70      0.70       375

SVM Accuracy Score: 70.39999999999999
SVM Precision Score: 75.12953367875647
SVM Recall Score: 69.71153846153845
SVM F1-score Score: 72.31920199501246


In [26]:
#Random Forest

from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier()
RF.fit(X_train, y_train)
predictions_RF = RF.predict(X_test)

print(classification_report(y_test, predictions_RF))
print("RF Accuracy Score:",accuracy_score(predictions_RF, y_test)*100)
print("RF Precision Score:",precision_score(predictions_RF, y_test)*100)
print("RF Recall Score:",recall_score(predictions_RF, y_test)*100)
print("RF F1-score Score:",f1_score(predictions_RF, y_test)*100)

              precision    recall  f1-score   support

           0       0.69      0.67      0.68       182
           1       0.70      0.71      0.70       193

    accuracy                           0.69       375
   macro avg       0.69      0.69      0.69       375
weighted avg       0.69      0.69      0.69       375

RF Accuracy Score: 69.06666666666666
RF Precision Score: 70.98445595854922
RF Recall Score: 69.54314720812182
RF F1-score Score: 70.25641025641026


In [24]:
#Logistic Regression

from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)
predictions_LR = LR.predict(X_test)

print(classification_report(y_test, predictions_LR))
print("Logistic Regression Accuracy Score:",accuracy_score(predictions_LR, y_test)*100)
print("Logistic Regression Precision Score:",precision_score(predictions_LR, y_test)*100)
print("Logistic Regression Recall Score:",recall_score(predictions_LR, y_test)*100)
print("Logistic Regression F1-score Score:",f1_score(predictions_LR, y_test)*100)

              precision    recall  f1-score   support

           0       0.71      0.69      0.70       182
           1       0.72      0.74      0.73       193

    accuracy                           0.71       375
   macro avg       0.71      0.71      0.71       375
weighted avg       0.71      0.71      0.71       375

Logistic Regression Accuracy Score: 71.46666666666667
Logistic Regression Precision Score: 73.57512953367875
Logistic Regression Recall Score: 71.71717171717171
Logistic Regression F1-score Score: 72.63427109974423


In [28]:
#Decision Tree

from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)
predictions_DT = DT.predict(X_test)

print(classification_report(y_test, predictions_DT))
print("Decision Tree Accuracy Score:",accuracy_score(predictions_DT, y_test)*100)
print("Decision Tree Precision Score:",precision_score(predictions_DT, y_test)*100)
print("Decision Tree Recall Score:",recall_score(predictions_DT, y_test)*100)
print("Decision Tree f1 Score:",f1_score(predictions_DT, y_test)*100)

              precision    recall  f1-score   support

           0       0.56      0.51      0.53       182
           1       0.57      0.62      0.59       193

    accuracy                           0.57       375
   macro avg       0.56      0.56      0.56       375
weighted avg       0.56      0.57      0.56       375

Decision Tree Accuracy Score: 56.53333333333334
Decision Tree Precision Score: 61.6580310880829
Decision Tree Recall Score: 57.21153846153846
Decision Tree f1 Score: 59.35162094763092


In [27]:
#Light Gradient Boosting

!pip install lightgbm
import lightgbm as lgb
LGB = lgb.LGBMClassifier()
LGB.fit(X_train, y_train)
predictions_LGB = LGB.predict(X_test)

print(classification_report(y_test, predictions_LGB))
print("LGB Accuracy Score:",accuracy_score(predictions_LGB, y_test)*100)
print("LGB Precision Score:",precision_score(predictions_LGB, y_test)*100)
print("LGB Recall Score:",recall_score(predictions_LGB, y_test)*100)
print("LGB F1-score Score:",f1_score(predictions_LGB, y_test)*100)

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


              precision    recall  f1-score   support

           0       0.69      0.63      0.66       182
           1       0.67      0.73      0.70       193

    accuracy                           0.68       375
   macro avg       0.68      0.68      0.68       375
weighted avg       0.68      0.68      0.68       375

LGB Accuracy Score: 68.0
LGB Precision Score: 73.05699481865285
LGB Recall Score: 67.46411483253588
LGB F1-score Score: 70.1492537313433


In [38]:
#!pip install mlxtend

In [32]:
from mlxtend.classifier import StackingClassifier
clf_stack = StackingClassifier(classifiers =[LGB, XGB, RF, SVM], meta_classifier = LR)
model_stack = clf_stack.fit(X_train, y_train)
pred_stack = model_stack.predict(X_test)

acc_stack = accuracy_score(y_test, pred_stack)
precision_stack = precision_score(y_test, pred_stack)
recall_stack = recall_score(y_test, pred_stack)
f1_stack = f1_score(y_test, pred_stack)# evaluating accuracy
print('accuracy score of Stacked model:', acc_stack * 100)
print('precision score of Stacked model:', precision_stack * 100)
print('recall score of Stacked model:', recall_stack * 100)
print('f1 score of Stacked model:', f1_stack * 100)
print(classification_report(y_test,pred_stack))

accuracy score of Stacked model: 68.53333333333333
precision score of Stacked model: 68.29268292682927
recall score of Stacked model: 72.53886010362694
f1 score of Stacked model: 70.35175879396985
              precision    recall  f1-score   support

           0       0.69      0.64      0.66       182
           1       0.68      0.73      0.70       193

    accuracy                           0.69       375
   macro avg       0.69      0.68      0.68       375
weighted avg       0.69      0.69      0.68       375

