In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes, svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('combined-selftext.csv')

In [3]:
def str_join(df, sep, *cols):
   ...:     from functools import reduce
   ...:     return reduce(lambda x, y: x.astype(str).str.cat(y.astype(str), sep=sep), 
   ...:                   [df[col] for col in cols])
   ...: 

In [4]:
df['text'] = str_join(df," ", 'title', 'usertext')
del df['title']
del df['usertext']
df.rename(columns = {'y':'is_suicide'}, inplace = True)
#df.head()

In [5]:
import gensim
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
STOPWORDS = STOPWORDS.union(set(['im', 'ive', 'ill', 'wa', 'ha', 'aint', 'thats', 'la', 'le', 'please', 'feel', 'rly', 'u', 'nan', 'emptypost']))

stop = STOPWORDS
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [6]:
df['text_clean'] = df['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
#df.head()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['text_clean'], df['is_suicide'], test_size=0.2)

In [9]:
w2v_model = gensim.models.Word2Vec(df['text'],
                                   vector_size=300,
                                   epochs=20,
                                   window=10,
                                   min_count=70)

In [10]:
words = set(w2v_model.wv.index_to_key)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


In [11]:
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(300, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(300, dtype=float))

In [12]:
from sklearn.preprocessing import LabelEncoder
Encoder = LabelEncoder()
y_train = Encoder.fit_transform(y_train)
y_test = Encoder.fit_transform(y_test)

In [13]:
#Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
MNB = MultinomialNB()
p = Pipeline([('Normalizing',MinMaxScaler()),('MultinomialNB',MultinomialNB())])
p = p.fit(X_train_vect_avg, y_train) 
predictions_MNB = p.predict(X_test_vect_avg)
print("MultinomialNB Accuracy Score:",accuracy_score(predictions_MNB, y_test)*100)
print("MultinomialNB Precision Score:",precision_score(predictions_MNB, y_test)*100)
print("MultinomialNB Recall Score:",recall_score(predictions_MNB, y_test)*100)
print("MultinomialNB F1-score Score:",f1_score(predictions_MNB, y_test)*100)

MultinomialNB Accuracy Score: 65.33333333333333
MultinomialNB Precision Score: 61.57635467980296
MultinomialNB Recall Score: 70.62146892655367
MultinomialNB F1-score Score: 65.78947368421053


In [13]:
print(classification_report(y_test, predictions_MNB))

              precision    recall  f1-score   support

           0       0.62      0.62      0.62       176
           1       0.66      0.66      0.66       199

    accuracy                           0.64       375
   macro avg       0.64      0.64      0.64       375
weighted avg       0.64      0.64      0.64       375



In [14]:
#Support Vector Machine

SVM = svm.SVC()
SVM = SVM.fit(X_train_vect_avg, y_train)
predictions_SVM = SVM.predict(X_test_vect_avg)
print("SVM Accuracy Score:",accuracy_score(predictions_SVM, y_test)*100)
print("SVM Precision Score:",precision_score(predictions_SVM, y_test)*100)
print("SVM Recall Score:",recall_score(predictions_SVM, y_test)*100)
print("SVM F1-score Score:",f1_score(predictions_SVM, y_test)*100)

SVM Accuracy Score: 69.6
SVM Precision Score: 70.44334975369459
SVM Recall Score: 72.58883248730965
SVM F1-score Score: 71.50000000000001


In [15]:
print(classification_report(y_test,predictions_SVM))

              precision    recall  f1-score   support

           0       0.65      0.58      0.61       176
           1       0.66      0.73      0.69       199

    accuracy                           0.66       375
   macro avg       0.66      0.65      0.65       375
weighted avg       0.66      0.66      0.66       375



In [25]:
#Random Forest

from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier()
RF = RF.fit(X_train_vect_avg, y_train)
predictions_RF = RF.predict(X_test_vect_avg)
print("RF Accuracy Score:",accuracy_score(predictions_RF, y_test)*100)
print("RF Precision Score:",precision_score(predictions_RF, y_test)*100)
print("RF Recall Score:",recall_score(predictions_RF, y_test)*100)
print("RF F1-score Score:",f1_score(predictions_RF, y_test)*100)

RF Accuracy Score: 66.93333333333334
RF Precision Score: 66.99507389162561
RF Recall Score: 70.46632124352331
RF F1-score Score: 68.68686868686868


In [17]:
print(classification_report(y_test,predictions_RF))

              precision    recall  f1-score   support

           0       0.64      0.57      0.60       176
           1       0.65      0.72      0.68       199

    accuracy                           0.65       375
   macro avg       0.65      0.64      0.64       375
weighted avg       0.65      0.65      0.65       375



In [16]:
#Logistic Regression

from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR = LR.fit(X_train_vect_avg, y_train)
predictions_LR = LR.predict(X_test_vect_avg)
print("Logistic Regression Accuracy Score:",accuracy_score(predictions_LR, y_test)*100)
print("Logistic Regression Precision Score:",precision_score(predictions_LR, y_test)*100)
print("Logistic Regression Recall Score:",recall_score(predictions_LR, y_test)*100)
print("Logistic Regression F1-score Score:",f1_score(predictions_LR, y_test)*100)

Logistic Regression Accuracy Score: 68.26666666666667
Logistic Regression Precision Score: 69.95073891625616
Logistic Regression Recall Score: 71.0
Logistic Regression F1-score Score: 70.47146401985111


In [19]:
print(classification_report(y_test,predictions_LR))

              precision    recall  f1-score   support

           0       0.68      0.63      0.65       176
           1       0.69      0.73      0.71       199

    accuracy                           0.69       375
   macro avg       0.68      0.68      0.68       375
weighted avg       0.68      0.69      0.68       375



In [18]:
#Decision Tree

from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier()
DT.fit(X_train_vect_avg, y_train)
predictions_DT = DT.predict(X_test_vect_avg)
print("Decision Tree Accuracy Score:",accuracy_score(predictions_DT, y_test)*100)
print("Decision Tree Precision Score:",precision_score(predictions_DT, y_test)*100)
print("Decision Tree Recall Score:",recall_score(predictions_DT, y_test)*100)
print("Decision Tree F1-Score:",f1_score(predictions_DT, y_test)*100)

Decision Tree Accuracy Score: 57.599999999999994
Decision Tree Accuracy Score: 61.083743842364534
Decision Tree Accuracy Score: 60.78431372549019
Decision Tree Accuracy Score: 60.933660933660924


In [23]:
print(classification_report(y_test,predictions_DT))

              precision    recall  f1-score   support

           0       0.58      0.57      0.57       176
           1       0.62      0.63      0.63       199

    accuracy                           0.60       375
   macro avg       0.60      0.60      0.60       375
weighted avg       0.60      0.60      0.60       375



In [20]:
#Light Gradient Boosting

import lightgbm as lgb
LGB = lgb.LGBMClassifier()
LGB.fit(X_train_vect_avg, y_train)
predictions_LGB = LGB.predict(X_test_vect_avg)
print("LGB Accuracy Score:",accuracy_score(predictions_LGB, y_test)*100)
print("LGB Precision Score:",precision_score(predictions_LGB, y_test)*100)
print("LGB Recall Score:",recall_score(predictions_LGB, y_test)*100)
print("LGB F1-score Score:",f1_score(predictions_LGB, y_test)*100)

LGB Accuracy Score: 68.8
LGB Precision Score: 73.39901477832512
LGB Recall Score: 70.28301886792453
LGB F1-score Score: 71.80722891566266


In [29]:
print(classification_report(y_test,predictions_LGB))

              precision    recall  f1-score   support

           0       0.61      0.57      0.59       176
           1       0.64      0.68      0.66       199

    accuracy                           0.63       375
   macro avg       0.63      0.63      0.63       375
weighted avg       0.63      0.63      0.63       375



In [27]:
from mlxtend.classifier import StackingClassifier
clf_stack = StackingClassifier(classifiers =[LGB, XGB, RF, SVM], meta_classifier = LR)
model_stack = clf_stack.fit(X_train_vect_avg, y_train)
pred_stack = model_stack.predict(X_test_vect_avg)	 # predictions on test data using stacked model
acc_stack = accuracy_score(y_test, pred_stack)
precision_stack = precision_score(y_test, pred_stack)
recall_stack = recall_score(y_test, pred_stack)
f1_stack = f1_score(y_test, pred_stack) # evaluating accuracy
print('accuracy score of Stacked model:', acc_stack * 100)
print('precision score of Stacked model:', precision_stack * 100)
print('recall score of Stacked model:', recall_stack * 100)
print('f1 score of Stacked model:', f1_stack * 100)

accuracy score of Stacked model: 67.2
precision score of Stacked model: 70.0
recall score of Stacked model: 68.96551724137932
f1 score of Stacked model: 69.4789081885856


In [39]:
print(classification_report(y_test,pred_stack))

              precision    recall  f1-score   support

           0       0.65      0.57      0.60       176
           1       0.65      0.72      0.69       199

    accuracy                           0.65       375
   macro avg       0.65      0.65      0.65       375
weighted avg       0.65      0.65      0.65       375

