## Random Forest classifier for suicide detection in texts 
## (Subsampled Dataset)
##### This notebook addresses the imbalanced dataset (after preprocessing and cleaning) by randomly subsampling the majority class
##### Class labels: 0 = Non-Suicide, 1 = Suicide

#### Note: Hyperparameter optimization is done using GridSearchCV

#### Import dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#### Read data

In [15]:
# Load the preprocessed corpus; the path is relative to the notebook's working directory.
data_path = '../suicide_detection_final_cleaned.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,text,class,cleaned_text
0,Ex Wife Threatening SuicideRecently I left my ...,suicide,ex wife threaten suiciderecently leave wife go...
1,Am I weird I don't get affected by compliments...,non-suicide,weird not affect compliment come know irl feel...
2,Finally 2020 is almost over... So I can never ...,non-suicide,finally hear bad year swear fucking god annoying
3,i need helpjust help me im crying so hard,suicide,need helpjust help cry hard
4,It ends tonight.I can’t do it anymore. \nI quit.,suicide,end tonight not anymore quit


#### Checking the class balance

In [16]:
# Background reading on imbalanced classification:
# https://machinelearningmastery.com/what-is-imbalanced-classification/
class_counts = df['class'].value_counts()
class_counts

class
non-suicide    107066
suicide         67902
Name: count, dtype: int64

#### Converting the class labels to integers (non-suicide = 0, suicide = 1)

In [17]:
# Encode the target as integers: 1 for 'suicide', 0 for every other value.
# Matching both the raw label and the already-encoded 1 keeps this cell
# idempotent: re-running it leaves the labels unchanged instead of
# collapsing every row to 0.
df['class'] = df['class'].isin(['suicide', 1]).astype('int64')
# Cast the cleaned text to str so every document passed on later is a string.
df['cleaned_text'] = df['cleaned_text'].astype('U')
df.dtypes

text            object
class            int64
cleaned_text    object
dtype: object

#### Random Subsampling

In [18]:
# Partition the frame by label: df_0 holds the class-0 rows, df_1 the class-1 rows.
class_column = df['class']
df_0 = df[class_column == 0]
df_1 = df[class_column == 1]
# Number of class-1 rows.
size = len(df_1)
size

67902

In [19]:
# Draw `size` class-0 rows at random (no replacement, fixed seed) so that both
# classes contribute the same number of rows, then stack the two subsets.
df_00 = df_0.sample(random_state=0, replace=False, n=size)
balanced_parts = [df_00, df_1]
subsample_df = pd.concat(balanced_parts)
# Sanity check: both classes should now show the same count.
balanced_counts = subsample_df['class'].value_counts()
balanced_counts

class
0    67902
1    67902
Name: count, dtype: int64

#### Division of the data for training and testing

In [9]:
# Hold out a test split using scikit-learn's default proportions.
# random_state pins the shuffle so the split (and every metric derived from it)
# is reproducible across kernel restarts; stratify keeps the class balance of
# subsample_df in both the training and the test portion.
X_train, X_test, y_train, y_test = train_test_split(
    subsample_df['cleaned_text'],
    subsample_df['class'],
    random_state=0,
    stratify=subsample_df['class'],
)

#### Vectorize Training Data

In [10]:
# Learn the vocabulary from the training documents only and encode them as
# token-count vectors.
v = CountVectorizer()
train_documents = X_train.values
X_train_count = v.fit_transform(train_documents)

#### Checking the cross validation metrics

In [None]:
# Random forest configuration: 100 gini trees, sqrt(n_features) candidates per
# split, each tree fitted on a bootstrap sample of 2/3 of the rows, with the
# out-of-bag score enabled.
# random_state fixes the bootstrap and feature sampling so the cross-validation
# and test metrics are reproducible on a fresh kernel.
forest_clf = RandomForestClassifier(
    n_estimators=100,
    criterion="gini",
    max_features="sqrt",
    bootstrap=True,
    max_samples=2/3,
    oob_score=True,
    random_state=0,
)
# Macro-averaged metrics evaluated with 5-fold cross-validation on the training split.
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
scores = cross_validate(forest_clf, X_train_count, y_train, scoring=scoring, cv=5)

In [None]:
# Mean of each cross-validation metric across the folds, rounded to 3 decimals.
# Each label names the metric actually printed (the recall and F1 rows were
# previously both labelled "Precision").
print(f'CV Training Accuracy:  {round(scores["test_accuracy"].mean(),3)}')
print(f'CV Training Precision: {round(scores["test_precision_macro"].mean(),3)}')
print(f'CV Training Recall:    {round(scores["test_recall_macro"].mean(),3)}')
print(f'CV Training F1:        {round(scores["test_f1_macro"].mean(),3)}')

#### Training the model

In [8]:
# Fit the forest on the full vectorized training split.
forest_clf.fit(X=X_train_count, y=y_train)

### Testing the model: X_test

In [9]:
# Encode the held-out documents with the already-fitted vectorizer
# (transform only, no refit), then predict their labels.
X_test_count = v.transform(X_test)
y_pred = forest_clf.predict(X=X_test_count)
y_pred

0.997348086507247

In [None]:
# Report each held-out metric rounded to 2 decimals, one line per metric.
# NOTE(review): the metric functions are called with their default averaging,
# whereas the cross-validation cell requests macro-averaged scorers — confirm
# the two sets of numbers are meant to be compared directly.
test_metrics = [
    ("Test Accuracy:  ", accuracy_score),
    ("Test Precision: ", precision_score),
    ("Test Recall:    ", recall_score),
    ("Test F1:        ", f1_score),
]
for label, metric in test_metrics:
    print(f"{label}{round(metric(y_test, y_pred), 2)}")

In [None]:
# Raw confusion matrix of true vs. predicted labels on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Plot the test-set confusion matrix, labelling the axes with the classes of the
# fitted forest. The cell previously referenced `nb_clf`, which is never defined
# in this notebook and raised NameError on a fresh kernel.
cm = confusion_matrix(y_test, y_pred, labels = forest_clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels = forest_clf.classes_)
disp.plot()
plt.show()

### Hyperparameter tuning with GridSearchCV