<a href="https://colab.research.google.com/github/Nikhileswar-Komati/Suicide_Ideation/blob/master/S_VS_T_USE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp==2.4.5


In [2]:
import sparknlp

spark = sparknlp.start()

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version", spark.version)

Spark NLP version:  2.4.5
Apache Spark version 2.4.4


In [3]:
os.environ['KAGGLE_USERNAME'] = "nikhileswarkomati"
os.environ['KAGGLE_KEY'] = "001b3a30170775e55950edb6ff0c9b17"
!kaggle datasets download -d nikhileswarkomati/suicide-watch

Downloading suicide-watch.zip to /content
 93% 107M/115M [00:01<00:00, 68.9MB/s] 
100% 115M/115M [00:01<00:00, 82.0MB/s]


In [4]:
!unzip '/content/suicide-watch.zip'

Archive:  /content/suicide-watch.zip
  inflating: SuicideAndDepression_Detection.csv  


In [5]:
spark_df = spark.read.option("header", "true").option("multiLine", "true").option("quote", "\"").option("escape", "\"").option("inferSchema", "true").csv('/content/SuicideAndDepression_Detection.csv', sep = ',')

In [7]:
spark_split_df = spark_df.filter("class != 'depression'")

(train_data, test_data, val_data) = spark_split_df.randomSplit([0.03, 0.02, 0.9], 24)

print("Train length", train_data.count())
print("Test length", test_data.count())
print("validation length", val_data.count())

Train length 7470
Test length 4901
validation length 219703


In [8]:
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [10]:
document = DocumentAssembler().setInputCol("text").setOutputCol("document")

use = UniversalSentenceEncoder.pretrained() \
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

# the classes/labels/categories are in category column
sentimentdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("output")\
  .setLabelColumn("class")\
  .setMaxEpochs(20)\
  .setEnableOutputLogs(True)

pipeline = Pipeline(
    stages = [
        document,
        use,
        sentimentdl
    ])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [11]:
pipelineModel = pipeline.fit(train_data)

In [12]:
!cd ~/annotator_logs && ls -l

total 4
-rw-r--r-- 1 root root 1797 May  8 09:42 ClassifierDLApproach_09856eefc1aa.log


In [13]:
!cat ~/annotator_logs/ClassifierDLApproach_09856eefc1aa.log

Training started - total epochs: 20 - learning rate: 0.005 - batch size: 64 - training examples: 7470
Epoch 0/20 - 2.781973546%.2fs - loss: 49.50068 - accuracy: 0.9047449 - batches: 117
Epoch 1/20 - 2.402888786%.2fs - loss: 46.58053 - accuracy: 0.932276 - batches: 117
Epoch 2/20 - 2.443861267%.2fs - loss: 44.75196 - accuracy: 0.9373126 - batches: 117
Epoch 3/20 - 2.412652939%.2fs - loss: 44.307034 - accuracy: 0.94256586 - batches: 117
Epoch 4/20 - 2.494146079%.2fs - loss: 44.160168 - accuracy: 0.9451251 - batches: 117
Epoch 5/20 - 2.9063267%.2fs - loss: 44.207294 - accuracy: 0.9462027 - batches: 117
Epoch 6/20 - 2.385497076%.2fs - loss: 44.146 - accuracy: 0.947415 - batches: 117
Epoch 7/20 - 2.358323381%.2fs - loss: 43.81656 - accuracy: 0.9484926 - batches: 117
Epoch 8/20 - 2.449807016%.2fs - loss: 43.225487 - accuracy: 0.94889665 - batches: 117
Epoch 9/20 - 2.383400265%.2fs - loss: 43.59375 - accuracy: 0.948762 - batches: 117
Epoch 10/20 - 2.383459633%.2fs - loss: 44.503136 - accuracy

In [14]:
df = pipelineModel.transform(test_data)

In [15]:
df.show(5)

+--------------------+------------+--------------------+--------------------+--------------------+
|                text|       class|            document| sentence_embeddings|              output|
+--------------------+------------+--------------------+--------------------+--------------------+
|"Are you afraid t...|   teenagers|[[document, 0, 22...|[[sentence_embedd...|[[category, 0, 22...|
|"Aren't you so gl...|SuicideWatch|[[document, 0, 43...|[[sentence_embedd...|[[category, 0, 43...|
|"Drinking Sprite ...|   teenagers|[[document, 0, 94...|[[sentence_embedd...|[[category, 0, 94...|
|"Friendly" and ca...|SuicideWatch|[[document, 0, 12...|[[sentence_embedd...|[[category, 0, 12...|
|"I can't breathe....|   teenagers|[[document, 0, 76...|[[sentence_embedd...|[[category, 0, 76...|
+--------------------+------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [16]:
from sklearn.metrics import accuracy_score, classification_report

df = pipelineModel.transform(test_data).select('text', 'class', 'output.result').toPandas()
df['result'] = df['result'].apply(lambda x: x[0])
print(accuracy_score(df['result'], df['class']))
print(classification_report(df['result'], df['class']))

0.9251173229953071
              precision    recall  f1-score   support

SuicideWatch       0.93      0.92      0.92      2437
   teenagers       0.92      0.93      0.93      2464

    accuracy                           0.93      4901
   macro avg       0.93      0.93      0.93      4901
weighted avg       0.93      0.93      0.93      4901

