## SparkNLP tutorial

In [None]:
# Install the Spark NLP Python package (pinned to 3.4.3, from the johnsnowlabs channel)
# into the active conda environment.
!conda install -y -c johnsnowlabs spark-nlp==3.4.3
# Pack the conda environment into a tar archive so the installed packages can be
# distributed to the Spark workers (the archive is referenced by `spark.archives` below).
# NOTE(review): `-f` presumably overwrites an existing archive and `-o` names the output file — confirm against the conda-pack docs.
!conda pack -f -o base_conda_env.tar.gz

In [2]:
import os

# Maven coordinates of the packages to use, joined into the comma-separated
# form expected by `--packages`.
spark_packages = ",".join(
    [
        "io.delta:delta-core_2.12:1.1.0",
        "org.apache.hadoop:hadoop-aws:3.3.1",
        "com.johnsnowlabs.nlp:spark-nlp-spark32_2.12:3.4.3",  # for sparknlp usage
    ]
)

# Environment variables are set before the Spark session is created below.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/default-java",  # comment out this entry for java8
        # Interpreter path is relative to the `environment` alias given in `spark.archives`.
        "PYSPARK_PYTHON": "./environment/bin/python",
        "PYSPARK_SUBMIT_ARGS": f'--packages "{spark_packages}" pyspark-shell',
    }
)

import pyspark
from delta import configure_spark_with_delta_pip

namespace = os.environ["NAMESPACE"]  # usually "firstname-lastname"
notebook_name = os.environ["NOTEBOOK_NAME"]  # might be helpful

# Spark properties applied to the session builder, in order.
spark_conf = {
    # Required when you want to use your installed packages on the Spark workers.
    "spark.archives": "base_conda_env.tar.gz#environment",
    # Use the built-in authentication for S3.
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider",
    # --- Optional: local volume for disk spilling -------------------------------------
    # The `spark.kubernetes.executor.volumes.persistentVolumeClaim` properties specify a
    # local volume that provides more storage space for disk spilling.
    # If not needed, leave them commented out (or remove them completely).
    # Usually only the volume size under `sizeLimit` has to be adjusted.
    # "spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.options.claimName": "OnDemand",
    # "spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.options.storageClass": "efs-csi",
    # "spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.options.sizeLimit": "100Gi",
    # "spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.mount.path": "/data",
    # "spark.kubernetes.executor.volumes.persistentVolumeClaim.spark-local-dir-1.mount.readOnly": "false",
    # --- Optional: node selection -----------------------------------------------------
    # The `spark.kubernetes.node.selector` properties specify which nodes to use for the
    # executors and in which Availability Zone (AZ). They need to be in the same zone.
    # "spark.kubernetes.node.selector.topology.ebs.csi.aws.com/zone": "eu-central-1a",
    # "spark.kubernetes.node.selector.plural.sh/scalingGroup": "xlarge-mem-optimized-on-demand",  # read "Node Groups for the Spark Executors"
    # --- Executor sizing --------------------------------------------------------------
    "spark.executor.instances": "2",  # number of executors
    "spark.executor.memory": "6g",  # memory per executor
    "spark.executor.cores": "1",  # cores per executor
    # "spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp-spark32_2.12:3.4.3",
}

builder = pyspark.sql.SparkSession.builder.appName(f"{namespace}-spark-app")
for conf_key, conf_value in spark_conf.items():
    builder = builder.config(conf_key, conf_value)

# Let the delta helper finish configuring the builder, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.johnsnowlabs.nlp#spark-nlp-spark32_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-70cfd393-0564-4c39-a02e-2d3ebc7d5fb9;1.0
	confs: [default]


:: loading settings :: url = jar:file:/opt/conda/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found io.delta#delta-core_2.12;1.1.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
	found org.apache.hadoop#hadoop-aws;3.3.1 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.901 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found com.johnsnowlabs.nlp#spark-nlp-spark32_2.12;3.4.3 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8

In [3]:
import sparknlp

# Display the Spark and Spark NLP versions side by side as a quick sanity check.
(spark.version, sparknlp.version())

('3.2.1', '3.4.3')

## Using SparkNLP Models

In distributed settings, SparkNLP only supports loading files from distributed file systems (HDFS, S3, etc.).

So, we have to download the models from the [SparkNLP ModelHub](https://nlp.johnsnowlabs.com/models) and add them to the `s3://opengptx/sparknlp-models` folder.

Another point to note: since we are using the latest version of Spark, some of the models might not be supported by our deployment (for example, models that are only compatible with Spark 2.x).

If you select the [Models for SparkNLP version >= 3.4.x](https://nlp.johnsnowlabs.com/models?edition=Spark+NLP+3.4), the chances of them being compatible are very high.

In [None]:
from sparknlp.pretrained import PretrainedPipeline, PipelineModel

# Location of the saved pipeline in the shared S3 models folder (accessed via s3a).
model_path = "s3a://opengptx/sparknlp-models/onto_recognize_entities_electra_small_en_3.0.0_3.0_1616444187316"
pipeline_dl = PipelineModel.load(model_path)

In [5]:
# Example inputs: each row is a one-element list holding the raw text.
sentences = [
    [text]
    for text in (
        'SparkNLP is from John Snow Labs!',
        'Apple products are overpriced',
    )
]

In [6]:
# Build a DataFrame from the example rows, then name its single column "text".
sentences_df = spark.createDataFrame(sentences)
data = sentences_df.toDF("text")

In [7]:
# Run the loaded pipeline over the input DataFrame to obtain the annotated DataFrame.
annotations_df = pipeline_dl.transform(data)

In [8]:
# Print a preview of the annotated DataFrame; long cell values are shortened in this view.
annotations_df.show()

[Stage 22:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|            entities|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|SparkNLP is from ...|[{document, 0, 31...|[{document, 0, 31...|[{token, 0, 7, Sp...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 0, 7, Sp...|
|Apple products ar...|[{document, 0, 28...|[{document, 0, 28...|[{token, 0, 4, Ap...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 0, 4, Ap...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



                                                                                

In [9]:
# Print only the `entities` column; truncate=False shows the full cell content.
annotations_df.select("entities").show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|entities                                                                                                                                                   |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+
|[{chunk, 0, 7, SparkNLP, {entity -> ORG, sentence -> 0, chunk -> 0}, []}, {chunk, 17, 31, John Snow Labs!, {entity -> ORG, sentence -> 0, chunk -> 1}, []}]|
|[{chunk, 0, 4, Apple, {entity -> ORG, sentence -> 0, chunk -> 0}, []}]                                                                                     |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------+

