In [None]:
# Install pyspark
! pip install --ignore-installed pyspark

# Install Spark NLP
! pip install --ignore-installed spark-nlp

In [4]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql import SparkSession

# Report which Spark NLP release the kernel actually imported.
nlp_version = sparknlp.version()
print("Spark NLP version", nlp_version)

Spark NLP version 3.1.2


To use the Merge Entities parameter, we need to set the `spark.executor.allowSparkContext` configuration to `true` when creating the Spark session.

In [5]:
# Build (or reuse) a local Spark session configured for Spark NLP.
spark = (
    SparkSession.builder
    .appName("SparkNLP")
    # Run locally, using every available core.
    .master("local[*]")
    .config("spark.driver.memory", "12G")
    # Kryo serialization with an enlarged buffer limit.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.driver.maxResultSize", "0")
    # Spark NLP is loaded from a jar at this relative path.
    .config("spark.jars", "jars/sparknlp.jar")
    # Needed by GraphExtraction's merge-entities option used later on.
    .config("spark.executor.allowSparkContext", "true")
    .getOrCreate()
)

# Last expression of the cell: display the session summary.
spark

In [6]:
from pyspark.sql.types import StringType

# One-sentence corpus used by the rest of the notebook.
text = ['Peter Parker is a nice lad and lives in New York']

# Build a single string-column DataFrame and name that column "text",
# which is the column the DocumentAssembler reads from.
string_frame = spark.createDataFrame(text, StringType())
data_set = string_frame.toDF("text")
data_set.show(truncate=False)

+------------------------------------------------+
|text                                            |
+------------------------------------------------+
|Peter Parker is a nice lad and lives in New York|
+------------------------------------------------+



# Graph Extraction

Graph Extraction will use pretrained POS, Dependency Parser, and Typed Dependency Parser annotators when the pipeline does not define them.

In [7]:
# Wrap the raw "text" column into Spark NLP document annotations.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Default pretrained word embeddings (downloaded on first use).
word_embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# Default pretrained NER model; it consumes the embeddings produced above.
ner_tagger = (
    NerDLModel.pretrained()
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


When ExplodeEntities is set to true, Graph Extraction will find paths between all possible pairs of entities.

Since this sentence only has two entities, it will display the paths between PER and LOC. Each pair of entities will have a left path and a right path. By default, the paths start from the root of the dependency tree, which in this case is the token *lad*:
* Left path: lad-PER, will output the path between lad and Peter Parker
* Right path: lad-LOC, will output the path between lad and New York

In [8]:
# Graph extraction stage: reads documents, tokens and NER tags and writes
# the dependency paths between entities to the "graph" column.
graph_extraction = (
    GraphExtraction()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("graph")
    # Merge-entities option (requires spark.executor.allowSparkContext).
    .setMergeEntities(True)
    # Find paths between all possible pairs of entities.
    .setExplodeEntities(True)
)

In [9]:
           
# Stages run in this order: each one consumes columns produced by the
# stages listed before it.
pipeline_stages = [
    document_assembler,
    tokenizer,
    word_embeddings,
    ner_tagger,
    graph_extraction,
]
graph_pipeline = Pipeline().setStages(pipeline_stages)

The resulting dataset has a *graph* column with the paths between the PER and LOC entities.

In [10]:
# Fit the pipeline on the sample data, then annotate that same data.
graph_model = graph_pipeline.fit(data_set)
graph_data_set = graph_model.transform(data_set)

# Show only the extracted graph paths, without truncating the column.
graph_data_set.select("graph").show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------+
|graph                                                                                                                                             |
+--------------------------------------------------------------------------------------------------------------------------------------------------+
|[[node, 23, 25, lad, [entities -> PER,LOC, path -> lad,Peter Parker,lad,New York, left_path -> lad,Peter Parker, right_path -> lad,New York], []]]|
+--------------------------------------------------------------------------------------------------------------------------------------------------+

