In [None]:
# Install pyspark
! pip install --ignore-installed pyspark

# Install Spark NLP
! pip install --ignore-installed spark-nlp

In [4]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql import SparkSession

# Report the installed Spark NLP release so the recorded outputs can be matched to it.
spark_nlp_version = sparknlp.version()
print("Spark NLP version", spark_nlp_version)

Spark NLP version 3.1.2


To use the Merge Entities parameter, we need to set the allowSparkContext parameter to true.

In [6]:
from pyspark.sql.types import StringType

# Create (or reuse) the Spark session this notebook runs on. Without this the
# cell fails with NameError on a fresh kernel, because `spark` is not defined
# anywhere else in the notebook.
# getOrCreate() returns the already-active session when there is one, so
# re-running this cell is safe.
# NOTE(review): the spark-nlp_2.12 artifact assumes a Scala 2.12 build of
# Spark (the Spark 3.x default) -- confirm against the installed pyspark.
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages",
            "com.johnsnowlabs.nlp:spark-nlp_2.12:" + sparknlp.version())
    # Required by GraphExtraction when MergeEntities is enabled.
    .config("spark.executor.allowSparkContext", "true")
    .getOrCreate()
)

# Single-sentence sample; it contains one PER entity and one LOC entity.
text = ['Peter Parker is a nice lad and lives in New York']
data_set = spark.createDataFrame(text, StringType()).toDF("text")
data_set.show(truncate=False)

+------------------------------------------------+
|text                                            |
+------------------------------------------------+
|Peter Parker is a nice lad and lives in New York|
+------------------------------------------------+



# Graph Extraction

Graph Extraction will use pretrained POS, Dependency Parser and Typed Dependency Parser annotators when the pipeline does not have those defined

In [7]:
# Wrap the raw "text" column into Spark NLP "document" annotations.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split every document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Default pretrained word embeddings (the download log reports glove_100d).
word_embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
)

# Default pretrained NER model (the download log reports ner_dl); it consumes
# the documents, tokens and embeddings produced by the stages above.
ner_tagger = (
    NerDLModel.pretrained()
    .setInputCols(["document", "token", "embeddings"])
    .setOutputCol("ner")
)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


To have Graph Extraction automatically use the pretrained POS, Dependency Parser and Typed Dependency Parser annotators, we need to set the MergeEntities parameter to True. This parameter merges neighboring tagged entities into one, e.g. Peter Parker will be considered a single token before being sent to the Dependency Parser annotators.

In this sentence, we can extract paths for the following pair of tokens-ENTITIES:
* lad-PER, will output the path between *lad* and Peter Parker
* lad-LOC, will output the path between *lad* and New York

Any other token-ENTITY pair will output an empty path, since there is no path between them. You can visualize the dependency tree for this sentence using the [Spark NLP Display package](https://github.com/JohnSnowLabs/spark-nlp-display).

In [8]:
# Graph extraction stage: reads the documents, tokens and NER tags, and writes
# the extracted paths to the "graph" column.
graph_extraction = (
    GraphExtraction()
    .setInputCols(["document", "token", "ner"])
    .setOutputCol("graph")
    # token-ENTITY pairs whose paths should be extracted
    .setRelationshipTypes(["lad-PER", "lad-LOC"])
    # merge neighbouring entity tokens (e.g. "Peter Parker") into one token
    .setMergeEntities(True)
)

Under the hood it uses default pretrained annotators, but we can set any pretrained model with the parameters *setPosModel*, *setDependencyParserModel* or *setTypedDependencyParserModel*

Unlike [this notebook](https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/feature/graph-extraction-tutorial/jupyter/annotation/english/graph-extraction/graph_extraction.ipynb), the pipeline below just has graph extraction + NER + tokenizer annotators

In [9]:
           
# Stages in execution order: each one consumes columns produced by earlier stages.
pipeline_stages = [
    document_assembler,
    tokenizer,
    word_embeddings,
    ner_tagger,
    graph_extraction,
]
graph_pipeline = Pipeline().setStages(pipeline_stages)

The resulting dataset has a *graph* column with the paths for lad-PER and lad-LOC

In [10]:
# Fit the pipeline on the sample sentence, then annotate that same sentence.
graph_model = graph_pipeline.fit(data_set)
graph_data_set = graph_model.transform(data_set)

# Show only the extracted paths, untruncated.
graph_data_set.select("graph").show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|graph                                                                                                                                                               |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[[node, 23, 25, lad, [relationship -> lad,PER, path1 -> lad,flat,Peter Parker], []], [node, 23, 25, lad, [relationship -> lad,LOC, path1 -> lad,flat,New York], []]]|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+

