In [1]:
import pyspark

In [3]:
!pip install spark-nlp
# !pip install pyspark

Collecting spark-nlp
  Downloading spark_nlp-5.1.1-py2.py3-none-any.whl (531 kB)
[K     |████████████████████████████████| 531 kB 18.5 MB/s eta 0:00:01
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-5.1.1


In [4]:
!pip install numpy

Collecting numpy
  Downloading numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[K     |████████████████████████████████| 17.3 MB 19.2 MB/s eta 0:00:01
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.24.4


In [5]:
import sparknlp

# Start the Spark session through Spark NLP's helper. As the log printed
# below shows, this step resolves the spark-nlp JAR and its dependencies
# via Ivy, so the first run can take a while.
spark = sparknlp.start()

:: loading settings :: url = jar:file:/usr/local/spark-3.2.1-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /config/.ivy2/cache
The jars for the packages stored in: /config/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e97e7aa0-f3c6-4560-b680-b458a0cdd6ab;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.1 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.20.1 in central
	found com.google.guava#guava;31.1-jre in central
	found com.google.guava#failureaccess;1.

In [6]:
# Sample paragraph fed to the pipeline (line breaks and trailing spaces are
# part of the literal and are kept exactly as written).
sample_text = """As she traveled across the world, Emma visited many different places 
and met many fascinating people. She walked the busy streets of Tokyo, 
hiked the rugged mountains of Nepal, and swam in the crystal-clear waters 
of the Caribbean. Along the way, she befriended locals like Akira, Rajesh, 
and Maria, each with their own unique stories to tell. Emma's travels took her 
to many cities, including New York, Paris, and Hong Kong, where she savored 
delicious foods and explored vibrant cultures. No matter where she went, 
Emma always found new wonders to discover and memories to cherish."""

# One row, one column; the column is named "text" so the DocumentAssembler
# configured later can read it.
rows = [[sample_text]]
data = spark.createDataFrame(rows).toDF("text")

In [7]:
# Phrase lists for the two matchers, one phrase per line.
# PERSON
person_matches = """
Emma
Akira
Rajesh
Maria
"""

# LOCATION
location_matches = """
Tokyo
Nepal
Caribbean
New York
Paris
Hong Kong
"""

# Write each list to the text file that the matching TextMatcher reads later.
for file_name, phrases in (
    ('person_matches.txt', person_matches),
    ('location_matches.txt', location_matches),
):
    with open(file_name, 'w') as handle:
        handle.write(phrases)

In [8]:
# Import the required modules and classes
from sparknlp.base import DocumentAssembler, Pipeline, ReadAs
from sparknlp.annotator import (
    Tokenizer,
    TextMatcher
)
from pyspark.sql.types import StringType
import pyspark.sql.functions as F

In [9]:
# Step 1: turn the raw "text" column into a `document` annotation
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

In [10]:
# Step 2: split each document into tokens
tokenizer = (
    Tokenizer()
    .setInputCols("document")
    .setOutputCol("token")
)

In [11]:
# Step 3: PERSON matcher -- looks up the phrases listed in person_matches.txt,
# ignoring case, and labels every hit with the entity value "PERSON"
person_extractor = (
    TextMatcher()
    .setInputCols("document", "token")
    .setEntities("person_matches.txt", ReadAs.TEXT)
    .setEntityValue("PERSON")
    .setOutputCol("person_entity")
    .setCaseSensitive(False)
)

In [12]:
# Step 4: LOCATION matcher -- looks up the phrases listed in
# location_matches.txt, ignoring case, and labels every hit "LOCATION"
location_extractor = (
    TextMatcher()
    .setInputCols("document", "token")
    .setEntities("location_matches.txt", ReadAs.TEXT)
    .setEntityValue("LOCATION")
    .setOutputCol("location_entity")
    .setCaseSensitive(False)
)

In [13]:
# Assemble the stages in execution order: both matchers consume the
# `document` and `token` columns produced by the first two stages.
stages = [
    document_assembler,
    tokenizer,
    person_extractor,
    location_extractor,
]
pipeline = Pipeline().setStages(stages)

In [14]:
# Fit the pipeline on the sample data, then apply the fitted model to it
fitted_pipeline = pipeline.fit(data)
results = fitted_pipeline.transform(data)

# Show the matched person names without truncating the column
person_hits = results.selectExpr("person_entity.result")
person_hits.show(truncate=False)

                                                                                

+----------------------------------+
|result                            |
+----------------------------------+
|[Emma, Akira, Rajesh, Maria, Emma]|
+----------------------------------+



                                                                                

In [15]:
# Show the matched locations without truncating the column
location_hits = results.selectExpr("location_entity.result")
location_hits.show(truncate=False)

+-----------------------------------------------------+
|result                                               |
+-----------------------------------------------------+
|[Tokyo, Nepal, Caribbean, New York, Paris, Hong Kong]|
+-----------------------------------------------------+



In this example, we created two TextMatcher stages: one matches person names and the other matches locations. When the Spark NLP pipeline is applied to the sample text, every word or phrase that appears in the corresponding entity list is extracted.
