In [1]:
import findspark
findspark.init() 


In [2]:
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

In [3]:
sparknlp.start()

In [4]:
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  3.3.2
Apache Spark version:  3.0.1


In [5]:
sparknlp.start()

In [7]:
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[4]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.4")\
    .getOrCreate()

### Offline
If you have any trouble using online pipelines or models in your environment (maybe it’s air-gapped), you can directly download them for offline use.

After downloading offline models/pipelines and extracting them, here is how you can use them iside your code (the path could be a shared storage like HDFS in a cluster):

val advancedPipeline = PipelineModel.load("/tmp/explain_document_dl_en_2.0.2_2.4_1556530585689/")
// To use the loaded Pipeline for prediction
advancedPipeline.transform(predictionDF)

In [9]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.training import *
from pyspark.ml import Pipeline


# First extract the prerequisites for the NerDLApproach
documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

sentence = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

tokenizer = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")

embeddings = BertEmbeddings.pretrained() \
    .setInputCols(["sentence", "token"]) \
    .setOutputCol("embeddings")

# Then the training can start
nerTagger = NerDLApproach() \
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setLabelColumn("label") \
    .setOutputCol("ner") \
    .setMaxEpochs(1) \
    .setRandomSeed(0) \
    .setVerbose(0)

pipeline = Pipeline().setStages([
    documentAssembler,
    sentence,
    tokenizer,
    embeddings,
    nerTagger
])

# We use the text and labels from the CoNLL dataset


small_bert_L2_768 download started this may take some time.
Approximate size to download 139.6 MB
[OK!]


In [14]:
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

#create or get Spark Session

spark = sparknlp.start()

sparknlp.version()
spark.version

#download, load and annotate a text by pre-trained pipeline

pipeline = PretrainedPipeline('recognize_entities_dl', 'en')
result = pipeline.annotate('The Mona Lisa is a 16th century oil painting created by Leonardo')


recognize_entities_dl download started this may take some time.
Approx size to download 160.1 MB
[OK!]


In [15]:
result

{'entities': ['Mona Lisa', 'Leonardo'],
 'document': ['The Mona Lisa is a 16th century oil painting created by Leonardo'],
 'token': ['The',
  'Mona',
  'Lisa',
  'is',
  'a',
  '16th',
  'century',
  'oil',
  'painting',
  'created',
  'by',
  'Leonardo'],
 'ner': ['O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER'],
 'embeddings': ['The',
  'Mona',
  'Lisa',
  'is',
  'a',
  '16th',
  'century',
  'oil',
  'painting',
  'created',
  'by',
  'Leonardo'],
 'sentence': ['The Mona Lisa is a 16th century oil painting created by Leonardo']}

In [16]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

documentAssembler = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

sentenceDetector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence")

token = Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token") \
    .setContextChars(["(", "]", "?", "!", ".", ","])

keywords = YakeKeywordExtraction() \
    .setInputCols(["token"]) \
    .setOutputCol("keywords") \
    .setThreshold(0.6) \
    .setMinNGrams(2) \
    .setNKeywords(10)

pipeline = Pipeline().setStages([
    documentAssembler,
    sentenceDetector,
    token,
    keywords
])

data = spark.createDataFrame([[
    "contamination of meat with antimicrobial-resistant bacteria represents a major public health threat worldwide. in this study\, we determined the antimicrobial resistance profiles and resistance trends of staphylococcus aureus isolated from major food animal carcasses (408 cattle\, 1196 pig\, and 1312 chicken carcass isolates) in korea from 2010 to 2018. approximately 75%\, 92%\, and 77% of cattle\, pig\, and chicken carcass isolates\, respectively\, were resistant to at least one antimicrobial agent. resistance to penicillin (62.1%) was the highest\, followed by resistance to tetracycline (42.1%) and erythromycin (28.2%). about 30% of pig and chicken isolates were resistant to ciprofloxacin. we observed linezolid resistance only in pig isolates (2.3%). however\, all s. aureus isolates were sensitive to rifampin and vancomycin. we noted an increasing but fluctuating trend of kanamycin and penicillin resistance in cattle isolates. similarly\, the chloramphenicol\, ciprofloxacin\, tetracycline\, and trimethoprim resistance rates were increased but fluctuated through time in pig isolates. methicillin-resistant s. aureus (mrsa) accounted for 5%\, 8%\, and 9% of the cattle\, pig\, and chicken isolates\, respectively. the mrsa strains exhibited significantly high resistance rates to most of the tested antimicrobials\, including ciprofloxacin\, erythromycin\, and tetracycline compared with methicillin-susceptible s. aureus (mssa) strains. notably\, a relatively high percentage of mrsa strains (5.2%) recovered from pig carcasses were resistant to linezolid compared with mssa strains (2.1%). in addition\, almost 37% of the isolates were multi-drug resistant. s. aureus isolates recovered from major food animal carcasses in korea exhibited resistance to clinically important antimicrobials\, posing a public health risk."
]]).toDF("text")
result = pipeline.fit(data).transform(data)

# combine the result and score (contained in keywords.metadata)
scores = result \
    .selectExpr("explode(arrays_zip(keywords.result, keywords.metadata)) as resultTuples") \
    .selectExpr("resultTuples['0'] as keyword", "resultTuples['1'].score as score")

# Order ascending, as lower scores means higher importance
scores.orderBy("score").show(5, truncate = False)

+------------+-----------------+
|keyword     |score            |
+------------+-----------------+
|pig isolates|0.594374093018378|
|pig isolates|0.594374093018378|
+------------+-----------------+

