In [1]:
#Importing the correct packages and updates
import os
# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version
# Install pyspark
! pip install --ignore-installed pyspark==2.4.4
# Install Spark NLP
! pip install --ignore-installed spark-nlp


openjdk version "1.8.0_275"
OpenJDK Runtime Environment (build 1.8.0_275-8u275-b01-0ubuntu1~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.275-b01, mixed mode)
Processing /root/.cache/pip/wheels/ab/09/4d/0d184230058e654eb1b04467dbc1292f00eaa186544604b471/pyspark-2.4.4-py2.py3-none-any.whl
Collecting py4j==0.10.7
  Using cached https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.4
Collecting spark-nlp
  Using cached https://files.pythonhosted.org/packages/bc/c5/cd56aa082c8deed5356e7d9b606c97afb543e429668ea8c2892cecdf70a0/spark_nlp-2.7.2-py2.py3-none-any.whl
Installing collected packages: spark-nlp
Successfully installed spark-nlp-2.7.2


In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
import sparknlp
# Start the Spark session through Spark NLP's helper, then report the
# library versions in use so the run is traceable.
spark = sparknlp.start()
for version_label, version_value in (
	("Spark NLP version: ", sparknlp.version()),
	("Apache Spark version: ", spark.version),
):
	print(version_label, version_value)

Spark NLP version:  2.7.2
Apache Spark version:  2.4.4


In [3]:
# Fetch the CoNLL-2003 training file once; later runs reuse the local copy.
from pathlib import Path
import urllib.request
download_path = "./eng.train"
if not Path(download_path).is_file():
	print("File Not found will downloading it!")
	url = "https://raw.githubusercontent.com/patverga/torch-ner-nlp-from-scratch/master/data/conll2003/eng.train"
	# Download under a temporary name and rename only after the transfer
	# finishes: an interrupted download must not leave a truncated file at
	# download_path, because the is_file() check above would accept it on
	# the next run.
	partial_path = download_path + ".part"
	urllib.request.urlretrieve(url, partial_path)
	Path(partial_path).replace(download_path)
else:
	print("File already present.")

File already present.


In [22]:
# Parse the downloaded CoNLL training file into a Spark DataFrame using
# Spark NLP's CoNLL reader, then preview the first rows.
from sparknlp.training import CoNLL
conll_reader = CoNLL()
training_data = conll_reader.readDataset(spark, './eng.train')
training_data.show()

KeyboardInterrupt: ignored

In [5]:
# Pretrained English BERT embeddings stage: reads the "sentence" and "token"
# columns and writes its output to a "bert" column.
# NOTE(review): a *cased* checkpoint ('bert_base_cased') is loaded while case
# sensitivity is switched off — confirm this combination is intended.
bert = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("bert")
    .setCaseSensitive(False)
)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


In [6]:
# Trainable NER stage (NerDLApproach). It consumes the sentence, token and
# BERT embedding columns, learns from the "label" column and writes its
# predictions to "ner".
nerTagger = (
    NerDLApproach()
    # Column wiring.
    .setInputCols(["sentence", "token", "bert"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    # Training setup: a single epoch with a fixed seed.
    .setMaxEpochs(1)
    .setRandomSeed(0)
    # Logging / evaluation options; 0.2 is the validation split fraction.
    .setVerbose(1)
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    # Parquet path of the test set; a file with this name is written by the
    # cell that embeds './eng.testa', so that cell must run before fit().
    .setTestDataset("test_withEmbeds.parquet")
)

In [7]:
# Download the CoNLL-2003 'eng.testa' file used as the test set, unless a
# local copy already exists.

from pathlib import Path
import urllib.request
download_path = "./eng.testa"
if not Path(download_path).is_file():
	print("File Not found will downloading it!")
	url = "https://raw.githubusercontent.com/patverga/torch-ner-nlp-from-scratch/master/data/conll2003/eng.testa"
	# Download under a temporary name and rename only after the transfer
	# finishes: an interrupted download must not leave a truncated file at
	# download_path, because the is_file() check above would accept it on
	# the next run.
	partial_path = download_path + ".part"
	urllib.request.urlretrieve(url, partial_path)
	Path(partial_path).replace(download_path)
else:
	print("File already present.")

File Not found will downloading it!


In [8]:
# Read the test CoNLL file, add the BERT embedding column, and persist the
# result as parquet under the path the NER trainer is configured to read
# via setTestDataset("test_withEmbeds.parquet").
test_data = CoNLL().readDataset(spark, './eng.testa')
test_data = bert.transform(test_data)
# Overwrite any previous copy: the default save mode raises an error when the
# target already exists, which made this cell fail on every re-run.
test_data.write.mode("overwrite").parquet("test_withEmbeds.parquet")

In [9]:
# Chain the embedding stage and the NER trainer into one pipeline and fit it
# on a small slice of the training data (the first 100 rows only).
training_subset = training_data.limit(100)
ner_pipeline = Pipeline(stages = [bert, nerTagger])
ner_model = ner_pipeline.fit(training_subset)

In [10]:
# Apply the fitted pipeline (BERT embeddings + NER tagger) to the test
# DataFrame and preview the output; the tagger's predictions land in the
# "ner" column.
predictions = ner_model.transform(test_data)
predictions.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|                bert|                 ner|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|CRICKET - LEICEST...|[[document, 0, 64...|[[document, 0, 64...|[[token, 0, 6, CR...|[[pos, 0, 6, NNP,...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
|   LONDON 1996-08-30|[[document, 0, 16...|[[document, 0, 16...|[[token, 0, 5, LO...|[[pos, 0, 5, NNP,...|[[named_entity, 0...|[[word_embeddings...|[[named_entity, 0...|
|West Indian all-r...|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 3, We...|[[pos, 0, 3, NNP,...|[[named_entity, 0...|[[word_embeddings...|[[

In [11]:
# Print the schema of the predictions DataFrame to inspect the nested
# annotation structure of each column.
predictions.printSchema()

root
 |-- text: string (nullable = true)
 |-- document: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)

In [12]:
import pyspark.sql.functions as F
# Build a token-level comparison table: zip the token, gold-label and
# predicted-label arrays of every row, explode them to one row per token,
# then name the three positions.
zipped_tokens = predictions.select(
	F.explode(
		F.arrays_zip('token.result', 'label.result', 'ner.result')
	).alias("cols")
)
token_comparison = zipped_tokens.select(
	F.expr("cols['0']").alias("token"),
	F.expr("cols['1']").alias("ground_truth"),
	F.expr("cols['2']").alias("prediction"),
)
token_comparison.show(truncate=False)

+--------------+------------+----------+
|token         |ground_truth|prediction|
+--------------+------------+----------+
|CRICKET       |O           |O         |
|-             |O           |O         |
|LEICESTERSHIRE|I-ORG       |O         |
|TAKE          |O           |O         |
|OVER          |O           |O         |
|AT            |O           |O         |
|TOP           |O           |O         |
|AFTER         |O           |O         |
|INNINGS       |O           |O         |
|VICTORY       |O           |O         |
|.             |O           |O         |
|LONDON        |I-LOC       |O         |
|1996-08-30    |O           |O         |
|West          |I-MISC      |I-PER     |
|Indian        |I-MISC      |O         |
|all-rounder   |O           |O         |
|Phil          |I-PER       |O         |
|Simmons       |I-PER       |O         |
|took          |O           |O         |
|four          |O           |O         |
+--------------+------------+----------+
only showing top 20 rows

In [19]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *

from sparknlp.pretrained import PretrainedPipeline
import sparknlp

# Start (or obtain) the Spark session through Spark NLP's helper.
# NOTE(review): a session was already started earlier in this notebook;
# presumably this call returns the existing one — confirm.
spark = sparknlp.start()

# Download the pre-trained English 'explain_document_dl' pipeline.
pipeline = PretrainedPipeline('explain_document_dl', lang='en')

# Sample input text: the introductory paragraph about artificial neural
# networks (ANNs), taken from Wikipedia.
text = """
An ANN is based on a collection of connected units or nodes called artificial neurons, 
which loosely model the neurons in a biological brain. Each connection, like the synapses in a biological brain, 
can transmit a signal to other neurons. An artificial neuron that receives a signal then processes it and can signal neurons connected to it. 
The "signal" at a connection is a real number, and the output of each neuron is computed by some non-linear function of the sum of its inputs. 
The connections are called edges. Neurons and edges typically have a weight that adjusts as learning proceeds. The weight increases or decreases the strength of the signal at a connection. 
Neurons may have a threshold such that a signal is sent only if the aggregate signal crosses that threshold. 
Typically, neurons are aggregated into layers. Different layers may perform different transformations on their inputs. 
Signals travel from the first layer (the input layer), to the last layer (the output layer), possibly after traversing the layers multiple times.
"""

# Run the pretrained pipeline on the sample text; the result is indexed by
# annotation name in the following cells.
result = pipeline.annotate(text)



explain_document_dl download started this may take some time.
Approx size to download 168.4 MB
[OK!]


In [20]:
# List the annotation names returned by the pretrained pipeline.
list(result.keys())

['entities',
 'stem',
 'checked',
 'lemma',
 'document',
 'pos',
 'token',
 'ner',
 'embeddings',
 'sentence']

In [21]:
# Show the entries stored under the 'entities' key for the sample text.
result['entities']

['ANN']