<a href="https://colab.research.google.com/github/Tamrika/BigData/blob/main/Named_Entity_Recognition_with_Spark_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Named Entity Recognition with Spark NLP

## Installing Java 8 (3.2.1)

In [None]:
# Install a headless OpenJDK 8 through apt; -qq and the /dev/null redirect keep the cell output quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
# Export JAVA_HOME for this kernel process, pointing at the Java 8 directory under /usr/lib/jvm.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Make the Java 8 binary the system-wide `java` alternative.
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
# Print the active Java version to confirm which JVM is now selected.
!java -version

openjdk version "1.8.0_282"
OpenJDK Runtime Environment (build 1.8.0_282-8u282-b08-0ubuntu1~18.04-b08)
OpenJDK 64-Bit Server VM (build 25.282-b08, mixed mode)


## Installing PySpark and Spark NLP (version-specific)

In [None]:
# Install pyspark
! pip install --ignore-installed pyspark==2.4.4
# Install Spark NLP
! pip install --ignore-installed spark-nlp==2.6.3

Processing /root/.cache/pip/wheels/ab/09/4d/0d184230058e654eb1b04467dbc1292f00eaa186544604b471/pyspark-2.4.4-py2.py3-none-any.whl
Collecting py4j==0.10.7
  Using cached https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.4


Collecting spark-nlp==2.6.3
  Using cached https://files.pythonhosted.org/packages/84/84/3f15673db521fbc4e8e0ec3677a019ba1458b2cb70f0f7738c221511ef32/spark_nlp-2.6.3-py2.py3-none-any.whl
Installing collected packages: spark-nlp
Successfully installed spark-nlp-2.6.3


## Import the relevant packages (3.2.2) and retrieve the Spark version

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
import sparknlp
# Obtain the Spark session through Spark NLP's own launcher; every later cell
# uses this `spark` handle.
spark = sparknlp.start()

# Report both library versions so the run can be matched against the pinned installs.
for component, version in (
    ("Spark NLP version: ", sparknlp.version()),
    ("Apache Spark version: ", spark.version),
):
    print(component, version)

Spark NLP version:  2.6.3
Apache Spark version:  2.4.4


## Downloading the CoNLL dataset (3.2.3)

In [None]:
from pathlib import Path
import urllib.request

# Local destination for the CoNLL-2003 training split.
download_path = "./eng.train"

# Fetch the file only when it is not already on disk, so re-running the cell
# does not download it again.
if Path(download_path).is_file():
    print("File already present.")
else:
    print("File Not found will download it!")
    url = "https://raw.githubusercontent.com/patverga/torch-ner-nlp-from-scratch/master/data/conll2003/eng.train"
    urllib.request.urlretrieve(url, download_path)

File Not found will download it!


## Converting the CoNLL file to a Spark DataFrame (3.2.4)

In [None]:
from sparknlp.training import CoNLL
# Parse the downloaded CoNLL training file into a Spark DataFrame.
training_data = CoNLL().readDataset(spark, './eng.train')
# Preview the result; the columns shown in the output are text, document, sentence, token, pos and label.
training_data.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|                 pos|               label|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|EU rejects German...|[[document, 0, 47...|[[document, 0, 47...|[[token, 0, 1, EU...|[[pos, 0, 1, NNP,...|[[named_entity, 0...|
|     Peter Blackburn|[[document, 0, 14...|[[document, 0, 14...|[[token, 0, 4, Pe...|[[pos, 0, 4, NNP,...|[[named_entity, 0...|
| BRUSSELS 1996-08-22|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 7, BR...|[[pos, 0, 7, NNP,...|[[named_entity, 0...|
|The European Comm...|[[document, 0, 18...|[[document, 0, 18...|[[token, 0, 2, Th...|[[pos, 0, 2, DT, ...|[[named_entity, 0...|
|Germany 's repres...|[[document, 0, 21...|[[document, 0, 21...|[[token, 0, 6, Ge...|[[pos, 0, 6, NNP,..

## Word embeddings through the BERT Spark NLP annotator (3.2.5)


In [None]:
# Embeddings stage: the pretrained English `bert_base_cased` model reads the
# "sentence" and "token" annotation columns and writes its vectors to "bert".
bert = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["sentence", 'token'])
    .setOutputCol("bert")
    # This is a *cased* checkpoint, so tokens must keep their original casing.
    # Folding case would feed the model text it was not built for and would
    # throw away capitalisation, a strong cue for named entities.
    .setCaseSensitive(True)
)

bert_base_cased download started this may take some time.
Approximate size to download 389.1 MB
[OK!]


## Configuring the tagger using the NerDLApproach() annotator, responsible for training the NER model (3.2.6)

In [None]:
# Trainable NER stage. It consumes the sentence, token and BERT embedding
# columns, learns from the "label" column and writes its predictions to "ner".
nerTagger = (
    NerDLApproach()
    .setInputCols(["sentence", "token", "bert"])
    .setLabelColumn("label")
    .setOutputCol("ner")
    # A single epoch and a fixed seed of 0.
    .setMaxEpochs(1)
    .setRandomSeed(0)
    # Logging / evaluation switches.
    .setVerbose(1)
    .setValidationSplit(0.2)
    .setEvaluationLogExtended(True)
    .setEnableOutputLogs(True)
    .setIncludeConfidence(True)
    # Parquet path of the test set; a later cell writes the embedded test data here.
    .setTestDataset("test_withEmbeds.parquet")
)

## Downloading the testing data (3.2.7.1)

In [None]:
from pathlib import Path
import urllib.request

# Local destination for the CoNLL-2003 `testa` split.
download_path = "./eng.testa"

# Skip the download when a previous run has already saved the file.
if Path(download_path).is_file():
    print("File already present.")
else:
    print("File Not found will download it!")
    url = "https://raw.githubusercontent.com/patverga/torch-ner-nlp-from-scratch/master/data/conll2003/eng.testa"
    urllib.request.urlretrieve(url, download_path)

File Not found will download it!


## Transforming and saving the test data (3.2.7.1)

In [None]:
# Read the CoNLL test split and add the BERT embedding column to it.
test_data = CoNLL().readDataset(spark, './eng.testa')
test_data = bert.transform(test_data)
# Save the embedded test set at the path the NER tagger is configured to read.
# Overwrite mode keeps this cell re-runnable: with the default save mode the
# write raises an error as soon as the parquet directory already exists.
test_data.write.mode("overwrite").parquet("test_withEmbeds.parquet")

## Append the bert and nerTagger annotators to a pipeline and train the NER model with the training dataset (3.2.8)

In [None]:
%%time
# Chain the embeddings stage and the NER trainer into a single pipeline.
ner_pipeline = Pipeline(stages = [bert, nerTagger])
# Fit on at most 100 rows of the training data.
# NOTE(review): the limit presumably keeps the demo run short; confirm before training a real model.
ner_model = ner_pipeline.fit(training_data.limit(100))

KeyboardInterrupt: ignored

## Get predictions using the test dataset (3.2.9)

In [None]:
# Apply the fitted pipeline to the test data. The method is `transform`; the
# previous spelling `tranform` raised AttributeError.
predictions = ner_model.transform(test_data)
# Preview the resulting DataFrame.
predictions.show()

In [None]:
# Show the token texts next to the labels read from the dataset and the predicted NER tags,
# truncating any cell longer than 40 characters.
predictions.select('token.result','label.result','ner.result').show(truncate=40)

### Printing the schema shows the structure of the `predictions` DataFrame

In [None]:
# Print the column tree (names and types) of the predictions DataFrame.
predictions.printSchema()

### Formatting to make it look better

In [None]:
import pyspark.sql.functions as F

# Zip the per-sentence token / label / prediction arrays element-wise, explode
# them to one row per token, then pull each zipped field out under a readable
# column name.
(
    predictions
    .select(
        F.explode(
            F.arrays_zip('token.result', 'label.result', 'ner.result')
        ).alias("cols")
    )
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ground_truth"),
        F.expr("cols['2']").alias("prediction"),
    )
    .show(truncate=False)
)

## Assignment 10

### Import the necessary packages beyond those already imported. Make sure Java, PySpark, and Spark NLP are installed.

### Annotate (NER) the given text using the PretrainedPipeline `recognize_entities_dl` in Spark NLP

In [None]:
from sparknlp.pretrained import PretrainedPipeline
from pyspark import SparkContext, since

# Obtain the Spark session through Spark NLP's launcher.
spark = sparknlp.start()

# Load the pretrained English "recognize_entities_dl" pipeline.
pipeline = PretrainedPipeline("recognize_entities_dl", lang="en")

recognize_entities_dl download started this may take some time.
Approx size to download 159 MB
[OK!]


### Store the text to be analyzed in a variable named `text`

In [None]:
# Passage about the University of Illinois Springfield used as the NER input.
# The literal keeps its hard line breaks, so newline characters are part of the
# string handed to the pipeline.
text="""The University of Illinois Springfield (UIS) is a public university in Springfield, Illinois, United
States. The university was established in 1969 as Sangamon State University by the Illinois
General Assembly and became a part of the University of Illinois system on July 1, 1995. As a
public liberal arts college, and the newest campus in the University of Illinois system, UIS is a
member of the Council of Public Liberal Arts Colleges. President: Timothy L. Killeen.
Chancellor: Karen M. Whitney. Location: Springfield, Illinois, United States."""

### Annotate 'text' and save it in 'result'

In [None]:
# Run the pretrained pipeline on the raw string; the result is later indexed by 'token' and 'ner'.
result = pipeline.annotate(text)

### Retrieve and print the output from 'result'

In [None]:
# Pair each token with its predicted entity tag and print one (token, tag)
# tuple per line.
for data in zip(result['token'], result['ner']):
    print(data)

('The', 'O')
('University', 'B-ORG')
('of', 'I-ORG')
('Illinois', 'I-ORG')
('Springfield', 'I-ORG')
('(', 'O')
('UIS', 'B-ORG')
(')', 'O')
('is', 'O')
('a', 'O')
('public', 'O')
('university', 'O')
('in', 'O')
('Springfield', 'B-LOC')
(',', 'O')
('Illinois', 'B-LOC')
(',', 'O')
('United', 'B-LOC')
('States', 'I-LOC')
('.', 'O')
('The', 'O')
('university', 'O')
('was', 'O')
('established', 'O')
('in', 'O')
('1969', 'O')
('as', 'O')
('Sangamon', 'B-ORG')
('State', 'I-ORG')
('University', 'I-ORG')
('by', 'O')
('the', 'O')
('Illinois', 'B-ORG')
('General', 'I-ORG')
('Assembly', 'I-ORG')
('and', 'O')
('became', 'O')
('a', 'O')
('part', 'O')
('of', 'O')
('the', 'O')
('University', 'B-ORG')
('of', 'I-ORG')
('Illinois', 'I-ORG')
('system', 'O')
('on', 'O')
('July', 'O')
('1', 'O')
(',', 'O')
('1995', 'O')
('.', 'O')
('As', 'O')
('a', 'O')
('public', 'O')
('liberal', 'O')
('arts', 'O')
('college', 'O')
(',', 'O')
('and', 'O')
('the', 'O')
('newest', 'O')
('campus', 'O')
('in', 'O')
('the', 'O')