0. Colab setup

In [1]:
import os

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.5.1

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)
[K     |████████████████████████████████| 215.7MB 60kB/s 
[K     |████████████████████████████████| 204kB 46.6MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 122kB 2.8MB/s 
[?25h

1. Start a Spark session

In [2]:
import sparknlp
# Create the Spark session through Spark NLP's helper; later cells read data via `spark`.
spark = sparknlp.start()

# A bare expression is only echoed when it is the LAST statement of a cell,
# so each version is passed to print() to make sure both are actually shown.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version", spark.version)

Spark NLP version
Apache Spark version


'2.4.4'

# QUICK START

NER

In [34]:
from sparknlp.pretrained import PretrainedPipeline 
from sparknlp.annotator import *
from pyspark.ml import Pipeline

In [10]:
# Load the pretrained 'recognize_entities_dl' pipeline for language code 'en'.
pipeline = PretrainedPipeline('recognize_entities_dl', 'en')

recognize_entities_dl download started this may take some time.
Approx size to download 159 MB
[OK!]


In [11]:
# Run the pipeline on one sentence and keep the annotations for inspection.
result = pipeline.annotate('Is SparkNLP the best NLP library?') 

In [12]:
# Dump the annotation result returned by the NER pipeline.
print(result)

{'entities': ['NLP'], 'document': ['Is SparkNLP the best NLP library?'], 'token': ['Is', 'SparkNLP', 'the', 'best', 'NLP', 'library', '?'], 'ner': ['O', 'O', 'O', 'O', 'B-ORG', 'O', 'O'], 'embeddings': ['Is', 'SparkNLP', 'the', 'best', 'NLP', 'library', '?'], 'sentence': ['Is SparkNLP the best NLP library?']}


Sentiment Analysis

In [13]:
# Load the pretrained 'analyze_sentiment' pipeline for language code 'en'.
# NOTE: this rebinds `pipeline`, so the NER pipeline loaded earlier is no
# longer reachable under this name.
pipeline = PretrainedPipeline('analyze_sentiment', 'en')

analyze_sentiment download started this may take some time.
Approx size to download 4.9 MB
[OK!]


In [15]:
# Annotate the same sentence with the sentiment pipeline; as the last
# expression of the cell, the returned value is displayed directly.
pipeline.annotate('Is SparkNLP the best NLP library?')

{'checked': ['Is', 'SparkNLP', 'the', 'best', 'NLP', 'library', '?'],
 'document': ['Is SparkNLP the best NLP library?'],
 'sentence': ['Is SparkNLP the best NLP library?'],
 'sentiment': ['negative'],
 'token': ['Is', 'SparkNLP', 'the', 'best', 'NLP', 'library', '?']}

# Text Classification

Now let’s see how this can be done in Spark NLP using Annotators and Transformers. Assume that we have the following steps that need to be applied one by one on a data frame.

Split text into sentences

Tokenize

Normalize

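Get word embeddings

Note: the classifier trained below uses a shorter pipeline than the steps listed above: a document assembler, pretrained Universal Sentence Encoder embeddings, and a ClassifierDL classifier.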


In [29]:
! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv
! wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_test.csv
trainDataset = spark.read \
      .option("header", True) \
      .csv("news_category_train.csv")
trainDataset.show(10, truncate=50)

--2020-07-15 16:35:22--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24032125 (23M) [text/plain]
Saving to: ‘news_category_train.csv’


2020-07-15 16:35:26 (40.5 MB/s) - ‘news_category_train.csv’ saved [24032125/24032125]

--2020-07-15 16:35:27--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Public/data/news_category_test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sen

In [30]:
from pyspark.sql.functions import col
# Row count per category in the training split, largest count first.
(
    trainDataset
    .groupBy("category")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------+-----+
|category|count|
+--------+-----+
|   World|30000|
|Sci/Tech|30000|
|  Sports|30000|
|Business|30000|
+--------+-----+



In [31]:
# Read the test split; the first CSV row supplies the column names.
testDataset = (
    spark.read
    .option("header", True)
    .csv("news_category_test.csv")
)

# Row count per category in the test split, largest count first.
(
    testDataset
    .groupBy("category")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------+-----+
|category|count|
+--------+-----+
|   World| 1900|
|  Sports| 1900|
|Sci/Tech| 1900|
|Business| 1900|
+--------+-----+



In [32]:
# document assembler
# DocumentAssembler is defined in sparknlp.base; import it explicitly so this
# cell does not depend on names left over from an earlier kernel session.
from sparknlp.base import DocumentAssembler

# Reads the "description" column and writes the "document" annotation column.
document = (
    DocumentAssembler()
    .setInputCol("description")
    .setOutputCol("document")
)

In [35]:
# Pretrained Universal Sentence Encoder (default model): reads the "document"
# column and writes its output to "sentence_embeddings".
use = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [36]:
# ClassifierDL estimator: consumes "sentence_embeddings", uses "category" as
# the label column and writes predictions to "class"; at most 5 epochs, with
# output logs enabled.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setMaxEpochs(5)
    .setEnableOutputLogs(True)
)

# Stage order: document assembler -> sentence encoder -> classifier.
use_clf_pipeline = Pipeline(stages=[document, use, classsifierdl])

In [37]:
# Fit every pipeline stage on the training DataFrame; the fitted model is
# what later cells use for prediction.
use_pipelineModel = use_clf_pipeline.fit(trainDataset)

In [38]:
# Apply the fitted pipeline to the test DataFrame and preview the first 10 rows.
preds = use_pipelineModel.transform(testDataset)
preds.show(10)

+--------+--------------------+--------------------+--------------------+--------------------+
|category|         description|            document| sentence_embeddings|               class|
+--------+--------------------+--------------------+--------------------+--------------------+
|Business|Unions representi...|[[document, 0, 12...|[[sentence_embedd...|[[category, 0, 12...|
|Sci/Tech| TORONTO, Canada ...|[[document, 0, 22...|[[sentence_embedd...|[[category, 0, 22...|
|Sci/Tech| A company founde...|[[document, 0, 20...|[[sentence_embedd...|[[category, 0, 20...|
|Sci/Tech| It's barely dawn...|[[document, 0, 26...|[[sentence_embedd...|[[category, 0, 26...|
|Sci/Tech| Southern Califor...|[[document, 0, 17...|[[sentence_embedd...|[[category, 0, 17...|
|Sci/Tech|"The British Depa...|[[document, 0, 10...|[[sentence_embedd...|[[category, 0, 10...|
|Sci/Tech|"confessed author...|[[document, 0, 34...|[[sentence_embedd...|[[category, 0, 34...|
|Sci/Tech|\\FOAF/LOAF  and ...|[[document, 0, 70..

In [39]:
from sklearn.metrics import classification_report

# Pull only the columns needed for scoring into a pandas DataFrame.
df = preds.select('description', 'category', 'class.result').toPandas()

# Each 'result' cell is indexable; its first entry is taken as the predicted label.
df['pred_category'] = [labels[0] for labels in df['result']]

# Per-class precision / recall / F1 against the true categories.
print(classification_report(df['category'], df['pred_category']))

              precision    recall  f1-score   support

    Business       0.85      0.84      0.84      1900
    Sci/Tech       0.84      0.89      0.87      1900
      Sports       0.96      0.98      0.97      1900
       World       0.93      0.86      0.89      1900

    accuracy                           0.89      7600
   macro avg       0.89      0.89      0.89      7600
weighted avg       0.89      0.89      0.89      7600

