# Document Normalizer annotator notebook

# Set up Colab environment

In [None]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install -q pyspark==2.4.6
! pip install -q spark-nlp

# Start Spark NLP session

In [8]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *

def start():
    """Build (or reuse) the SparkSession used throughout this notebook.

    The session is named "Spark NLP Licensed", runs on the "local[4]" master
    and pulls the Spark NLP 2.7.0 package through ``spark.jars.packages``.

    Returns:
        Whatever ``SparkSession.builder.getOrCreate()`` returns for the
        configured builder.
    """
    # (key, value) settings applied to the builder in this exact order.
    settings = (
        ("spark.driver.memory", "8G"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ("spark.kryoserializer.buffer.max", "2000M"),
        ("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.7.0"),
        ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
        ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
    )

    session_builder = SparkSession.builder.appName("Spark NLP Licensed").master("local[4]")
    for key, value in settings:
        session_builder = session_builder.config(key, value)
    return session_builder.getOrCreate()


spark = start()

# Document Normalizer annotator overview

In [2]:
# DocumentNormalizer is an annotator that can be placed right after the
# DocumentAssembler to normalize documents once they have been processed and
# indexed.
# Input:  annotated documents of type Array[AnnotatorType](DOCUMENT).
# Output: annotated documents of type AnnotatorType.DOCUMENT.
#
# Parameters:
# - inputCol: name of the input column, of type Array(AnnotatorType.DOCUMENT).
# - outputCol: name of the output column, of type AnnotatorType.DOCUMENT.
# - action: what to do with regex matches, i.e. (clean | extract). Default is "clean".
# - cleanupPatterns: normalization regex patterns whose matches are removed from the
#   document. Default is "<[^>]*>" (e.g., it removes all HTML tags).
# - replacement: string substituted wherever a regex matches. Default is " ".
# - lowercase: whether to convert strings to lowercase. Default is False.
# - removalPolicy: policy used when removing patterns from text. Valid values are
#   "all", "pretty_all", "first", "pretty_first". Default is "pretty_all".
# - encoding: file encoding applied to normalized documents. Supported encodings are
#   UTF_8, UTF_16, US_ASCII, ISO-8859-1, UTF-16BE, UTF-16LE. Default is "UTF-8".


documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Normalizer settings, held in variables so each one can be changed in one place.
action = "clean"
cleanUpPatterns = ["<[^>]*>"]
replacement = " "
removalPolicy = "pretty_all"
encoding = "UTF-8"

inpuColName = "document"
outputColName = "normalizedDocument"

documentNormalizer = (
    DocumentNormalizer()
    .setInputCols(inpuColName)
    .setOutputCol(outputColName)
    .setAction(action)
    .setPatterns(cleanUpPatterns)
    .setReplacement(replacement)
    .setPolicy(removalPolicy)
    .setLowercase(True)
    .setEncoding(encoding)
)

# Data loading

In [3]:
# Directory holding the HTML documents to normalize.
path = "html-docs"

# Read the files under `path` into (filename, text) rows and keep only the
# "text" column. `path` is passed here instead of repeating the literal, so
# changing the input location only needs one edit.
data = spark.sparkContext.wholeTextFiles(path)
df = data.toDF(schema=["filename", "text"]).select("text")

df.show()

+--------------------+
|                text|
+--------------------+
|<div class='w3-co...|
|<!DOCTYPE html>
<...|
|<span style="font...|
+--------------------+



# Example 1: remove all the tags from HTML text files

In [10]:
# With the data loaded, the documents can be normalized by a pipeline stage
# placed right after the DocumentAssembler.
# Here the DataFrame holds HTML pages and every tag is removed from them.

documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Regex matching any HTML tag.
cleanUpPatterns = ["<[^>]*>"]

documentNormalizer = (
    DocumentNormalizer()
    .setInputCols("document")
    .setOutputCol("normalizedDocument")
    .setAction("clean")
    .setPatterns(cleanUpPatterns)
    .setReplacement(" ")
    .setPolicy("pretty_all")
    .setLowercase(True)
)

# Downstream stages consume the normalized document rather than the raw one.
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["normalizedDocument"])
    .setOutputCol("sentence")
)

regexTokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
    .fit(df)
)

docPatternRemoverPipeline = Pipeline().setStages([
    documentAssembler,
    documentNormalizer,
    sentenceDetector,
    regexTokenizer,
])

ds = docPatternRemoverPipeline.fit(df).transform(df)

ds.select("normalizedDocument").show(10)

+--------------------+
|  normalizedDocument|
+--------------------+
|[[document, 0, 67...|
|[[document, 0, 17...|
|[[document, 0, 31...|
+--------------------+



# Example 2: obfuscate PII such as emails in HTML content

In [15]:
documentAssembler = DocumentAssembler() \
    .setInputCol('text') \
    .setOutputCol('document')

# "clean" replaces each match of the patterns below with `replacement`.
action = "clean"
# Regex matching e-mail addresses: a local part, "@", then a dotted domain.
patterns = ["([^.@\\s]+)(\\.[^.@\\s]+)*@([^.@\\s]+\\.)+([^.@\\s]+)"]
replacement = "***OBFUSCATED PII***"

# Use the `action` and e-mail `patterns` defined in this cell. Passing the
# HTML-tag regex `cleanUpPatterns` from an earlier cell would replace tags
# with the marker and leave the e-mail addresses untouched.
documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction(action) \
    .setPatterns(patterns) \
    .setReplacement(replacement) \
    .setPolicy("pretty_all") \
    .setLowercase(True)

sentenceDetector = SentenceDetector() \
      .setInputCols(["normalizedDocument"]) \
      .setOutputCol("sentence")

regexTokenizer = Tokenizer() \
      .setInputCols(["sentence"]) \
      .setOutputCol("token") \
      .fit(df)

docPatternRemoverPipeline = \
  Pipeline() \
    .setStages([
        documentAssembler,
        documentNormalizer,
        sentenceDetector,
        regexTokenizer])

ds = docPatternRemoverPipeline.fit(df).transform(df)

# Second argument False disables column truncation so the full text is shown.
ds.select("normalizedDocument").show(10, False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Example 3: obfuscate PII such as ages in HTML content

In [17]:
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# Two age patterns: digits followed by an optional whitespace and "year",
# and "aged" followed by an optional whitespace and digits.
patterns = ["\\d+(?=[\\s]?year)", "(aged)[\\s]?\\d+"]
replacement = "***OBFUSCATED PII***"
action = "clean"

documentNormalizer = (
    DocumentNormalizer()
    .setInputCols("document")
    .setOutputCol("normalizedDocument")
    .setAction(action)
    .setPatterns(patterns)
    .setReplacement(replacement)
    .setPolicy("pretty_all")
    .setLowercase(True)
)

sentenceDetector = (
    SentenceDetector()
    .setInputCols(["normalizedDocument"])
    .setOutputCol("sentence")
)

regexTokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
    .fit(df)
)

docPatternRemoverPipeline = Pipeline().setStages([
    documentAssembler,
    documentNormalizer,
    sentenceDetector,
    regexTokenizer,
])

ds = docPatternRemoverPipeline.fit(df).transform(df)

# Show up to 10 rows without truncating the column contents.
ds.select("normalizedDocument").show(10, False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Example 4: extract XML name tag contents

In [20]:
# Load the XML documents: read the files under "xml-docs" into
# (filename, text) rows, then keep only the "text" column.
data = spark.sparkContext.wholeTextFiles("xml-docs")
df = data.toDF(schema=["filename", "text"])
df = df.select("text")
df.show()

+--------------------+
|                text|
+--------------------+
|<?xml version="1....|
+--------------------+



In [21]:
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

# This example uses the "extract" action, with the XML tag name as the
# only pattern.
tag = "name"
patterns = [tag]
action = "extract"

documentNormalizer = (
    DocumentNormalizer()
    .setInputCols("document")
    .setOutputCol("normalizedDocument")
    .setAction(action)
    .setPatterns(patterns)
    .setReplacement("")
    .setPolicy("pretty_all")
    .setLowercase(True)
)

sentenceDetector = (
    SentenceDetector()
    .setInputCols(["normalizedDocument"])
    .setOutputCol("sentence")
)

regexTokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
    .fit(df)
)

docPatternRemoverPipeline = Pipeline().setStages([
    documentAssembler,
    documentNormalizer,
    sentenceDetector,
    regexTokenizer,
])

ds = docPatternRemoverPipeline.fit(df).transform(df)

# Show up to 10 rows without truncating the column contents.
ds.select("normalizedDocument").show(10, False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|normalizedDocument                                                                                                                                                                                                                                                                                                        