# Regex Tokenizer annotator notebook

# Set up Colab environment

In [None]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install -q pyspark==2.4.7
! pip install -q spark-nlp

# Start Spark NLP session

In [None]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *

def start():
    """Create (or reuse) the SparkSession used by this notebook.

    The session runs locally on 4 threads with 8G of driver memory, uses the
    Kryo serializer with a 2000M maximum buffer, pulls the Spark NLP 2.7.1
    JAR (Scala 2.11 build) through ``spark.jars.packages`` and registers the
    Google Cloud Storage Hadoop filesystem classes for the ``gs`` scheme.

    Returns:
        The SparkSession returned by ``builder.getOrCreate()``.
    """
    # Imported explicitly so this function does not rely on SparkSession
    # leaking into scope through a wildcard import.
    from pyspark.sql import SparkSession

    builder = (
        SparkSession.builder
        .appName("Spark NLP Licensed")
        .master("local[4]")
        .config("spark.driver.memory", "8G")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "2000M")
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.7.1")
        .config("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
        .config("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    )
    return builder.getOrCreate()


# Module-level session shared by the cells below
spark = start()

In [None]:
# Bare expression at the end of the cell: shows the session object as cell output
spark

# Regex Tokenizer annotator

In [None]:
from pyspark.ml import Pipeline
from pyspark.sql.types import StringType
from sparknlp.base import *
from sparknlp.annotator import *
import sparknlp

# Sample text mixing list numbering, dates, prices, brackets and percentages
content = "1. T1-T2 DATE**[12/24/13] $1.99 () (10/12), ph+ 90%"

# Split pattern with three alternatives:
#   1. one or more whitespace characters,
#   2. the zero-width position just before any of - . : ; * + , $ & % [ ]
#   3. the zero-width position just after any of those same characters
pattern = "\\s+|(?=[-.:;*+,$&%\\[\\]])|(?<=[-.:;*+,$&%\\[\\]])"

# Single-row, single-column string frame; the column is named "text"
df = spark.createDataFrame([content], schema=StringType()).toDF("text")

df.show()

# Stage 1: read the "text" column and emit it as the "document" column
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: consume "document" and emit "sentence"
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: consume "sentence" and emit "regexToken", splitting on `pattern`
# with the positional mask switched off
regexTokenizer = (
    RegexTokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("regexToken")
    .setPattern(pattern)
    .setPositionalMask(False)
)

# Chain the annotators in order: document -> sentence -> regexToken
docPatternRemoverPipeline = Pipeline(
    stages=[documentAssembler, sentenceDetector, regexTokenizer]
)

# Fit the pipeline on the sample frame, then annotate that same frame
ds = docPatternRemoverPipeline.fit(df).transform(df)

# Print up to 10 rows without truncating the column contents
ds.show(n=10, truncate=False)