In [1]:
#!pip install advertools
#!pip install hebrew_tokenizer
! pip install -q pyspark==3.2.0 spark-nlp
#!cd ~/cache_pretrained && ls -l

[K     |████████████████████████████████| 281.3 MB 35 kB/s 
[K     |████████████████████████████████| 145 kB 13.6 MB/s 
[K     |████████████████████████████████| 198 kB 28.4 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
import sparknlp
# Star imports expose the Spark NLP building blocks (assemblers, annotators)
# used by the pipeline cells further down. Their relative order is kept
# (base first, annotator second) so any overlapping names resolve as before.
from sparknlp.base import *
from sparknlp.annotator import *

# Start the Spark session through Spark NLP, requesting the Spark 3.2 build.
spark = sparknlp.start(spark32=True)

# Report the library versions actually in use for this run.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

# Last expression of the cell: display the session object.
spark

Spark NLP version 3.4.4
Apache Spark version: 3.2.0


In [3]:
import codecs
#import hebrew_tokenizer as ht
import re
import string
import pandas as pd
from pyspark.ml import Pipeline
import pyspark.sql.functions as F

In [4]:
from pyspark.sql import SQLContext
from pyspark import SparkContext

# Read the corpus one line per row. The context manager guarantees the file
# handle is closed once the lines are in memory (the previous version left
# the handle open for the lifetime of the kernel).
# NOTE(review): every line is kept whole, so the "text" column still carries
# the tab-separated trailing field and the newline (visible in the preview
# output as e.g. "...\t0\n"). Confirm whether that field should be split off
# before the text is fed to the NLP pipeline.
with codecs.open('hebrew_text.tsv', 'r', 'utf-8') as tsv_file:
    data = list(tsv_file.readlines())

# Single-column pandas frame; the column name "text" is what the
# DocumentAssembler in the pipeline reads.
data_df = pd.DataFrame({'text': data})

# Convert the pandas frame into a Spark DataFrame via the active Spark context.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

spark_dff = sqlContext.createDataFrame(data_df)



In [5]:
# --- Document assembly ------------------------------------------------------
# Wraps the raw "text" column into the "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# --- Tokenization -----------------------------------------------------------
# Splits each document into "token" annotations.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# --- Stemming ---------------------------------------------------------------
# Produces a "stem" annotation for every token.
stemmer = (
    Stemmer()
    .setInputCols(["token"])
    .setOutputCol("stem")
)

# --- Lemmatization ----------------------------------------------------------
# Pretrained model "lemma" requested for language code "he".
lemmatizer = (
    LemmatizerModel.pretrained("lemma", "he")
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

# --- Stop-word removal ------------------------------------------------------
# Pretrained stop-word list for "he", applied to the stemmed tokens with
# case-insensitive matching.
stopwords_cleaner_he = (
    StopWordsCleaner.pretrained('stopwords_he','he')
    .setInputCols("stem")
    .setOutputCol("stopwords")
    .setCaseSensitive(False)
)

# Re-assembles the tokens that survived stop-word removal into the
# "cleanStopwords" column.
token_assembler = (
    TokenAssembler()
    .setInputCols(["document", "stopwords"])
    .setOutputCol("cleanStopwords")
)

# --- Part-of-speech tagging -------------------------------------------------
# Pretrained model "pos_ud_htb" requested for language code "he".
pos = (
    PerceptronModel.pretrained("pos_ud_htb", "he")
    .setInputCols(["document", "token"])
    .setOutputCol("pos")
)

# --- POS-based chunking -----------------------------------------------------
# Each regex parser below matches a single POS tag, so every token carrying
# one of the listed tags is emitted as its own chunk.
chunker = (
    Chunker()
    .setInputCols(["document", "pos"])
    .setOutputCol("chunk")
    .setRegexParsers(["<NOUN>", "<ADJ>","<ADV>","<AUX>","<INTJ>","<PROPN>", "<VERB>","<None>"])
)
#chunker.extractParamMap()

# --- Spell checking ---------------------------------------------------------
# NOTE(review): this model is loaded without a language code and is not part
# of the pipeline stage list assembled in the following cell - confirm
# whether it is still needed.
spell_checker_norvig = (
    NorvigSweetingModel.pretrained('spellcheck_norvig')
    .setInputCols("token")
    .setOutputCol("corrected")
)

lemma download started this may take some time.
Approximate size to download 169 KB
[OK!]
stopwords_he download started this may take some time.
Approximate size to download 2 KB
[OK!]
pos_ud_htb download started this may take some time.
Approximate size to download 2.6 MB
[OK!]
spellcheck_norvig download started this may take some time.
Approximate size to download 4.2 MB
[OK!]


In [6]:
# Assemble the stages in dependency order: every annotator appears after the
# stage that produces its input column(s).
nlpPipeline = Pipeline().setStages([
    documentAssembler,      # text      -> document
    tokenizer,              # document  -> token
    stemmer,                # token     -> stem
    lemmatizer,             # token     -> lemma
    stopwords_cleaner_he,   # stem      -> stopwords
    pos,                    # document, token -> pos
    chunker,                # document, pos   -> chunk
    token_assembler,        # document, stopwords -> cleanStopwords
])

# Fit against a one-row frame holding an empty string; the real corpus is
# only supplied later, at transform time.
empty_df = spark.createDataFrame([[""]]).toDF("text")
pipelineModel = nlpPipeline.fit(empty_df)

In [7]:
# Run the fitted pipeline over the loaded corpus; each stage adds its own
# annotation column next to the original "text" column.
result = pipelineModel.transform(spark_dff)

# Preview the first 10 annotated rows (cell values are truncated by default).
result.show(n=10)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|                stem|               lemma|           stopwords|                 pos|               chunk|      cleanStopwords|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|ממש כואב ..... אנ...|[{document, 0, 52...|[{token, 0, 2, ממ...|[{token, 0, 2, ממ...|[{token, 0, 2, ממ...|[{token, 0, 2, ממ...|[{pos, 0, 2, ADV,...|[{chunk, 27, 32, ...|[{document, 0, 44...|
|   איש יקר שלנו\t0\n|[{document, 0, 14...|[{token, 0, 2, אי...|[{token, 0, 2, אי...|[{token, 0, 2, אי...|[{token, 0, 2, אי...|[{pos, 0, 2, NOUN...|[{chunk, 0, 2, אי...|[{document, 0, 8,...|
|כל הכבוד והמון בה...|[{document, 0, 23...|[{

In [8]:
# Selecting the nested `result` field of several annotation columns yields
# columns that are all named "result", which makes the printed table
# ambiguous. Alias each one to the annotator it came from.
result.select(
    F.col('stem.result').alias('stem'),
    F.col('cleanStopwords.result').alias('cleanStopwords'),
    F.col('chunk.result').alias('chunk'),
).show(30, truncate=100)

+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|                                                                                              result|                                                                                              result|                                                                                              result|
+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|                                   [ממש, כואב, ....., אני, בוכה, עם, המשפחה, שלא, תד

In [9]:
from pyspark.sql import functions as F

# Build a token-level pandas frame: the four per-row arrays are zipped
# element-wise, exploded to one row per element, and each struct field is
# then projected into its own named column.
# The zipped struct fields are addressed by their position as a string key
# ('0'..'3'), following the order in which the arrays are passed below.
# NOTE(review): those key names presumably depend on the exact form of the
# input expressions and on the Spark version - re-check if either changes.
result_df = (
    result
    .select(
        F.explode(
            F.arrays_zip(
                result.token.result,
                result.stem.result,
                result.lemma.result,
                result.pos.result,
            )
        ).alias("cols")
    )
    .select(*[
        F.expr(f"cols['{position}']").alias(label)
        for position, label in enumerate(("token", "stem", "lemma", "pos"))
    ])
    .toPandas()
)


In [10]:
# Show the first 30 token rows with their stem, lemma and POS tag.
result_df.head(n=30)

Unnamed: 0,token,stem,lemma,pos
0,ממש,ממש,ממש,ADV
1,כואב,כואב,כאב,VERB
2,.....,.....,.....,PUNCT
3,אני,אני,הוא,PRON
4,בוכה,בוכה,בוכה,VERB
5,עם,עם,עם,ADP
6,המשפחה,המשפחה,המשפחה,NOUN
7,שלא,שלא,שלא,ADV
8,תדעו,תדעו,תדעו,VERB
9,עוד,עוד,עוד,ADV
