In [1]:
# One-time setup: uncomment ONE of the lines below to install Spark NLP.
#!conda install -y -c johnsnowlabs spark-nlp
#!pip install spark-nlp

In [2]:
import sagemaker
from sagemaker import get_execution_role

# SageMaker SDK session object (created here; not referenced again in the cells shown).
sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
# NOTE(review): the bare `role` expression below echoes the full role ARN
# (account id included) into the saved cell output - consider clearing that
# output before sharing the notebook.
role = get_execution_role()
role

'arn:aws:iam::120286446822:role/sagemaker-ml-pipeline'

In [3]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.sql import SparkSession, SQLContext


In [4]:
# Create the Spark session through Spark NLP's `start()` helper;
# the bare `spark` expression displays it as the cell output.
spark = sparknlp.start()
spark

In [5]:
# SQLContext's first positional parameter is a SparkContext, not a
# SparkSession. Pass the session's own context and bind the session
# explicitly, instead of relying on SparkSession happening to expose the
# private attributes SQLContext reads from a SparkContext.
sqlContext = SQLContext(spark.sparkContext, spark)
sqlContext

<pyspark.sql.context.SQLContext at 0x7f2b08290d30>

# Load Data

In [6]:
# Small hand-written sample: every row is a one-element list (a single text
# column). Casing, spacing and punctuation are deliberately irregular.
text_data = [
    [sentence]
    for sentence in (
        'Peter is  100% Good person & living in Germany!',
        'Paula is AlsO a good person!',
        'She lives, in London. Living from long time',
    )
]

# Second sample with the same row shape and noisier text
# (extra whitespace, symbols, mixed case, misspellings).
text_data1 = [
    [sentence]
    for sentence in (
        " senetence- 0 is @ having       Few Text!",
        "sentence-1 this iS     cuda PRogram.",
        "sent-2 program and &            RUNNING on GPu",
        "sent-3 checkinh   the STemming and @tokenS",
    )
]

In [7]:
# Turn the in-memory sample rows into a DataFrame with one column, 'text_raw'.
df_spark = spark.createDataFrame(text_data).toDF('text_raw')

# Print the row count, then show every row without truncating the text.
print(df_spark.count())
df_spark.show(truncate=False)

3
+-----------------------------------------------+
|text_raw                                       |
+-----------------------------------------------+
|Peter is  100% Good person & living in Germany!|
|Paula is AlsO a good person!                   |
|She lives, in London. Living from long time    |
+-----------------------------------------------+



In [8]:
# Name of the column that will hold the text handed to the NLP pipeline.
col_input = 'text'

In [9]:
# Parquet file with the newsgroup articles.
# NOTE(review): absolute, machine-specific path - consider deriving it from a
# configurable data directory so the notebook runs on other hosts.
path = "/home/ec2-user/SageMaker/aws-ml/spark/newsgroup_20_data.parquet"

# Read through the SparkSession itself rather than the legacy SQLContext
# wrapper, so this cell only depends on `spark`.
# This rebinds `df_spark`, replacing the small sample frame built earlier.
df_spark = spark.read.parquet(path)

print(df_spark.count())
df_spark.limit(2).show(truncate=True)

print()

18846
+--------------------+--------------------+
|                news|            category|
+--------------------+--------------------+
|From: Mamatha Dev...|    rec.sport.hockey|
|From: mblawson@mi...|comp.sys.ibm.pc.h...|
+--------------------+--------------------+




In [10]:
# Copy the "news" column into the column named by `col_input`, so the
# pipeline input has a fixed name regardless of the source column.
data_in = df_spark.withColumn(col_input, df_spark["news"])
# Alternative source column, for the hand-written sample frame:
#data_in = df_spark.withColumn(col_input, df_spark["text_raw"])

# Print the row count, then preview two (truncated) rows.
print(data_in.count())
data_in.limit(2).show(truncate=True)

18846
+--------------------+--------------------+--------------------+
|                news|            category|                text|
+--------------------+--------------------+--------------------+
|From: Mamatha Dev...|    rec.sport.hockey|From: Mamatha Dev...|
|From: mblawson@mi...|comp.sys.ibm.pc.h...|From: mblawson@mi...|
+--------------------+--------------------+--------------------+



# pre-process pipeline

In [11]:
from pyspark.sql.functions import udf, col, size
from pyspark.sql.types import IntegerType

In [12]:
# https://medium.com/trustyou-engineering/topic-modelling-with-pyspark-and-spark-nlp-a99d063f1a6e

In [13]:
# Input column read by the DocumentAssembler (same value as assigned earlier in the notebook).
col_input = "text"

In [14]:
# Import Pipeline explicitly; otherwise it is only in scope as a side effect
# of `from sparknlp.base import *`.
from pyspark.ml import Pipeline

# Wrap the raw text column as a Spark NLP "document" annotation.
documentAssembler = (
    DocumentAssembler()
    .setInputCol(col_input)
    .setOutputCol("document")
)

# Split each document into tokens.
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Lowercase the tokens and remove every character matched by the cleanup
# pattern (anything other than letters, digits, space and '&').
# Use .setLowercase(False) to maintain input case.
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
    .setLowercase(True)
    .setCleanupPatterns([r'[^a-zA-Z0-9 &]'])
)

# Stop words are removed BEFORE stemming. The stop-word list holds full word
# forms; once a stemmer has truncated a token (e.g. "this" -> "thi" under
# Porter-style stemming) it no longer matches the list and would slip through.
# A custom list can be supplied with .setStopWords([...]).
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

# Stem the tokens that survived stop-word removal.
stemmer = (
    Stemmer()
    .setInputCols(["cleanTokens"])
    .setOutputCol("stem")
)

# Optional stage (not part of the pipeline below): re-joins the cleaned
# tokens into a single text annotation.
tokenassembler = (
    TokenAssembler()
    .setInputCols(["document", "cleanTokens"])
    .setOutputCol("clean_text")
)

# Convert the final annotations into a plain array-of-strings column.
# The output name is pinned to "finished_cleanTokens" so code that consumes
# the pipeline output keeps working although the last annotator is now the
# stemmer.
finisher = (
    Finisher()
    .setInputCols(["stem"])
    .setOutputCols(["finished_cleanTokens"])
    .setIncludeMetadata(False)  # set to False to remove metadata
)

nlpPipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    stemmer,
    # tokenassembler,
    finisher,
])

# Fit on a one-row frame holding an empty string in the input column; this
# only materialises the PipelineModel that is later applied to the real data.
empty_df = spark.createDataFrame([['']]).toDF(col_input)
pipelineModel = nlpPipeline.fit(empty_df)

In [15]:
# Apply the fitted NLP pipeline to `data_in`, rename the finished token array
# to "col_nlp_arr", and keep only the label and the token array.
data_arr = (
    pipelineModel.transform(data_in)
    .withColumnRenamed("finished_cleanTokens", "col_nlp_arr")
    .select("category", "col_nlp_arr")
)

# Rows whose token array holds at least 50 entries.
# NOTE(review): `new_data` is not read by any cell shown here - the count and
# preview below, and the later cells in view, use the unfiltered `data_arr`.
new_data = data_arr.filter(size(col("col_nlp_arr")) >= 50)

# Print the row count, then preview four (truncated) rows.
print(data_arr.count())
data_arr.limit(4).show(truncate=True)

18846
+--------------------+--------------------+
|            category|         col_nlp_arr|
+--------------------+--------------------+
|    rec.sport.hockey|[mamatha, devinen...|
|comp.sys.ibm.pc.h...|[mblawsonmidwayec...|
|talk.politics.mid...|[hilmierdsvsus, h...|
|comp.sys.ibm.pc.h...|[guydaustinibmcom...|
+--------------------+--------------------+



In [16]:
# Prints a fixed string only; has no effect on the data or models in this notebook.
print("hello")

hello


In [17]:
# Unbind the two input frames; the later cells shown here only read `data_arr`.
# NOTE(review): after this, any cell that reads `data_in` or `df_spark` fails
# with NameError until the load cells are run again.
del data_in, df_spark

## 3. tfidf
Converting the text data into ML features, i.e. a TF-IDF matrix.

In [18]:
# Name of the token-array column that feeds the CountVectorizer.
col_nlp_arr = "col_nlp_arr"

In [19]:
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA

In [None]:
# Estimators for the two feature stages.
# CountVectorizer produces term counts; minDF=5.0 is the minimum
# document-frequency cutoff for a term to enter the vocabulary.
cv = CountVectorizer(
    inputCol=col_nlp_arr,
    outputCol="cv_features",
    minDF=5.0,
)
# IDF re-weights the count vectors.
idf = IDF(
    inputCol="cv_features",
    outputCol="idf_features",
)

# TF: learn the vocabulary on the token arrays and add the count vectors.
cvmodel = cv.fit(data_arr)
result_cv = cvmodel.transform(data_arr)

# IDF: fit on the count vectors and add the weighted vectors.
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)

# `data_pp` is the frame handed to the model stage; it keeps every column.
data_pp = result_tfidf

print(type(data_pp))
print(data_pp.count())
data_pp.limit(4).show(truncate=True)

In [None]:
# Pull four rows to the driver as a pandas DataFrame for display.
data_arr.limit(4).toPandas()

In [None]:
# Number of terms in the vocabulary learned by the fitted CountVectorizer.
len(cvmodel.vocabulary)

In [None]:
#del data_in, data_arr, df_spark

In [None]:
# NOTE(review): `error` is not defined in any cell shown, so this line raises
# NameError and stops a "Run All" before the LDA section - presumably a
# deliberate stop marker; confirm, or remove it so the notebook runs end to end.
error

## LDA model

In [None]:
from pyspark.ml.clustering import LDA

In [None]:
# LDA hyper-parameters.
num_topics = 2   # number of topics (k)
max_iter = 10    # maximum number of training iterations
lda_seed = 42    # fixed seed so repeated fits on the same data are reproducible

# Train on the TF-IDF vectors produced by the feature stage.
lda = LDA(
    k=num_topics,
    maxIter=max_iter,
    featuresCol='idf_features',
    seed=lda_seed,
)

lda_model = lda.fit(data_pp)
lda_model

In [None]:
from pyspark.sql import functions as F
import pyspark.sql.types as T

In [None]:
# input : cvmodel (the fitted CountVectorizer model)
# Vocabulary used to translate term indices back into words.
vocab = cvmodel.vocabulary

def get_words(token_list):
    """Look up each index of ``token_list`` in the module-level ``vocab``.

    Returns a new list holding ``vocab[i]`` for every ``i`` in
    ``token_list``, in the same order.
    """
    words = []
    for token_id in token_list:
        words.append(vocab[token_id])
    return words

# Spark UDF around get_words: converts an array of word ids (the actual output
# for a topic by a topic model) into an array of word strings.
udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

In [None]:
# Show the words behind each topic of the fitted LDA model.
# describeTopics returns the columns: topic, termIndices, termWeights.

# Number of terms requested per topic.
num_top_words = 7

# Add a 'topicWords' column by mapping every term index to its vocabulary word.
topics = (
    lda_model
    .describeTopics(num_top_words)
    .withColumn('topicWords', udf_to_words(F.col('termIndices')))
)

topics.select('topic', 'topicWords').show(truncate=100)

In [None]:
# Raw describeTopics output (topic, termIndices, termWeights) without the word lookup.
lda_model.describeTopics(num_top_words).show(truncate=100)