In [1]:
#!pip install spark-nlp
#!pip install fastparquet 

In [2]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml import Pipeline

import pandas as pd

In [4]:
from pyspark.sql import SparkSession
# start spark session configured for spark nlp
spark = SparkSession.builder \
     .master('local[*]') \
     .appName('Spark NLP') \
     .config('spark.jars.packages') \
     .getOrCreate()

spark

Exception: Java gateway process exited before sending its port number

In [None]:
sqlContext = SQLContext(spark)

In [None]:
#sc = SparkContext('local', 'PySPARK LDA Example')
#sqlContext = SQLContext(sc)

In [None]:
# check if spark context is defined
#print(sc.version)
print(spark.version)

# Load data

## pyspark

In [None]:
df_spark = sqlContext.read.parquet("newsgroup_20_data.parquet")
print(df_spark.count())
#df_spark.head(2)

In [None]:
d = df_spark.head(1)
d = d[0]

print(d.asDict()['category'])
#print(d.asDict()['news'])

In [None]:
(trainingData, testData) = df_spark.randomSplit([0.7, 0.3], seed = 100)

# NLP Pipeline using Spark NLP

In [None]:
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, SQLTransformer,IndexToString
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator# convert text column to spark nlp document

In [None]:
col_input = "news"
col_label = "category"
col_nlp = 'col_nlp'

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
import re
from utils import CUSTOM_STOP_WORDS

In [None]:
def text_cleaner(sentence):
    
    # clean the punctuations
    punc_re = r'[^a-zA-Z0-9 &]'
    sentence = re.sub(punc_re, ' ', sentence)
    
    # tokens
    arr = sentence.split()
    
    # remove white spaces
    arr = [word.strip() for word in arr if word.isalpha() and len(word)>=4]
    
    arr = " ".join(arr)
    return arr

In [None]:
data = trainingData.limit(1000)
#data = trainingData

udf_text_cleaner = F.udf(text_cleaner, StringType())
#udf_text_cleaner = F.udf(text_cleaner, ArrayType(elementType=StringType()))

data_train_clean = data.withColumn(col_nlp, udf_text_cleaner(col_input))

print(data_train_clean.count())
data_train_clean.limit(2).show()

In [None]:
documentAssembler = DocumentAssembler() \
     .setInputCol(col_input) \
     .setOutputCol('document')
tokenizer = Tokenizer() \
     .setInputCols(['document']) \
     .setOutputCol('token')
# note normalizer defaults to changing all words to lowercase.
# Use .setLowercase(False) to maintain input case.
normalizer = Normalizer() \
     .setInputCols(['token']) \
     .setOutputCol('normalized') \
     .setLowercase(True)
# note that lemmatizer needs a dictionary. So I used the pre-trained
# model (note that it defaults to english)
lemmatizer = LemmatizerModel()\
     .setInputCols(['normalized']) \
     .setOutputCol('lemma')
stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemma']) \
     .setOutputCol('clean_lemma') \
     .setCaseSensitive(False) \
     .setStopWords(CUSTOM_STOP_WORDS)
# finisher converts tokens to human-readable output
finisher = Finisher() \
     .setInputCols(['clean_lemma']) \
     .setCleanAnnotations(False)

In [None]:
pipeline = Pipeline() \
     .setStages([
           documentAssembler,
           tokenizer,
           normalizer,
           lemmatizer,
           stopwords_cleaner,
           finisher
     ])

pipeline2 = Pipeline() \
     .setStages([
           documentAssembler,
           tokenizer
     ])

In [None]:
equifax = pipeline2.fit(data_train_clean).transform(data_train_clean)


In [None]:
error

In [None]:
input_col  = col_nlp
output_col = "document"

document_assembler = DocumentAssembler() \
    .setInputCol(input_col) \
    .setOutputCol("document")

# convert document to array of tokens
tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")
 
# clean tokens 
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

# remove stopwords
stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

# stems tokens to bring it to root form
stemmer = Stemmer() \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("stem")

# Convert custom document structure to array of tokens.
finisher = Finisher() \
    .setInputCols(["stem"]) \
    .setOutputCols(["token_features"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)

# To generate Term Frequency
hashingTF = HashingTF(inputCol="token_features", outputCol="rawFeatures", numFeatures=1000)

# To generate Inverse Document Frequency
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)


In [None]:
nlp_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
            hashingTF,
            idf
           ])

In [None]:
data = trainingData
pipeline_model = nlp_pipeline.fit(data)
print("nlp_pipeline training completed..!")

In [None]:
type(pipeline_model)

In [None]:
pp_train_data = pipeline_model.transform(trainingData)
pp_test_data = pipeline_model.transform(testData)

In [None]:
type(pp_train_data)

In [None]:
df = pp_train_data.select('category','token','features').limit(10)
df.show()

In [None]:
df[['index','features']].map(list)

In [None]:
error

## LDA Model

In [None]:
from pyspark.ml.clustering import LDA

In [None]:
numTopics = 20 # number of topics
 
lda = LDA(k=numTopics, seed = 1, optimizer="online", optimizeDocConcentration=True,
          maxIter = 10,           # number of iterations
          learningDecay = 0.51,   # kappa, learning rate
          learningOffset = 64.0,  # tau_0, larger values downweigh early iterations
          subsamplingRate = 0.05, # mini batch fraction 
          )
 
model = lda.fit(pp_train_data)
 
ll = model.logLikelihood(pp_train_data)
lp = model.logPerplexity(pp_train_data)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

In [None]:
lperplexity = model.logPerplexity(pp_test_data)
print(lperplexity)

### topic insights

In [None]:
# Describe topics.
N = 3
topics = model.describeTopics(N)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

In [None]:
# check the first topic
model.describeTopics().first()

In [None]:
"""

# show head()
result_tfidf.show()


# select columns
df_model=result_tfidf.select('index','list_of_words','features')

"""
print()