In [1]:
#!pip install spark-nlp
#!pip install fastparquet 

In [2]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml import Pipeline

import pandas as pd

In [3]:
# The triple-quoted string below is an unused expression: it keeps a manual
# SparkSession builder snippet for reference without executing any of it.
# The trailing print() only emits a blank line.
# NOTE(review): the snippet's .config('spark.jars.packages') call passes a key
# with no value; it would presumably need the Spark NLP package coordinate
# before being re-enabled -- confirm.
"""
from pyspark.sql import SparkSession

# start spark session configured for spark nlp
spark = SparkSession.builder \
     .master('local[*]') \
     .appName('Spark NLP') \
     .config('spark.jars.packages') \
     .getOrCreate()

spark
"""
print()




In [4]:
# Create the session through Spark NLP's `sparknlp.start()` helper; the bare
# `spark` on the last line makes the notebook display the resulting object.
spark = sparknlp.start()

spark

In [5]:
# `sc` is the SparkContext owned by the session (previously it was bound to the
# SparkSession itself). SQLContext's first argument is a SparkContext, so pass
# that and hand the existing session over explicitly as the second argument.
sc = spark.sparkContext
sqlContext = SQLContext(sc, spark)
sqlContext

<pyspark.sql.context.SQLContext at 0x7f214588f860>

In [6]:
#sc = SparkContext('local', 'PySPARK LDA Example')
#sqlContext = SQLContext(sc)

# Load data

## PySpark

In [7]:
# Read the newsgroup parquet file into a Spark DataFrame and report its size.
newsgroup_path = "newsgroup_20_data.parquet"
df_spark = sqlContext.read.parquet(newsgroup_path)

row_count = df_spark.count()
print(row_count)
# df_spark.head(2) shows the first two raw rows if a closer look is needed.

18846


In [8]:
# Take the first record and print its label; d['news'] holds the raw text.
d = df_spark.head(1)[0]

print(d['category'])

rec.sport.hockey


In [9]:
# 70/30 random split with a fixed seed.
trainingData, testData = df_spark.randomSplit(weights=[0.7, 0.3], seed=100)

# NLP Pipeline using Spark NLP

In [10]:
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, SQLTransformer,IndexToString
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator# convert text column to spark nlp document

In [11]:
# Column names shared by the cells below: raw text, label, cleaned text.
col_input, col_label, col_nlp = "news", "category", "col_nlp"

In [12]:
import pyspark.sql.functions as F
from pyspark.sql.types import *
import re
from utils import CUSTOM_STOP_WORDS

In [13]:
def text_cleaner(sentence, min_length=4):
    """Reduce raw text to a space-joined string of purely alphabetic words.

    Every character other than ASCII letters, digits, spaces and ``&`` is
    replaced by a space, the text is split on whitespace, and only tokens that
    are entirely alphabetic and at least ``min_length`` characters long are
    kept (tokens still containing a digit or ``&`` are therefore dropped).
    Letter case is preserved.

    Args:
        sentence: Text to clean. ``None`` or an empty string (e.g. a null
            column value reaching the UDF) is treated as empty text instead
            of raising inside ``re.sub``.
        min_length: Minimum number of characters a token needs to be kept.

    Returns:
        The surviving tokens joined by single spaces; ``""`` if none survive.
    """
    if not sentence:
        return ""

    # replace punctuation / control characters with spaces
    punc_re = r'[^a-zA-Z0-9 &]'
    sentence = re.sub(punc_re, ' ', sentence)

    # split() already discards all surrounding whitespace, so tokens need no
    # further strip(); keep only alphabetic tokens that are long enough
    words = [
        word
        for word in sentence.split()
        if word.isalpha() and len(word) >= min_length
    ]

    return " ".join(words)

In [14]:
# Work on a 1000-row sample of the training split
# (bind `data` to trainingData itself for a full run).
data = trainingData.limit(1000)

# Wrap the Python cleaner as a Spark UDF producing a string column.
udf_text_cleaner = F.udf(text_cleaner, returnType=StringType())

# Add the cleaned text next to the raw text.
data_train_clean = data.withColumn(col_nlp, udf_text_cleaner(F.col(col_input)))

print(data_train_clean.count())
data_train_clean.limit(2).show()

1000
+--------------------+------------+--------------------+
|                news|    category|             col_nlp|
+--------------------+------------+--------------------+
| agate!ames!purdu...|misc.forsale|agate ames purdue...|
| agate!iat.holone...|   rec.autos|agate holonet psi...|
+--------------------+------------+--------------------+



In [18]:
# Spark NLP stages, each reading the column written by the previous one.

# Wrap the text column as a Spark NLP 'document' annotation.
# NOTE(review): this reads the raw `col_input` column, not the UDF-cleaned
# `col_nlp` column -- confirm that is intended.
documentAssembler = DocumentAssembler() \
    .setInputCol(col_input) \
    .setOutputCol('document')

tokenizer = Tokenizer() \
    .setInputCols(['document']) \
    .setOutputCol('token')

# Lower-casing is switched on explicitly here;
# use .setLowercase(False) to maintain input case.
normalizer = Normalizer() \
    .setInputCols(['token']) \
    .setOutputCol('normalized') \
    .setLowercase(True)

# The lemmatizer needs a dictionary, so load Spark NLP's default pre-trained
# model via .pretrained(). A bare LemmatizerModel() carries no dictionary.
# This fetches the model on first use, which needs network access.
lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(['normalized']) \
    .setOutputCol('lemma')

stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(['lemma']) \
    .setOutputCol('clean_lemma') \
    .setCaseSensitive(False) \
    .setStopWords(CUSTOM_STOP_WORDS)

# finisher converts tokens to human-readable output
# NOTE(review): it reads 'normalized' even when the lemma / stop-word stages
# are part of the pipeline -- confirm which column should be finished.
finisher = Finisher() \
    .setInputCols(['normalized']) \
    .setCleanAnnotations(False)

In [19]:
# Complete chain: lemmatisation and stop-word removal run before the finisher.
pipeline = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
])

# Shortened chain: same as above minus the lemmatizer and stop-word cleaner.
pipeline2 = Pipeline(stages=[
    documentAssembler,
    tokenizer,
    normalizer,
    finisher,
])

In [20]:
# Fit the shortened pipeline and apply it to the same cleaned training sample.
pipeline_model = pipeline2.fit(data_train_clean)
train_temp = pipeline_model.transform(data_train_clean)

# Retain the label, the normalized token annotations and the raw text.
kept_columns = ['category', 'normalized', col_input]
pp_train_data = train_temp.select(*kept_columns)

print(pp_train_data.count())
pp_train_data.limit(2).show()

1000
+------------+--------------------+--------------------+
|    category|          normalized|                news|
+------------+--------------------+--------------------+
|misc.forsale|[[token, 1, 51, a...| agate!ames!purdu...|
|   rec.autos|[[token, 1, 52, a...| agate!iat.holone...|
+------------+--------------------+--------------------+



In [21]:
# Bring two rows to the driver as a pandas frame for inspection.
two_rows = pp_train_data.limit(2)
df = two_rows.toPandas()

In [22]:
# Raw annotation rows of the 'normalized' column as an array.
df.loc[:, "normalized"].values

array([list([Row(annotatorType='token', begin=1, end=51, result='agateamespurduementorccpurdueedusageccpurdueedukari', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=64, end=70, result='subject', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=77, end=79, result='meg', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=81, end=87, result='seagate', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=89, end=91, result='ide', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=93, end=96, result='hard', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=98, end=102, result='drive', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=108, end=111, result='from', metadata={'sentence': '0'}, embeddings=[]), Row(annotatorType='token', begin=114, end=132, result='karisageccpurdueedu', metadata={'sentence': '0

In [None]:
# (Intentionally no code.) A bare `error` name used to sit in this cell;
# evaluating it raises NameError and aborts "Restart Kernel -> Run All" before
# the TF-IDF and LDA cells below can execute.

In [25]:
from pyspark.ml.feature import CountVectorizer , IDF

In [33]:
def type_changer(sentence):
    """Split a whitespace-joined string of tokens into a list of tokens.

    Args:
        sentence: Token string such as the output of ``text_cleaner``.
            ``None`` or an empty string yields an empty list.

    Returns:
        List of tokens. Splitting on any whitespace run (rather than on a
        literal single space) means an empty input gives ``[]`` instead of
        ``[""]``, so no empty-string token reaches the vectorizer.
    """
    if not sentence:
        return []
    return sentence.split()

# Spark UDF turning the cleaned string column into an array-of-strings column.
udf_type_changer = F.udf(type_changer, returnType=ArrayType(StringType()))
data_arr = data_train_clean.withColumn("col_nlp_arr", udf_type_changer(F.col(col_nlp)))

# Term frequencies: vocabulary capped at 5000 terms, minDF set to 10.
cv = CountVectorizer(
    inputCol="col_nlp_arr",
    outputCol="raw_features",
    vocabSize=5000,
    minDF=10.0,
)
cvmodel = cv.fit(data_arr)
result_cv = cvmodel.transform(data_arr)

# Inverse-document-frequency weighting of the raw counts.
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)

# Label, raw text and TF-IDF vector are all the LDA cells need.
pp_train_data = result_tfidf.select('category', col_input, "features")

print(type(pp_train_data))
pp_train_data.limit(2).show()

<class 'pyspark.sql.dataframe.DataFrame'>
+------------+--------------------+--------------------+
|    category|                news|            features|
+------------+--------------------+--------------------+
|misc.forsale| agate!ames!purdu...|(2193,[1,5,7,8,13...|
|   rec.autos| agate!iat.holone...|(2193,[3,4,5,7,8,...|
+------------+--------------------+--------------------+



In [35]:
#d = pp_train_data[['news','features']].map(list)

## LDA Model

In [37]:
from pyspark.ml.clustering import LDA

In [39]:
numTopics = 20  # number of topics

# Online variational LDA settings, collected in one place.
lda_params = dict(
    k=numTopics,
    seed=1,
    optimizer="online",
    optimizeDocConcentration=True,
    maxIter=10,            # number of iterations
    learningDecay=0.51,    # kappa, learning-rate decay
    learningOffset=64.0,   # tau_0, larger values downweight early iterations
    subsamplingRate=0.05,  # mini-batch fraction
)
lda = LDA(**lda_params)

model = lda.fit(pp_train_data.select("features"))

# Fit quality measured on the training features themselves.
ll = model.logLikelihood(pp_train_data)
lp = model.logPerplexity(pp_train_data)
for label, value in (
    ("The lower bound on the log likelihood of the entire corpus: ", ll),
    ("The upper bound on perplexity: ", lp),
):
    print(label + str(value))

The lower bound on the log likelihood of the entire corpus: -2377656.2476316197
The upper bound on perplexity: 7.38978733968984


In [None]:
# Hold-out evaluation. `pp_test_data` was never built, so create it here by
# pushing the test split through the SAME cleaner and the already-fitted
# CountVectorizer / IDF models; this keeps its feature vectors in the
# vocabulary the LDA model was trained on.
test_clean = testData.withColumn(col_nlp, udf_text_cleaner(col_input))
test_arr = test_clean.withColumn("col_nlp_arr", udf_type_changer(col_nlp))
test_tfidf = idfModel.transform(cvmodel.transform(test_arr))
pp_test_data = test_tfidf.select('category', col_input, "features")

lperplexity = model.logPerplexity(pp_test_data)
print(lperplexity)

### Topic insights

In [None]:
# Show each topic through its N highest-weighted terms.
N = 3
topics = model.describeTopics(maxTermsPerTopic=N)

print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

In [None]:
# Look at the first topic row on its own.
model.describeTopics().head()

In [None]:
# Disabled scratch code kept inside an unused string literal; nothing in it
# executes, and the trailing print() only emits a blank line.
# NOTE(review): the select() inside names 'index' and 'list_of_words', which are
# not among the columns created in this notebook's visible cells -- confirm
# before re-enabling.
"""

# show head()
result_tfidf.show()


# select columns
df_model=result_tfidf.select('index','list_of_words','features')

"""
print()